In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import jensenshannon
import ast

# Read the per-grid cluster histograms.
df = pd.read_csv('grids_cluster_histograms.csv')


# Collect every key that occurs in any histogram so all rows can later be
# expanded onto one shared, ordered axis.
key_pool = set()
for raw_hist in df['histogram']:
    # Presumably each cell looks like "{key=count, ...}"; swapping "=" for ":"
    # yields a Python dict literal that literal_eval can parse.
    parsed = ast.literal_eval(raw_hist.replace("=", ":"))
    key_pool.update(parsed.keys())

all_keys = sorted(int(key) for key in key_pool)
K = len(all_keys)

def normalize(hist_dict, keys=None):
    """Expand a histogram dict onto a fixed key axis and scale it to sum to 1.

    Parameters
    ----------
    hist_dict : dict
        Mapping of key -> count. Keys absent from the dict count as 0.
    keys : sequence, optional
        Ordered keys that define the output positions. Defaults to the
        module-level ``all_keys``.

    Returns
    -------
    numpy.ndarray
        Float vector with one entry per key. When the histogram has no mass
        the result is all zeros (same float dtype) instead of a 0/0 division.
    """
    if keys is None:
        keys = all_keys
    # Build the vector as float up front so both return paths share a dtype.
    vec = np.array([hist_dict.get(k, 0) for k in keys], dtype=float)
    total = vec.sum()
    if total == 0:
        return np.zeros_like(vec)
    return vec / total

# Parse each histogram string and expand it into a normalised vector.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code embedded in the CSV; it is also the parser already used when
# the histogram keys were collected.
df['vector'] = df['histogram'].apply(
    lambda h: normalize(ast.literal_eval(h.replace("=", ":")))
)

# Compute full area distribution: element-wise sum of the per-row vectors,
# rescaled to sum to 1. Each row was normalised individually, so every row
# with a non-empty histogram contributes equal weight here.
full_dist = df['vector'].sum()
full_dist = full_dist / full_dist.sum()

# Greedy selection: on every round add the row whose inclusion brings the
# normalised sum of the selected vectors closest (Jensen-Shannon distance)
# to full_dist.
selected = []
remaining = df.index.tolist()
p = 60  # number of rows to pick

# Running element-wise sum of the vectors selected so far, so each candidate
# costs one addition instead of re-summing the whole subset through df.loc.
selected_sum = np.zeros_like(full_dist, dtype=float)

# Never run more rounds than there are rows to choose from.
for _ in range(min(p, len(remaining))):
    best_score = float('inf')
    best_idx = None

    for idx in remaining:
        combined = selected_sum + df.at[idx, 'vector']
        total = combined.sum()
        if total == 0:
            # Nothing to normalise; dividing would only yield a NaN score
            # that can never win the comparison below.
            continue

        score = jensenshannon(combined / total, full_dist)
        if score < best_score:
            best_score = score
            best_idx = idx

    if best_idx is None:
        # No candidate produced a comparable score; stop here instead of
        # failing on remaining.remove(None).
        break

    selected.append(best_idx)
    remaining.remove(best_idx)
    selected_sum = selected_sum + df.at[best_idx, 'vector']

print("Selected tehsils:", df.loc[selected])

  from scipy.spatial.distance import jensenshannon


Selected tehsils:               system:index  grid_id  \
1036  0000000000000000040c     1846   
122   0000000000000000007a      485   
1839  0000000000000000072f     3531   
775   00000000000000000307     1507   
654   0000000000000000028e     1341   
1619  00000000000000000653     2881   
453   000000000000000001c5     1065   
334   0000000000000000014e      879   
1884  0000000000000000075c     3666   
1730  000000000000000006c2     3197   
947   000000000000000003b3     1724   
720   000000000000000002d0     1437   
179   000000000000000000b3      619   
1699  000000000000000006a3     3077   
1374  0000000000000000055e     2408   
26    0000000000000000001a      213   
953   000000000000000003b9     1730   
1779  000000000000000006f3     3336   
484   000000000000000001e4     1116   
1038  0000000000000000040e     1848   
76    0000000000000000004c      356   
1119  0000000000000000045f     1965   
1098  0000000000000000044a     1925   
1855  0000000000000000073f     3592   
1775  0

In [2]:
# grid_id values of the selected rows, in selection order.
representative_grids = df.loc[selected, "grid_id"].tolist()

In [3]:
# Load the points table; the next cell filters it on its "index" column.
df_points = pd.read_csv("points.csv")

In [4]:
# Keep only the rows whose "index" value is one of the representative grid
# ids, then renumber "index" consecutively from 0.
# NOTE: after this cell "index" no longer holds grid ids, so running the cell
# a second time would filter the renumbered values against the grid ids.
is_representative = df_points["index"].isin(representative_grids)
df_points = df_points[is_representative].reset_index(drop=True)
df_points["index"] = range(len(df_points))

# Every status flag starts out as False.
for status_column in (
    "overall_status",
    "download_status",
    "model_status",
    "segmentation_status",
    "postprocessing_status",
    "plantation_status",
):
    df_points[status_column] = False

In [5]:
# Show the filtered table together with its status columns.
df_points

Unnamed: 0,index,points,overall_status,download_status,model_status,segmentation_status,postprocessing_status,plantation_status
0,0,"((18.989414715239327, 78.81866455078125), (18....",False,False,False,False,False,False
1,1,"((18.989414715239327, 78.86260986328125), (18....",False,False,False,False,False,False
2,2,"((18.947855781294127, 78.81866455078125), (18....",False,False,False,False,False,False
3,3,"((18.947855781294127, 78.86260986328125), (18....",False,False,False,False,False,False
4,4,"((18.823116948090497, 78.55499267578125), (18....",False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
235,235,"((14.402759378194208, 79.03839111328125), (14....",False,False,False,False,False,False
236,236,"((14.36019115837039, 79.25811767578125), (14.3...",False,False,False,False,False,False
237,237,"((14.36019115837039, 79.30206298828125), (14.3...",False,False,False,False,False,False
238,238,"((14.317614840171906, 79.25811767578125), (14....",False,False,False,False,False,False


In [6]:
# Write the table to status.csv, leaving out the DataFrame's row index.
df_points.to_csv("status.csv", index=False)