In [26]:
import os
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
import pandas as pd
def get_latlon_corners(center_lat, center_lon, points, img_size=128, gsd=10):
    latlons = []
    for x, y in points:
        dx = x - img_size / 2
        dy = y - img_size / 2
        dx_m = dx * gsd
        dy_m = dy * gsd
        dlat = dy_m / 111320
        dlon = dx_m / (111320 * np.cos(np.deg2rad(center_lat)))
        lat = center_lat + dlat
        lon = center_lon + dlon
        latlons.append((lon, lat))
    return latlons

def folder_to_gdf(folder, split):
    data = []
    for fname in os.listdir(folder):
        if fname.endswith('.txt'):
            image_name = fname.replace('.txt', '.png')
            lat_str, lon_str = image_name.replace('.png', '').split('_')
            center_lat = float(lat_str)
            center_lon = float(lon_str)
            label_path = os.path.join(folder, fname)
            with open(label_path, 'r') as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) != 9:
                        continue
                    class_id = int(parts[0])
                    coords = list(map(float, parts[1:]))
                    points = np.array(coords).reshape(4,2) * 128
                    latlons = get_latlon_corners(center_lat, center_lon, points)
                    poly = Polygon(latlons)
                    data.append({
                        'image_name': image_name,
                        'class': class_id,
                        'geometry': poly,
                        'split': split
                    })
    return gpd.GeoDataFrame(data, crs="EPSG:4326")

folders = {
    "train": "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/train/yolo_obb_labels",
    "val":   "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/val/yolo_obb_labels",
    "test":  "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/stratified_split/clean_test/yolo_obb_labels"
}

gdfs = [folder_to_gdf(folders[split], split) for split in ["train", "val", "test"]]
gdf_all = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs="EPSG:4326")


import geopandas as gpd

# Make a buffer of 0.5 meter around each geometry (requires projected CRS, not EPSG:4326)
# Project to UTM (auto) for metric units
gdf_proj = gdf_all.to_crs(gdf_all.estimate_utm_crs())
gdf_proj['buffered'] = gdf_proj.geometry.buffer(16.033)  # 0.5 meter buffer

# Use buffered geometries for overlap analysis
sindex = gdf_proj['buffered'].sindex
common_idx = set()

for idx, geom in enumerate(gdf_proj['buffered']):
    possible_matches = list(sindex.intersection(geom.bounds))
    for other_idx in possible_matches:
        if other_idx == idx:
            continue
        other_geom = gdf_proj['buffered'].iloc[other_idx]
        if geom.intersects(other_geom):
            intersection_area = geom.intersection(other_geom).area
            min_area = min(geom.area, other_geom.area)
            if min_area > 0 and intersection_area / min_area > 1e-6:
                common_idx.add(idx)
                common_idx.add(other_idx)
            elif intersection_area == 0 and geom.touches(other_geom):
                common_idx.add(idx)
                common_idx.add(other_idx)

gdf_all['grouped'] = 'unique'
gdf_all.loc[list(common_idx), 'grouped'] = 'common'
print("Count of 'common' (buffered, touching/overlapping):", (gdf_all['grouped'] == 'common').sum())
print("Count of 'unique' (buffered):", (gdf_all['grouped'] == 'unique').sum())


Count of 'common' (buffered, touching/overlapping): 29972
Count of 'unique' (buffered): 67676


In [27]:
gdf_all

Unnamed: 0,image_name,class,geometry,split,grouped
0,29.6379_74.0288.png,1,"POLYGON ((74.03297 29.64058, 74.033 29.6395, 7...",train,unique
1,29.6379_74.0288.png,1,"POLYGON ((74.03439 29.63623, 74.03505 29.63621...",train,common
2,24.8540_85.0146.png,1,"POLYGON ((85.01144 24.85198, 85.0117 24.85117,...",train,unique
3,23.7261_89.5030.png,2,"POLYGON ((89.49756 23.73071, 89.49789 23.73065...",train,common
4,31.4750_73.3851.png,1,"POLYGON ((73.38709 31.47457, 73.38792 31.47408...",train,unique
...,...,...,...,...,...
97643,25.3086_69.1853.png,1,"POLYGON ((69.18505 25.30658, 69.18576 25.30601...",test,unique
97644,29.6643_77.2661.png,2,"POLYGON ((77.27013 29.66882, 77.27055 29.6685,...",test,unique
97645,25.9761_88.6232.png,2,"POLYGON ((88.62826 25.97504, 88.62855 25.97441...",test,unique
97646,27.4379_80.3073.png,1,"POLYGON ((80.31166 27.43625, 80.31214 27.4354,...",test,unique


In [28]:
# Count of unique kilns in test split
count_unique_test = ((gdf_all['split'] == 'train') & (gdf_all['grouped'] == 'unique')).sum()
print(f"Unique kilns in train split: {count_unique_test}")


Unique kilns in train split: 42545


In [29]:
# Count of unique kilns in test split
count_unique_test = ((gdf_all['split'] == 'val') & (gdf_all['grouped'] == 'unique')).sum()
print(f"Unique kilns in val split: {count_unique_test}")

Unique kilns in val split: 13961


In [30]:
# Count of unique kilns in test split
count_unique_test = ((gdf_all['split'] == 'test') & (gdf_all['grouped'] == 'unique')).sum()
print(f"Unique kilns in test split: {count_unique_test}")

Unique kilns in test split: 11170


In [9]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon

def get_latlon_corners(center_lat, center_lon, points, img_size=128, gsd=10):
    latlons = []
    for x, y in points:
        dx = x - img_size / 2
        dy = y - img_size / 2
        dx_m = dx * gsd
        dy_m = dy * gsd
        dlat = dy_m / 111320
        dlon = dx_m / (111320 * np.cos(np.deg2rad(center_lat)))
        lat = center_lat + dlat
        lon = center_lon + dlon
        latlons.append((lon, lat))
    return latlons

def folder_to_gdf(folder, split):
    data = []
    for fname in os.listdir(folder):
        if fname.endswith('.txt'):
            image_name = fname.replace('.txt', '.png')
            lat_str, lon_str = image_name.replace('.png', '').split('_')
            center_lat = float(lat_str)
            center_lon = float(lon_str)
            label_path = os.path.join(folder, fname)
            with open(label_path, 'r') as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) != 9:
                        continue
                    class_id = int(parts[0])
                    coords = list(map(float, parts[1:]))
                    points = np.array(coords).reshape(4,2) * 128
                    latlons = get_latlon_corners(center_lat, center_lon, points)
                    poly = Polygon(latlons)
                    data.append({
                        'image_name': image_name,
                        'class': class_id,
                        'geometry': poly,
                        'split': split
                    })
    return gpd.GeoDataFrame(data, crs="EPSG:4326")

# ----- Config -----
folders = {
    "train": "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/train/yolo_obb_labels",
    "val":   "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/val/yolo_obb_labels",
    "test":  "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/test/yolo_obb_labels"
}

gdfs = [folder_to_gdf(folders[split], split) for split in ["train", "val", "test"]]
gdf_all = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs="EPSG:4326")

# ----- Buffer geometries (0.5 meter buffer) -----
# Use auto UTM for metric buffering
gdf_proj = gdf_all.to_crs(gdf_all.estimate_utm_crs())
gdf_proj['buffered'] = gdf_proj.geometry.buffer(16.033)   # <-- 0.5 meters!

# Split out for clean overlap analysis
test_proj = gdf_proj[gdf_proj['split'] == 'test'].copy()
trainval_proj = gdf_proj[gdf_proj['split'].isin(['train', 'val'])].copy()

# Build spatial index for train+val
tv_sindex = trainval_proj['buffered'].sindex

def overlaps_trainval(test_geom):
    candidates = list(tv_sindex.intersection(test_geom.bounds))
    # Use .iloc for pandas fancy indexing
    if not candidates:
        return False
    candidate_geoms = trainval_proj['buffered'].iloc[candidates]
    return candidate_geoms.intersects(test_geom).any()

test_proj['overlaps_trainval'] = test_proj['buffered'].apply(overlaps_trainval)

# Filter: clean test set = test instances NOT overlapping train/val
clean_test_proj = test_proj[~test_proj['overlaps_trainval']].copy()

print(f"Total test instances: {len(test_proj)}")
print(f"Clean test instances (no overlap with train/val): {len(clean_test_proj)}")

# If you want the corresponding original rows (EPSG:4326)
clean_test_indices = clean_test_proj.index
clean_test_gdf = gdf_all.loc[clean_test_indices].copy()

# Optional: Save to file
clean_test_gdf.to_file("clean_test_set.geojson", driver="GeoJSON")

# (Optional) View by class or image name:
print(clean_test_gdf['class'].value_counts())


Total test instances: 21100
Clean test instances (no overlap with train/val): 14418
class
1    7859
2    6165
0     394
Name: count, dtype: int64


In [10]:
# --- After you already have:
# test_proj = original buffered test GeoDataFrame (before filtering)
# unique_test_proj = only the unique kilns (after filtering)

# Count original kilns per test image
orig_counts = test_proj.groupby('image_name').size().rename('orig_count')
# Count unique (retained) kilns per test image
unique_counts = clean_test_proj.groupby('image_name').size().rename('unique_count')

# Merge for easy stats
stats_df = pd.concat([orig_counts, unique_counts], axis=1).fillna(0).astype(int)
stats_df['removed'] = stats_df['orig_count'] - stats_df['unique_count']
stats_df['affected'] = stats_df['removed'] > 0
stats_df['all_removed'] = stats_df['unique_count'] == 0


In [19]:
import pandas as pd

n_total = len(stats_df)
n_affected = stats_df['affected'].sum()
n_unaffected = (~stats_df['affected']).sum()
n_all_removed = stats_df['all_removed'].sum()

print(f"Total test label files (images): {n_total}")
print(f"Affected (lost ≥1 kiln):           {n_affected}")
print(f"Not affected (all labels kept):    {n_unaffected}")
print(f"Completely empty (all removed):    {n_all_removed}")

# Histogram of # removed per file (all bins)
removed_hist = stats_df['removed'].value_counts().sort_index()

# Build crosscheck table: removed, num_files, total_kilns_removed
crosscheck = []
for removed, num_files in removed_hist.items():
    total_kilns = removed * num_files
    crosscheck.append([removed, num_files, total_kilns])

crosscheck_table = pd.DataFrame(crosscheck, columns=['removed', 'num_files', 'total_kilns_removed'])

# Add sum row at the bottom
sum_row = pd.DataFrame([{
    'removed': 'SUM',
    'num_files': crosscheck_table['num_files'].sum(),
    'total_kilns_removed': crosscheck_table['total_kilns_removed'].sum()
}])
crosscheck_table = pd.concat([crosscheck_table, sum_row], ignore_index=True)

print("\nSanity check table (with sum):")
print(crosscheck_table)
print("\nMarkdown Table:\n")
print(crosscheck_table.to_markdown(index=False))


Total test label files (images): 15738
Affected (lost ≥1 kiln):           5460
Not affected (all labels kept):    10278
Completely empty (all removed):    4393

Sanity check table (with sum):
   removed  num_files  total_kilns_removed
0        0      10278                    0
1        1       4593                 4593
2        2        657                 1314
3        3        138                  414
4        4         38                  152
5        5         14                   70
6        6          9                   54
7        7          7                   49
8        8          1                    8
9        9          2                   18
10      10          1                   10
11     SUM      15738                 6682

Markdown Table:

| removed   |   num_files |   total_kilns_removed |
|:----------|------------:|----------------------:|
| 0         |       10278 |                     0 |
| 1         |        4593 |                  4593 |
| 2         |         6

In [21]:
import pandas as pd

# -- Crosscheck Table (as before) --
removed_hist = stats_df['removed'].value_counts().sort_index()
crosscheck = []
for removed, num_files in removed_hist.items():
    total_kilns = removed * num_files
    crosscheck.append([removed, num_files, total_kilns])

crosscheck_table = pd.DataFrame(crosscheck, columns=['removed', 'num_files', 'total_kilns_removed'])

# Add sum row at the bottom
sum_row = pd.DataFrame([{
    'removed': 'SUM',
    'num_files': crosscheck_table['num_files'].sum(),
    'total_kilns_removed': crosscheck_table['total_kilns_removed'].sum()
}])
crosscheck_table = pd.concat([crosscheck_table, sum_row], ignore_index=True)

# -- Summary Table --
summary_table = pd.DataFrame([
    ["Total test images (label files)",               len(stats_df)],
    ["Test images now empty (0 unique kilns)",        (stats_df['unique_count'] == 0).sum()],
    ["Test images with ≥1 unique kiln",               (stats_df['unique_count'] > 0).sum()],
    ["Total kiln instances (before filtering)",       stats_df['orig_count'].sum()],
    ["Total unique kiln instances (after filtering)", stats_df['unique_count'].sum()],
    ["Total kilns removed (overlap)",                 stats_df['removed'].sum()],
    ["Test images affected (lost ≥1 kiln)",           (stats_df['removed'] > 0).sum()]
], columns=["Statistic", "Value"])

# -- Print both tables as markdown --
print("\n### Summary Statistics")
print(summary_table.to_markdown(index=False))
print("\n### Sanity Check Table (with sum row)")
print(crosscheck_table.to_markdown(index=False))



### Summary Statistics
| Statistic                                     |   Value |
|:----------------------------------------------|--------:|
| Total test images (label files)               |   15738 |
| Test images now empty (0 unique kilns)        |    4393 |
| Test images with ≥1 unique kiln               |   11345 |
| Total kiln instances (before filtering)       |   21100 |
| Total unique kiln instances (after filtering) |   14418 |
| Total kilns removed (overlap)                 |    6682 |
| Test images affected (lost ≥1 kiln)           |    5460 |

### Sanity Check Table (with sum row)
| removed   |   num_files |   total_kilns_removed |
|:----------|------------:|----------------------:|
| 0         |       10278 |                     0 |
| 1         |        4593 |                  4593 |
| 2         |         657 |                  1314 |
| 3         |         138 |                   414 |
| 4         |          38 |                   152 |
| 5         |          14 |      

| removed   |   num_files |   total_kilns_removed |
|:----------|------------:|----------------------:|
| 0         |       10278 |                     0 |
| 1         |        4593 |                  4593 |
| 2         |         657 |                  1314 |
| 3         |         138 |                   414 |
| 4         |          38 |                   152 |
| 5         |          14 |                    70 |
| 6         |           9 |                    54 |
| 7         |           7 |                    49 |
| 8         |           1 |                     8 |
| 9         |           2 |                    18 |
| 10        |           1 |                    10 |
| SUM       |       15738 |                  6682 |

| removed   |   num_files |   total_kilns_removed |
|:----------|------------:|----------------------:|
| 0         |       10278 |                     0 |
| 1         |        4593 |                  4593 |
| 2         |         657 |                  1314 |
| 3         |         138 |                   414 |
| 4         |          38 |                   152 |
| 5         |          14 |                    70 |
| 6         |           9 |                    54 |
| 7         |           7 |                    49 |
| 8         |           1 |                     8 |
| 9         |           2 |                    18 |
| 10        |           1 |                    10 |
| SUM       |       15738 |                  6682 |

### Summary Statistics
| Statistic                                     |   Value |
|:----------------------------------------------|--------:|
| Total test images (label files)               |   15738 |
| Test images now empty (0 unique kilns)        |    4393 |
| Test images with ≥1 unique kiln               |   11345 |
| Total kiln instances (before filtering)       |   21100 |
| Total unique kiln instances (after filtering) |   14418 |
| Total kilns removed (overlap)                 |    6682 |
| Test images affected (lost ≥1 kiln)           |    5460 |

### Sanity Check Table (with sum row)
| removed   |   num_files |   total_kilns_removed |
|:----------|------------:|----------------------:|
| 0         |       10278 |                     0 |
| 1         |        4593 |                  4593 |
| 2         |         657 |                  1314 |
| 3         |         138 |                   414 |
| 4         |          38 |                   152 |
| 5         |          14 |                    70 |
| 6         |           9 |                    54 |
| 7         |           7 |                    49 |
| 8         |           1 |                     8 |
| 9         |           2 |                    18 |
| 10        |           1 |                    10 |
| SUM       |       15738 |                  6682 |

In [25]:
import os
import shutil

# These are your stats_df index (image_name) for files not affected
unaffected_images = stats_df[stats_df['removed'] == 0].index.tolist()  # e.g. ["23.4283_68.4472.png", ...]
unaffected_basenames = [os.path.splitext(img)[0] for img in unaffected_images]


In [26]:
# test_images_dir = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/test/images'
test_labels_dir = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/test/yolo_aa_labels'
# out_images_dir = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/stratified_split/clean_test/images'
out_labels_dir = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/stratified_split/clean_test/yolo_aa_labels'

# os.makedirs(out_images_dir, exist_ok=True)
os.makedirs(out_labels_dir, exist_ok=True)


In [27]:
for base in unaffected_basenames:
    img_file = base + '.png'
    label_file = base + '.txt'
    # src_img = os.path.join(test_images_dir, img_file)
    src_lbl = os.path.join(test_labels_dir, label_file)
    # dst_img = os.path.join(out_images_dir, img_file)
    dst_lbl = os.path.join(out_labels_dir, label_file)
    # if os.path.exists(src_img):
    #     shutil.copy2(src_img, dst_img)
    if os.path.exists(src_lbl):
        shutil.copy2(src_lbl, dst_lbl)


In [None]:
import os
import shutil

unaffected_images = stats_df[stats_df['removed'] == 0].index.tolist()
unaffected_basenames = [os.path.splitext(img)[0] for img in unaffected_images]

test_labels_dir = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/final_data_neurips_2025/test/yolo_aa_labels'
out_labels_dir_1 = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/filtered_test_unaffected/yolo_obb_labels'
out_labels_dir_2 = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/filtered_test_unaffected/yolo_aa_labels'

os.makedirs(out_labels_dir_1, exist_ok=True)
os.makedirs(out_labels_dir_2, exist_ok=True)

for base in unaffected_basenames:
    label_file = base + '.txt'
    src_lbl = os.path.join(test_labels_dir, label_file)
    dst_lbl1 = os.path.join(out_labels_dir_1, label_file)
    dst_lbl2 = os.path.join(out_labels_dir_2, label_file)
    if os.path.exists(src_lbl):
        shutil.copy2(src_lbl, dst_lbl1)
        shutil.copy2(src_lbl, dst_lbl2)


In [4]:
import geopandas as gpd
import pandas as pd

# Project to metric CRS for buffering
gdf_proj = gdf_all.to_crs(gdf_all.estimate_utm_crs())
gdf_proj['buffered'] = gdf_proj.geometry.buffer((16.033))   # 0.5 meter buffer


def count_unique_kilns(gdf_proj, split):
    gdf_split = gdf_proj[gdf_proj['split'] == split].copy()
    # Use spatial join to find overlapping polygons (kilns)
    sindex = gdf_split['buffered'].sindex
    visited = set()
    groups = []
    for idx, geom in gdf_split['buffered'].items():
        if idx in visited:
            continue
        # Find all others that touch this one
        matches = set(sindex.intersection(geom.bounds))
        group = {idx}
        for other in matches:
            if other != idx:
                other_geom = gdf_split['buffered'].iloc[other]
                if geom.intersects(other_geom):
                    group.add(other)
        groups.append(group)
        visited.update(group)
    # Each group represents a unique kiln
    return len(groups)

n_unique_train = count_unique_kilns(gdf_proj, 'train')
n_unique_val   = count_unique_kilns(gdf_proj, 'val')
n_unique_test  = count_unique_kilns(gdf_proj, 'test')

print(f"Unique train kilns: {n_unique_train}")
print(f"Unique val kilns:   {n_unique_val}")
print(f"Unique test kilns:  {n_unique_test}")



Unique train kilns: 55493
Unique val kilns:   21042
Unique test kilns:  21100
