In [2]:
import os
import json
import shutil
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold

# Inputs
csv_paths = [
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/brick_kilns_neurips_2025/a/seasons1_lucknow_airshed_land_cover_distribution.csv",
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/brick_kilns_neurips_2025/a/seasons2_lucknow_airshed_land_cover_distribution.csv",
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/brick_kilns_neurips_2025/a/seasons3_lucknow_airshed_land_cover_distribution.csv",
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/brick_kilns_neurips_2025/a/seasons4_lucknow_airshed_land_cover_distribution.csv",
]

img_roots = [
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/seasons1_lucknow_airshed/images",
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/seasons2_lucknow_airshed/images",
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/seasons3_lucknow_airshed/images",
    "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/processed_data/sentinel/seasons4_lucknow_airshed/images",
]

# Output root
out_root = Path("classified_dataset")
(out_root / "train" / "images").mkdir(parents=True, exist_ok=True)
(out_root / "test" / "images").mkdir(parents=True, exist_ok=True)

# Canonical class order for id mapping
landcover_cols = [
    "Tree cover",
    "Shrubland",
    "Grassland",
    "Cropland",
    "Built-up",
    "Bare / sparse vegetation",
    "Permanent water bodies",
    "Herbaceous wetland",
]

# Build mapping dicts
class_to_id = {c: i for i, c in enumerate(landcover_cols)}
id_to_class = {i: c for c, i in class_to_id.items()}

# Save mapping
with open(out_root / "class_to_id.json", "w") as f:
    json.dump(class_to_id, f, indent=2)
with open(out_root / "id_to_class.json", "w") as f:
    json.dump(id_to_class, f, indent=2)
with open(out_root / "classes.txt", "w") as f:
    for i in range(len(landcover_cols)):
        f.write(f"{i}\t{id_to_class[i]}\n")

# Load, merge, and assign labels
dfs = []
for season_no, (csv_path, img_root) in enumerate(zip(csv_paths, img_roots), 1):
    df = pd.read_csv(csv_path)
    df["season_no"] = season_no
    df["image_path"] = df["filename"].apply(lambda x: os.path.join(img_root, x))

    # lat lon from filename
    latlon = df["filename"].str.replace(".png", "", regex=False).str.split("_", expand=True)
    df["lat"] = latlon[0].astype(float)
    df["lon"] = latlon[1].astype(float)
    df["latlon"] = df["lat"].round(6).astype(str) + "_" + df["lon"].round(6).astype(str)

    # argmax landcover
    df["landcover_type"] = df[landcover_cols].idxmax(axis=1)
    df["landcover_id"] = df["landcover_type"].map(class_to_id).astype(int)

    dfs.append(df[["image_path", "season_no", "lat", "lon", "latlon", "landcover_type", "landcover_id"]])

full_df = pd.concat(dfs, ignore_index=True)

# drop rows with missing images
exists_mask = full_df["image_path"].apply(os.path.exists)
full_df = full_df[exists_mask].reset_index(drop=True)

# filter classes with at least two samples for stratification
counts = full_df["landcover_id"].value_counts().sort_index()
keep_ids = counts[counts >= 2].index.tolist()
if len(keep_ids) < 2:
    # fall back to non stratified if almost all are singletons
    train_df, test_df = train_test_split(
        full_df, test_size=0.40, random_state=42, shuffle=True
    )
else:
    filtered_df = full_df[full_df["landcover_id"].isin(keep_ids)].reset_index(drop=True)
    counts2 = filtered_df["landcover_id"].value_counts().sort_index()

    # try stratified split
    try:
        train_df, test_df = train_test_split(
            filtered_df,
            test_size=0.40,
            random_state=42,
            stratify=filtered_df["landcover_id"]
        )
    except ValueError:
        # second attempt with StratifiedKFold
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        y = filtered_df["landcover_id"].values
        train_idx, test_idx = next(skf.split(filtered_df, y))
        train_df = filtered_df.iloc[train_idx].reset_index(drop=True)
        test_df = filtered_df.iloc[test_idx].reset_index(drop=True)

# Persist labels
train_df.to_csv(out_root / "train_labels.csv", index=False)
test_df.to_csv(out_root / "test_labels.csv", index=False)

# Copy images
def copy_split(df, split):
    dst_dir = out_root / split / "images"
    for p in df["image_path"]:
        shutil.copy(p, dst_dir / os.path.basename(p))

copy_split(train_df, "train")
copy_split(test_df, "test")

# Quick distribution sanity checks
train_counts = train_df["landcover_id"].value_counts().sort_index()
test_counts = test_df["landcover_id"].value_counts().sort_index()
train_counts.to_csv(out_root / "train_class_counts.csv")
test_counts.to_csv(out_root / "test_class_counts.csv")

print("Done")
print("Train size:", len(train_df))
print("Test size:", len(test_df))
print("Classes kept for stratification:", sorted(train_df["landcover_id"].unique().tolist()))
print("Class mapping:", class_to_id)


Done
Train size: 12441
Test size: 8294
Classes kept for stratification: [0, 1, 2, 3, 4, 5]
Class mapping: {'Tree cover': 0, 'Shrubland': 1, 'Grassland': 2, 'Cropland': 3, 'Built-up': 4, 'Bare / sparse vegetation': 5, 'Permanent water bodies': 6, 'Herbaceous wetland': 7}


In [6]:
import os
import json
import math
import argparse
from pathlib import Path
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd
from tqdm import tqdm

# ----------------------------
# Import Galileo encoder
# ----------------------------
# --- Add Galileo to Path and Import ---
sys.path.insert(0, str(Path.cwd() / "../models"))
print(sys.path)
try:
    from galileo import (
        Encoder as GalileoEncoder,
        SPACE_TIME_BANDS, SPACE_TIME_BANDS_GROUPS_IDX,
        SPACE_BANDS, SPACE_BAND_GROUPS_IDX,
        TIME_BANDS, TIME_BAND_GROUPS_IDX,
        STATIC_BANDS, STATIC_BAND_GROUPS_IDX
    )
except ImportError:
    print("Error: Could not import Galileo model. Make sure 'models/galileo.py' exists.")
    sys.exit(1)

['/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/iclr_2026/notebooks/../models', '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/iclr_2026/notebooks/models', '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/iclr_2026/notebooks/models', '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/Foundation-Models/iclr_2026/notebooks/models', '/opt/anaconda3/envs/rishabh_sat/lib/python312.zip', '/opt/anaconda3/envs/rishabh_sat/lib/python3.12', '/opt/anaconda3/envs/rishabh_sat/lib/python3.12/lib-dynload', '', '/home/rishabh.mondal/.local/lib/python3.12/site-packages', '/opt/anaconda3/envs/rishabh_sat/lib/python3.12/site-packages', '/opt/anaconda3/envs/rishabh_sat/lib/python3.12/site-packages/ISR-2.2.0-py3.12.egg', '/opt/anaconda3/envs/rishabh_sat/lib/python3.12/site-packages/setuptools/_vendor', '/home/rishabh.mondal/solo-learn', '/tmp/tmp57e8byie']


usage: ipykernel_launcher.py [-h] --train_csv TRAIN_CSV --test_csv TEST_CSV
                             --galileo_ckpt GALILEO_CKPT [--out_dir OUT_DIR]
                             [--epochs EPOCHS] [--batch_size BATCH_SIZE]
                             [--lr LR] [--weight_decay WEIGHT_DECAY]
                             [--img_size IMG_SIZE] [--num_classes NUM_CLASSES]
                             [--freeze_backbone] [--pool POOL]
ipykernel_launcher.py: error: argument --freeze_backbone: ignored explicit argument '/run/user/1023/jupyter/runtime/kernel-v328344a32963654e05365706f4269d84bd597375d.json'


AttributeError: 'tuple' object has no attribute 'tb_frame'