In [1]:
# Clone the InstaGeo-E2E-Geospatial-ML repository from GitHub.
# IPython expands {repository_url} before handing the line to the shell.
repository_url = "https://github.com/instadeepai/InstaGeo-E2E-Geospatial-ML"
!git clone {repository_url}

Cloning into 'InstaGeo-E2E-Geospatial-ML'...
remote: Enumerating objects: 374, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 374 (delta 80), reused 61 (delta 61), pack-reused 269 (from 1)[K
Receiving objects: 100% (374/374), 1.42 MiB | 10.33 MiB/s, done.
Resolving deltas: 100% (207/207), done.


In [2]:
%%bash
# Pull the latest upstream commits into the checkout created by the clone cell.
# The path is relative, so this relies on the notebook's working directory
# being the one the repository was cloned into.
cd InstaGeo-E2E-Geospatial-ML
git pull

Already up to date.


In [3]:
%%capture
%%bash
# %%capture (first line) silences everything this bash cell prints.
# Navigate to the cloned InstaGeo-E2E-Geospatial-ML directory
cd /kaggle/working/InstaGeo-E2E-Geospatial-ML
# Stash any local changes to avoid conflicts when switching branches
git stash
# Switch to the 'geo-ai-hack' branch (presumably the hackathon-specific code)
git checkout geo-ai-hack
# Install the InstaGeo package in editable mode with its "all" extras
pip install -e .[all]

In [4]:
# Import necessary libraries
import os
import re
import shutil
import yaml
import pandas as pd
import numpy as np
from pathlib import Path
from pyproj import CRS, Transformer
import rasterio
os.environ["HYDRA_FULL_ERROR"] ="1"

In [5]:
import os
import re
import shutil
import os
import shutil
from pathlib import Path

def copy_last_25_percent_with_seg_maps(
    input_chips_dir,
    input_segmaps_dir,
    output_chips_dir,
    output_segmaps_dir,
    fraction=0.31,
):
    """
    Copy the trailing share of the ``.tif`` chips, plus their segmentation maps.

    Chips in ``input_chips_dir`` are sorted by file name and the last
    ``int(total * fraction)`` of them are copied to ``output_chips_dir``.
    For every copied chip, the segmentation map with the same name (leading
    ``chip_`` replaced by ``seg_map_``) is copied from ``input_segmaps_dir``
    to ``output_segmaps_dir`` when it exists.

    Note: despite the function name, the share that has always been used is
    31%, not 25%; ``fraction`` keeps that value as its default so existing
    calls behave as before.

    Args:
        input_chips_dir (str | Path): Folder holding the source chips.
        input_segmaps_dir (str | Path): Folder holding the source seg maps.
        output_chips_dir (str | Path): Destination for chips (created if missing).
        output_segmaps_dir (str | Path): Destination for seg maps (created if missing).
        fraction (float): Share of the chips to copy, between 0 and 1.

    Raises:
        ValueError: If ``fraction`` is outside ``[0, 1]``.

    Example:
      - input_chips_dir = "/kaggle/input/hls_train/hls_train/chips"
      - input_segmaps_dir = "/kaggle/input/hls_train/hls_train/seg_maps"
      - output_chips_dir = "/kaggle/working/hls_train_subset/hls_train_subset/chips"
      - output_segmaps_dir = "/kaggle/working/hls_train_subset/hls_train_subset/seg_maps"
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be within [0, 1], got {fraction!r}")

    input_chips_dir = Path(input_chips_dir)
    input_segmaps_dir = Path(input_segmaps_dir)
    output_chips_dir = Path(output_chips_dir)
    output_segmaps_dir = Path(output_segmaps_dir)

    output_chips_dir.mkdir(parents=True, exist_ok=True)
    output_segmaps_dir.mkdir(parents=True, exist_ok=True)

    # Alphabetical order; this only means "most recent" if the file naming is
    # chronological.
    all_chip_files = sorted(f for f in os.listdir(input_chips_dir) if f.endswith(".tif"))
    total = len(all_chip_files)
    if total == 0:
        print(f"[WARNING] Aucun TIFF dans {input_chips_dir}")
        return

    subset_count = int(total * fraction)
    # Slice from an explicit start index: `lst[-0:]` would return the WHOLE
    # list when subset_count rounds down to zero.
    subset_chips = all_chip_files[total - subset_count:]

    print(f"[INFO] Trouvé {total} TIFF dans {input_chips_dir}")
    print(f"       On copie les {subset_count} derniers vers {output_chips_dir}")

    copied = 0
    missing_seg_maps = 0
    for chip_file in subset_chips:
        shutil.copy2(input_chips_dir / chip_file, output_chips_dir / chip_file)
        copied += 1

        # Only the first "chip_" is the prefix; leave later occurrences alone.
        seg_file = chip_file.replace("chip_", "seg_map_", 1)
        seg_src = input_segmaps_dir / seg_file
        if seg_src.exists():
            shutil.copy2(seg_src, output_segmaps_dir / seg_file)
        else:
            missing_seg_maps += 1

    if missing_seg_maps:
        print(f"[WARNING] {missing_seg_maps} chips sans seg_map dans {input_segmaps_dir}")
    print(f"[OK] {copied} fichiers copiés dans {output_chips_dir} + seg_maps associées.")


In [6]:
# Build a working subset of the training data: the source folders are under
# /kaggle/input and the copies land under /kaggle/working.
copy_last_25_percent_with_seg_maps(
    input_chips_dir   = "/kaggle/input/geo-ai-hack/hls_train/hls_train/chips",
    input_segmaps_dir = "/kaggle/input/geo-ai-hack/hls_train/hls_train/seg_maps",
    output_chips_dir  = "/kaggle/working/hls_train_subset/hls_train_subset/chips",
    output_segmaps_dir= "/kaggle/working/hls_train_subset/hls_train_subset/seg_maps"
)


[INFO] Trouvé 10428 TIFF dans /kaggle/input/geo-ai-hack/hls_train/hls_train/chips
       On copie les 3232 derniers vers /kaggle/working/hls_train_subset/hls_train_subset/chips
[OK] 3232 fichiers copiés dans /kaggle/working/hls_train_subset/hls_train_subset/chips + seg_maps associées.


In [7]:
import os
import numpy as np
import rasterio
from pathlib import Path

def replace_swir2_with_ndwi_inplace_multitime_no_spike(chips_dir):
    """
    Replace the SWIR2 band of every time step with NDWI, rewriting chips in place.

    Each ``.tif`` in ``chips_dir`` is expected to hold 18 bands
    (3 time steps x 6 bands). For every time step:

        NDWI = (NIR - SWIR1) / (NIR + SWIR1 + 1e-6)

    is computed and stored in place of SWIR2, so the file keeps 18 bands but
    is rewritten as float32. Chips with a different band count are reported
    and left untouched.

    Band layout inside one time step:
       0 = Blue
       1 = Green
       2 = Red
       3 = NIR
       4 = SWIR1
       5 = SWIR2  --> replaced by NDWI

    The new content is written to a temporary sibling file and then moved over
    the original with ``os.replace``, so a failed write never destroys a chip.

    Args:
        chips_dir (str | Path): Folder containing the chips to rewrite.
    """
    chips_dir = Path(chips_dir)
    time_steps = 3
    bands_per_ts = 6
    expected_bands = time_steps * bands_per_ts

    # os.listdir returns a snapshot, so temp files created below are never visited.
    for chip_file in os.listdir(chips_dir):
        if not chip_file.endswith(".tif"):
            continue

        chip_path = chips_dir / chip_file

        with rasterio.open(chip_path) as src:
            # Validate before touching anything on disk.
            if src.count != expected_bands:
                print(f"[SKIP] {chip_file}: {src.count} canaux (pas 18).")
                continue
            array = src.read().astype(np.float32)  # (bands, H, W)
            profile = src.profile

        for t in range(time_steps):
            offset_in = t * bands_per_ts
            nir = array[offset_in + 3]
            swir1 = array[offset_in + 4]

            # NOTE(review): the ratio is not clipped, so a denominator close to
            # zero yields very large values; the band statistics hard-coded
            # later in this notebook presumably reflect that. Confirm whether
            # clipping to [-1, 1] is wanted before changing it, because the
            # statistics would have to be recomputed.
            ndwi = (nir - swir1) / (nir + swir1 + 1e-6)

            array[offset_in + 5] = ndwi

        profile.update(dtype="float32", count=expected_bands)

        # The ".tmp" suffix keeps an interrupted run from leaving a file that a
        # later ".tif" scan would pick up; the driver comes from `profile`.
        tmp_path = chip_path.with_name(chip_file + ".tmp")
        try:
            with rasterio.open(tmp_path, "w", **profile) as dst:
                dst.write(array)
            os.replace(tmp_path, chip_path)
        finally:
            if tmp_path.exists():
                tmp_path.unlink()

    print(f"[OK] SWIR2 remplacé par NDWI (in-place, 18 canaux) dans : {chips_dir}")


In [9]:
# Rewrite the copied subset in place (the chips under /kaggle/working).
replace_swir2_with_ndwi_inplace_multitime_no_spike("/kaggle/working/hls_train_subset/hls_train_subset/chips")


[OK] SWIR2 remplacé par NDWI (in-place, 18 canaux) dans : /kaggle/working/hls_train_subset/hls_train_subset/chips


In [10]:
# Sanity check: print the dimensions of one chip from the rewritten subset.
chips_dir = "/kaggle/working/hls_train_subset/hls_train_subset/chips"  # adjust the path to your setup

# Keep only the TIFF entries of the folder.
tif_files = list(filter(lambda entry: entry.endswith(".tif"), os.listdir(chips_dir)))

if not tif_files:
    print("Aucun fichier .tif trouvé dans", chips_dir)
else:
    first_tif = tif_files[0]
    first_tif_path = os.path.join(chips_dir, first_tif)

    with rasterio.open(first_tif_path) as src:
        # Option 1: load every band; the array is (bands, height, width).
        array = src.read()
        print("array.shape :", array.shape)

        # Option 2: read the same numbers from the dataset attributes.
        print("src.count  :", src.count)
        print("src.height :", src.height)
        print("src.width  :", src.width)


array.shape : (18, 256, 256)
src.count  : 18
src.height : 256
src.width  : 256


In [11]:
def generate_label_mapping(root_dir, input_subdir, output_csv):
    """
    Generate a CSV mapping input chips to corresponding segmentation maps.

    The ``.tif`` files found in ``<root_dir>/<input_subdir>/chips`` are listed
    in sorted order (so the CSV is reproducible) and written to an ``Input``
    column as paths relative to ``root_dir``. When
    ``<root_dir>/<input_subdir>/seg_maps`` exists, a ``Label`` column is added
    with the matching seg-map path, obtained by replacing the first ``chip`` in
    the file name with ``seg_map``. The existence of each individual seg-map
    file is not checked.

    Args:
        root_dir (str or Path): Root directory containing the subdirectories for chips and segmentation maps.
        input_subdir (str): Subdirectory path for chips within the root directory.
        output_csv (str or Path): Output path for the generated CSV file.
    """
    root_dir = Path(root_dir)
    chips_dir = root_dir / input_subdir / "chips"
    seg_maps_dir = root_dir / input_subdir / "seg_maps"

    # Sorted + filtered: os.listdir order is arbitrary and may include non-chips.
    chip_names = sorted(name for name in os.listdir(chips_dir) if name.endswith(".tif"))

    columns = {"Input": [f"{input_subdir}/chips/{name}" for name in chip_names]}
    if seg_maps_dir.exists():
        # Rename only the first "chip"; a blanket replace would also rewrite any
        # later occurrence inside the file name.
        seg_names = [name.replace("chip", "seg_map", 1) for name in chip_names]
        columns["Label"] = [f"{input_subdir}/seg_maps/{name}" for name in seg_names]

    df = pd.DataFrame(columns)
    df.to_csv(output_csv, index=False)

    print(f"Number of rows is: {df.shape[0]}")
    print(f"CSV generated and saved to: {output_csv}")

In [12]:
# Build the chip / seg-map mapping for the subset stored under
# /kaggle/working/hls_train_subset/hls_train_subset (18-band chips).

root_dir = "/kaggle/working"  # Main working directory
generate_label_mapping(
    root_dir=root_dir,
    input_subdir="hls_train_subset/hls_train_subset",  # Path to the chips, relative to root_dir
    output_csv="train_ds_subset.csv"
)

# TODO: the test set still has to be added afterwards

Number of rows is: 3232
CSV generated and saved to: train_ds_subset.csv


In [13]:
def split_validation_data(
    mapping_csv,
    validation_split=0.3,
    train_csv="train_split.csv",
    validation_csv="validation_split.csv",
    random_state=42,
):
    """
    Shuffle a chip/seg-map mapping CSV and write train and validation CSVs.

    The rows are shuffled with a fixed seed; the first
    ``int(len(df) * validation_split)`` rows form the validation set and the
    remaining rows the training set. No image file is moved or copied.

    Args:
        mapping_csv (str or Path): Path to the CSV file containing the mapping between `chips` and `seg_maps`.
        validation_split (float): Fraction of the data to use as the validation set, between 0 and 1.
        train_csv (str or Path): Where to write the training rows.
        validation_csv (str or Path): Where to write the validation rows.
        random_state (int): Seed used for shuffling, so the split is reproducible.

    Raises:
        ValueError: If ``validation_split`` is outside ``[0, 1]``.
    """
    if not 0 <= validation_split <= 1:
        raise ValueError(f"validation_split must be within [0, 1], got {validation_split!r}")

    df = pd.read_csv(mapping_csv)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    num_val = int(len(df) * validation_split)
    val_df = df.iloc[:num_val]
    train_df = df.iloc[num_val:]

    train_df.to_csv(train_csv, index=False)
    print(f"CSV train split  saved to: {train_csv}")
    val_df.to_csv(validation_csv, index=False)
    print(f"CSV validation split  saved to: {validation_csv}")
    

In [14]:
split_validation_data("train_ds_subset.csv", validation_split=0.3)
# This creates train_split.csv and validation_split.csv


CSV train split  saved to: train_split.csv
CSV validation split  saved to: validation_split.csv


In [15]:
# Check that a chip has the expected number of bands.
# The path is relative, so it depends on the notebook's working directory.

with rasterio.open("hls_train_subset/hls_train_subset/chips/chip_20201201_S30_T37QGB_2020304T074041_3_4.tif") as src : 
    print(src.count)

18


In [16]:
# Run InstaGeo in `stats` mode over the subset listed in train_ds_subset.csv.
# The per-band mean/std reported by this run are the values copied by hand
# into later cells; the mean/std passed here are presumably just starting
# values for that computation - TODO confirm against instageo.model.run.
!python -m instageo.model.run \
    --config-name=locust \
    root_dir="/kaggle/working" \
    train_filepath="train_ds_subset.csv" \
    dataloader.temporal_dim=3 \
    train.batch_size=8 \
    train.num_epochs=1 \
    mode=stats \
    dataloader.bands=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]\
    dataloader.mean=[623.2724609375,1247.657958984375,1772.24169921875,2371.256103515625,2862.867431640625,2357.759765625]\
    dataloader.std=[2182.050048828125,2248.420654296875,2302.53515625,2372.204345703125,2398.52685546875,2292.96435546875]



[2025-02-05 12:39:44,080][__main__][INFO] - Script: /kaggle/working/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-05 12:39:44,083][__main__][INFO] - Imported hydra config:
root_dir: /kaggle/working
valid_filepath: null
train_filepath: train_ds_subset.csv
test_filepath: null
checkpoint_path: null
output_dir: null
mode: stats
train:
  learning_rate: 0.0001
  num_epochs: 1
  batch_size: 8
  class_weights:
  - 1
  - 1
  ignore_index: -1
  weight_decay: 0.1
model:
  freeze_backbone: false
  num_classes: 2
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
  - 9
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  mean:
  - 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
  - 2862.867431640625
  - 2357.759765625
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
  - 2372.204345703125
  - 2398.52685546875
  - 2292.96435546875
  img_size: 256
  temporal_dim: 3
  replace_label:
  - -9999
  - -1
  reduce_to

In [17]:
def load_yml(filepath):
    """Parse a YAML file and return its content.

    Args:
        filepath (str | Path): Location of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` (a dict for a mapping
        document).

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    with open(Path(filepath)) as stream:
        parsed = yaml.safe_load(stream)
    return parsed
        
def save_yml(data,filepath):
    """Serialise ``data`` to a YAML file, overwriting any existing file.

    Args:
        data (Dict): The object to write.
        filepath (str | Path): Destination path of the YAML file.
    """
    target = Path(filepath)
    with open(target, "w") as stream:
        yaml.dump(data, stream)
    print(f"Data saved to {target}.")

In [18]:

locust_cfg_path = "InstaGeo-E2E-Geospatial-ML/instageo/model/configs/locust.yaml"
locust_cfg = load_yml(locust_cfg_path)

# Copy in the mean/std values reported by the stats run above.
# They belong under the `dataloader` section, which is where the config keeps
# them (see the `dataloader.mean=` / `dataloader.std=` overrides used on the
# command line). Writing them as top-level keys leaves the normalisation
# actually used by the dataloader unchanged.
locust_cfg["dataloader"]["mean"] = [867.623046875, 1439.7684326171875, 1951.5145263671875, 2704.58984375, 3148.905029296875, -60.25557327270508]
locust_cfg["dataloader"]["std"] = [1995.3145751953125, 2033.8626708984375, 2060.29931640625, 2110.346923828125, 2111.359619140625, 139875.90625]

save_yml(locust_cfg, locust_cfg_path)


Data saved to InstaGeo-E2E-Geospatial-ML/instageo/model/configs/locust.yaml.


In [19]:
import pandas as pd
import rasterio
import os

# Census of band counts over every chip listed in the mapping CSV.
df = pd.read_csv("train_ds_subset.csv")

root_dir = "/kaggle/working"  # adjust if needed
shapes_count = {}

for path_rel in df["Input"]:
    path_abs = os.path.join(root_dir, path_rel)
    if os.path.exists(path_abs):
        with rasterio.open(path_abs) as src:
            band_count = src.count
        # One more file seen with this number of bands.
        shapes_count.setdefault(band_count, 0)
        shapes_count[band_count] += 1
    else:
        print(f"[WARNING] Fichier introuvable : {path_abs}")

print("Résumé des shapes rencontrées (nombre de canaux) :")
for band_count, nb_files in shapes_count.items():
    print(f" - {band_count} canaux : {nb_files} fichier(s)")


Résumé des shapes rencontrées (nombre de canaux) :
 - 18 canaux : 3232 fichier(s)


In [None]:
# Train the InstaGeo model using the Locust configuration
!python -m instageo.model.run  --config-name=locust \
    hydra.run.dir="/kaggle/working/outputs/first_run" \
    root_dir="/kaggle/working" \
    train.batch_size=8 \
    train.num_epochs=10\
    mode=train \
    dataloader.mean="[867.623046875, 1439.7684326171875, 1951.5145263671875, 2704.58984375, 3148.905029296875, -60.25557327270508]"\
    dataloader.std="[1995.3145751953125, 2033.8626708984375, 2060.29931640625, 2110.346923828125, 2111.359619140625, 139875.90625]"\
    train_filepath="train_ds_subset.csv" \
    valid_filepath="validation_split.csv"

[2025-02-05 12:43:58,970][__main__][INFO] - Script: /kaggle/working/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-05 12:43:58,974][__main__][INFO] - Imported hydra config:
checkpoint_path: null
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
  - 9
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  constant_multiplier: 1.0
  img_size: 256
  mean:
  - 867.623046875
  - 1439.7684326171875
  - 1951.5145263671875
  - 2704.58984375
  - 3148.905029296875
  - -60.25557327270508
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
  - -9999
  - -1
  std:
  - 1995.3145751953125
  - 2033.8626708984375
  - 2060.29931640625
  - 2110.346923828125
  - 2111.359619140625
  - 139875.90625
  temporal_dim: 3
mean:
- 867.623046875
- 1439.7684326171875
- 1951.5145263671875
- 2704.58984375
- 3148.905029296875
- -60.25557327270508
mode: train
model:
  freeze_backbone: false
  num_classes: 2
output_dir: null
root_dir: /kaggle/working
std:
- 1995.31457519

In [None]:
# Build the mapping CSV for the test chips, which are read from the input
# dataset folder rather than from /kaggle/working.
generate_label_mapping('/kaggle/input/geo-ai-hack', 'hls_test/hls_test', "test_ds.csv")

In [None]:
%%bash
# Evaluate the best checkpoint on the held-out validation split.
# root_dir must be /kaggle/working: the paths inside validation_split.csv were
# generated relative to that directory, not to the read-only input dataset.
# The normalisation statistics are passed explicitly so that evaluation uses
# the same values as the training command.
python -m instageo.model.run --config-name=locust \
    root_dir="/kaggle/working" \
    test_filepath="validation_split.csv" \
    train.batch_size=16 \
    dataloader.mean="[867.623046875, 1439.7684326171875, 1951.5145263671875, 2704.58984375, 3148.905029296875, -60.25557327270508]" \
    dataloader.std="[1995.3145751953125, 2033.8626708984375, 2060.29931640625, 2110.346923828125, 2111.359619140625, 139875.90625]" \
    checkpoint_path='/kaggle/working/outputs/first_run/instageo_best_checkpoint.ckpt' \
    mode=eval

In [None]:
%%bash
# Run chip-level inference on the test chips and write the predictions to
# /kaggle/working/predictions.
# The normalisation statistics are passed explicitly so that inference uses
# the same values as the training command.
# NOTE(review): the training chips had their SWIR2 band replaced by NDWI, but
# the test chips under /kaggle/input are read as-is here. The model therefore
# sees a different sixth band at inference time; the test chips presumably
# need the same transformation (on a writable copy) before this step.
python -m instageo.model.run --config-name=locust \
    root_dir="/kaggle/input/geo-ai-hack" \
    test_filepath="test_ds.csv" \
    train.batch_size=16 \
    dataloader.mean="[867.623046875, 1439.7684326171875, 1951.5145263671875, 2704.58984375, 3148.905029296875, -60.25557327270508]" \
    dataloader.std="[1995.3145751953125, 2033.8626708984375, 2060.29931640625, 2110.346923828125, 2111.359619140625, 139875.90625]" \
    checkpoint_path='/kaggle/working/outputs/first_run/instageo_best_checkpoint.ckpt' \
    output_dir='/kaggle/working/predictions' \
    mode=chip_inference

In [None]:


# Folder that the chip_inference run writes to, and a one-off listing of its
# content; get_prediction_value() reads both names as globals.
predictions_directory = "/kaggle/working/predictions"
prediction_files = os.listdir(predictions_directory)

def get_prediction_value(row):
    """Look up the predicted value at one test point.

    Prediction rasters whose file name contains both ``row['date']`` and
    ``row['mgrs_tile_id']`` are scanned in listing order; the first one that
    contains the point supplies the value.

    Args:
        row: A record providing ``date``, ``mgrs_tile_id``, ``x`` and ``y``.
            ``x``/``y`` are treated as EPSG:4326 longitude/latitude.

    Returns:
        tuple: ``(pixel value, file name)``, or ``(nan, nan)`` when no
        matching raster contains the point.
    """
    matching_files = [f for f in prediction_files if (str(row['date']) in f) and (row['mgrs_tile_id'] in f)]
    for file in matching_files:
        with rasterio.open(f"{predictions_directory}/{file}") as src:
            # Project the lon/lat point into the raster's CRS.
            transformer = Transformer.from_crs(CRS.from_epsg(4326), src.crs, always_xy=True)
            x_chip, y_chip = transformer.transform(row['x'], row['y'])

            # rowcol() returns (row, col) - i.e. (y index, x index) - so the
            # row is checked against the height and used as the first array
            # index. Unpacking it as (x, y) transposes the pixel lookup.
            affine_transform = rasterio.transform.AffineTransformer(src.transform)
            row_idx, col_idx = affine_transform.rowcol(x_chip, y_chip)

            if 0 <= row_idx < src.height and 0 <= col_idx < src.width:
                return src.read(1)[row_idx, col_idx], file
    return (np.nan, np.nan)

In [None]:
# Load the test points, attach the prediction (and the raster it came from)
# to each row, then write the two columns kept for the submission file.
submission_df = pd.read_csv("/kaggle/input/geo-ai-hack/test.csv")

# result_type='expand' turns each returned (value, file) tuple into two columns.
submission_df[['prediction', 'filename']] = submission_df.apply(get_prediction_value, axis=1, result_type='expand')
submission_df[["id","prediction"]].to_csv("hls_submission.csv",index=False)

In [None]:
ls kaggle/working