## V4 Dataset


V4 is derived from the V3 dataset:

- Only processing baseline N0500 and later (N0500+)
- Remove outliers such as unconsolidated products
- The val and test splits are inverted

In [10]:
import random
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import pandas as pd
from PIL import Image
import natsort

In [11]:
def validate_data_alignment(df_l1c, df_l2a):
    """Check that two product tables list the same ``id_key`` values row by row.

    Rows are compared by position (first with first, second with second, ...),
    independently of the DataFrame index labels, so the check also works on
    frames that were filtered or sliced and no longer carry a 0..n-1 index.

    Args:
        df_l1c: DataFrame with an ``id_key`` column.
        df_l2a: DataFrame with an ``id_key`` column.

    Returns:
        None. The result is reported on stdout: one ``Mismatch`` line per
        differing pair, a ``Length mismatch`` line when the frames do not have
        the same number of rows, and a final summary line.

    Raises:
        KeyError: if either frame has no ``id_key`` column.
    """
    # Pull the keys out as plain lists so the comparison is purely positional;
    # `series[i]` would be a label lookup and breaks on a non-default index.
    keys_l1c = df_l1c['id_key'].tolist()
    keys_l2a = df_l2a['id_key'].tolist()

    mismatches = 0
    # zip stops at the shorter list, matching the original min(len, len) bound.
    for key_l1c, key_l2a in zip(keys_l1c, keys_l2a):
        if key_l1c != key_l2a:
            print(f"Mismatch: {key_l1c} != {key_l2a}")
            mismatches += 1

    same_length = len(keys_l1c) == len(keys_l2a)
    if not same_length:
        # Unpaired trailing rows can never be "aligned"; surface them explicitly
        # instead of silently reporting success.
        print(
            f"Length mismatch: {len(keys_l1c)} vs {len(keys_l2a)} records; "
            f"only the first {min(len(keys_l1c), len(keys_l2a))} were compared"
        )

    if mismatches:
        print(f"Found {mismatches} mismatches in data alignment")
    elif same_length:
        print(f"All {len(df_l1c)} records are properly aligned")

In [3]:
def prepare_paths(path_dir):
    """Load the input/target tables of one split and attach local paths.

    Reads ``{path_dir}/input.csv`` and ``{path_dir}/target.csv`` and adds a
    ``path`` column to each. The path is built from the basename of the
    ``Name`` column with ``.SAFE`` removed, joined under ``path_dir/input``
    or ``path_dir/target`` respectively.

    Args:
        path_dir: Directory containing ``input.csv`` and ``target.csv``.

    Returns:
        Tuple ``(df_input, df_output)`` of the two DataFrames, each with the
        extra ``path`` column.
    """

    def _local_path_builder(subdir):
        # Returns the per-row mapper for one sub-directory ("input" or "target").
        def _build(product_name):
            stem = os.path.basename(product_name).replace(".SAFE", "")
            return os.path.join(path_dir, subdir, stem)

        return _build

    # Both CSVs are read before any path is derived.
    frames = {
        "input": pd.read_csv(f"{path_dir}/input.csv"),
        "target": pd.read_csv(f"{path_dir}/target.csv"),
    }

    for subdir, frame in frames.items():
        frame["path"] = frame["Name"].apply(_local_path_builder(subdir))

    return frames["input"], frames["target"]

In [12]:
# Dataset version and the per-split directories it resolves to.
version = "V4"
TRAIN_DIR = f"/mnt/disk/dataset/sentinel-ai-processor/{version}/train/"
VAL_DIR = f"/mnt/disk/dataset/sentinel-ai-processor/{version}/val/"
TEST_DIR = f"/mnt/disk/dataset/sentinel-ai-processor/{version}/test/"

# Load the (input, target) tables of each split, in train -> val -> test order.
(
    (df_train_input, df_train_output),
    (df_val_input, df_val_output),
    (df_test_input, df_test_output),
) = (prepare_paths(split_dir) for split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR))

In [16]:
# Row counts as (input, target) pairs for each split, one line per split.
print(
    f"Train data sample: {len(df_train_input), len(df_train_output)}",
    f"Val data sample: {len(df_val_input), len(df_val_output)}",
    f"Test data sample: {len(df_test_input), len(df_test_output)}",
    sep="\n",
)

Train data sample: (4227, 4227)
Val data sample: (530, 530)
Test data sample: (516, 516)


In [23]:
# Show the training input table as the cell's rich output.
df_train_input

Unnamed: 0,id_key,Name,S3Path,Footprint,GeoFootprint,cloud_cover,path,processing_baseline
0,S2A_MSIL1C_20180102T102421_N0500_R065_T32TMT,S2A_MSIL1C_20180102T102421_N0500_R065_T32TMT_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2018/01/02/S2...,geography'SRID=4326;POLYGON ((7.75237634651315...,"{'type': 'Polygon', 'coordinates': [[[7.752376...",51.648385,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,500
1,S2A_MSIL1C_20180102T102421_N0500_R065_T32UPU,S2A_MSIL1C_20180102T102421_N0500_R065_T32UPU_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2018/01/02/S2...,geography'SRID=4326;POLYGON ((10.3602792462837...,"{'type': 'Polygon', 'coordinates': [[[10.36027...",84.399474,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,500
2,S2A_MSIL1C_20180102T102421_N0500_R065_T32UPV,S2A_MSIL1C_20180102T102421_N0500_R065_T32UPV_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2018/01/02/S2...,geography'SRID=4326;POLYGON ((10.3851737331767...,"{'type': 'Polygon', 'coordinates': [[[10.38517...",92.803279,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,500
3,S2A_MSIL1C_20180102T102421_N0500_R065_T32UMU,S2A_MSIL1C_20180102T102421_N0500_R065_T32UMU_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2018/01/02/S2...,geography'SRID=4326;POLYGON ((8.07866651500257...,"{'type': 'Polygon', 'coordinates': [[[8.078666...",42.678670,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,500
4,S2A_MSIL1C_20180102T102421_N0500_R065_T32UMV,S2A_MSIL1C_20180102T102421_N0500_R065_T32UMV_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2018/01/02/S2...,geography'SRID=4326;POLYGON ((8.41368654470404...,"{'type': 'Polygon', 'coordinates': [[[8.413686...",62.201688,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,500
...,...,...,...,...,...,...,...,...
4222,S2C_MSIL1C_20250227T103021_N0511_R108_T31UGR,S2C_MSIL1C_20250227T103021_N0511_R108_T31UGR_2...,/eodata/Sentinel-2/MSI/L1C/2025/02/27/S2C_MSIL...,geography'SRID=4326;POLYGON ((6.21294280425978...,"{'type': 'Polygon', 'coordinates': [[[6.212942...",45.641285,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,511
4223,S2C_MSIL1C_20250227T103021_N0511_R108_T31UGP,S2C_MSIL1C_20250227T103021_N0511_R108_T31UGP_2...,/eodata/Sentinel-2/MSI/L1C/2025/02/27/S2C_MSIL...,geography'SRID=4326;POLYGON ((5.68779703536694...,"{'type': 'Polygon', 'coordinates': [[[5.687797...",51.466775,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,511
4224,S2B_MSIL1C_20250228T104909_N0511_R051_T31UFP,S2B_MSIL1C_20250228T104909_N0511_R051_T31UFP_2...,/eodata/Sentinel-2/MSI/L1C/2025/02/28/S2B_MSIL...,geography'SRID=4326;POLYGON ((5.04386674293589...,"{'type': 'Polygon', 'coordinates': [[[5.043866...",68.644181,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,511
4225,S2B_MSIL1C_20250301T101849_N0511_R065_T32TNS,S2B_MSIL1C_20250301T101849_N0511_R065_T32TNS_2...,/eodata/Sentinel-2/MSI/L1C/2025/03/01/S2B_MSIL...,geography'SRID=4326;POLYGON ((8.99973715756221...,"{'type': 'Polygon', 'coordinates': [[[8.999737...",42.113142,/mnt/disk/dataset/sentinel-ai-processor/V4/tra...,511
