In [1]:
import pandas as pd
import numpy as np

# --- Configuration: input/output files and the column groups used below ---
IN_PATH = "sentinel2_features.csv"
OUT_PATH = "sentinel2_features_clean.csv"

BANDS = ["B02", "B03", "B04", "B08", "B11", "B12"]  # band columns read from the input
IDX = ["NDVI", "NDWI", "NDMI"]                      # index columns derived later

# --- Load the raw table and normalise header whitespace ---
df = pd.read_csv(IN_PATH)
df.columns = df.columns.str.strip()

# Timestamp columns and the parser options each one needs. Values that cannot
# be parsed become NaT (errors="coerce") rather than raising.
# NOTE(review): "Sample Date" is parsed day-first; ambiguous values such as
# 05-08-2015 parse silently either way, so confirm the raw file is DD-MM-YYYY.
datetime_options = {
    "Sample Date": {"dayfirst": True},
    "s2_datetime": {"utc": True},
}
for column, options in datetime_options.items():
    if column in df.columns:
        df[column] = pd.to_datetime(df[column], errors="coerce", **options)

# Coerce coordinate, cloud and band columns to numbers; unparseable cells
# become NaN. Columns missing from the file are skipped.
num_cols = [
    column
    for column in ["Latitude", "Longitude", "s2_cloud", *BANDS]
    if column in df.columns
]
df = df.assign(
    **{column: pd.to_numeric(df[column], errors="coerce") for column in num_cols}
)

# Identifying fields a row must have; fields absent from the file are not
# required.
required_base = [
    column
    for column in ("Latitude", "Longitude", "Sample Date", "s2_item_id")
    if column in df.columns
]
band_cols = [band for band in BANDS if band in df.columns]

# Drop rows missing an identifying field, then rows missing any band value.
df = df.dropna(subset=required_base).copy()
df = df.dropna(subset=band_cols).copy()

# Keep only rows whose every band value lies in the closed range [0, 1.5].
# NOTE(review): the bounds presumably reflect a reflectance scale of roughly
# 0..1 with some headroom -- confirm against the upstream extraction.
within_bounds = np.ones(len(df), dtype=bool)
for band in band_cols:
    within_bounds &= df[band].between(0, 1.5).to_numpy()
df = df[within_bounds]

# --- Spectral indices ---
# Each index is a normalised difference (a - b) / (a + b) of two band columns.
# Mapping: index column name -> (band a, band b).
INDEX_BANDS = {
    "NDVI": ("B08", "B04"),
    "NDWI": ("B03", "B08"),
    "NDMI": ("B08", "B11"),
}

# Added to the denominator so a row where both bands are 0 evaluates to 0
# instead of the NaN that 0/0 would produce.
eps = 1e-12

for index_name, (band_a, band_b) in INDEX_BANDS.items():
    # Every other step of this script tolerates absent columns, so do the same
    # here: report and skip an index whose inputs are missing instead of
    # failing with a KeyError. The final column selection already ignores
    # index columns that were never created.
    missing = [band for band in (band_a, band_b) if band not in df.columns]
    if missing:
        print(f"Skipping {index_name}: missing band column(s) {missing}")
        continue

    ratio = (df[band_a] - df[band_b]) / (df[band_a] + df[band_b] + eps)
    # Limit the stored value to the range [-1, 1].
    df[index_name] = ratio.clip(-1, 1)

# --- Deduplicate: keep the "best" observation per (lat, lon, sample date) ---
# Ranking: lowest s2_cloud first, then earliest s2_datetime. Whichever of the
# two ranking columns exists in the frame is used; with neither, the current
# row order decides.
sort_cols = [c for c in ("s2_cloud", "s2_datetime") if c in df.columns]
if sort_cols:
    # Ask for a stable algorithm: pandas applies `kind` when sorting on a
    # single column, where the default quicksort gives no guarantee about the
    # order of tied rows. With a stable sort, ties keep their existing
    # relative order, so keep="first" below picks a predictable survivor.
    df = df.sort_values(sort_cols, kind="mergesort")

df = (
    df.drop_duplicates(subset=["Latitude", "Longitude", "Sample Date"], keep="first")
    .reset_index(drop=True)
)

# Output columns in their final order: identifiers, scene metadata, bands,
# then derived indices. Columns that do not exist in the frame are left out.
wanted_cols = [
    "Latitude", "Longitude", "Sample Date",
    "s2_item_id", "s2_datetime", "s2_cloud",
    *BANDS,
    *IDX,
]
keep_cols = [name for name in wanted_cols if name in df.columns]
df = df.loc[:, keep_cols].copy()

# Write the cleaned table without the index column, report the row count, and
# show the first rows as the cell's output.
df.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH, "| rows:", len(df))
df.head()

Saved: sentinel2_features_clean.csv | rows: 59


Unnamed: 0,Latitude,Longitude,Sample Date,s2_item_id,s2_datetime,s2_cloud,B02,B03,B04,B08,B11,B12,NDVI,NDWI,NDMI
0,-24.670833,28.560833,2015-05-08,S2A_MSIL2A_20150811T081656_R135_T35JPN_2021041...,2015-08-11 08:16:56.027000+00:00,0.0,1.0,0.005554,0.02209,0.038049,0.043642,0.038878,0.265366,-0.745242,-0.068458
1,-25.810483,27.909552,2015-05-08,S2A_MSIL2A_20150811T081656_R135_T35JNM_2021041...,2015-08-11 08:16:56.027000+00:00,0.000123,1.0,1.0,1.0,0.003469,0.014128,0.012044,-0.993086,0.993086,-0.605746
2,-24.282222,28.090278,2015-10-09,S2A_MSIL2A_20150913T080256_R035_T35KPP_2021041...,2015-09-13 08:02:56.029000+00:00,0.000954,0.069428,0.081252,0.09171,0.146683,0.237208,0.155956,0.230601,-0.28706,-0.235809
3,-25.127778,27.628889,2015-08-09,S2A_MSIL2A_20150913T080256_R035_T35JNN_2021041...,2015-09-13 08:02:56.029000+00:00,0.002133,0.080288,0.095675,0.121215,0.177833,0.302792,0.2695,0.18933,-0.300387,-0.259991
4,-25.20639,27.558,2015-08-09,S2A_MSIL2A_20150913T080256_R035_T35JNN_2021041...,2015-09-13 08:02:56.029000+00:00,0.002133,0.08019,0.096103,0.107458,0.160269,0.206189,0.159703,0.19726,-0.250287,-0.125306
