In [1]:
# --- Setup ---
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!ls "/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025"

results_temp  sample_boxes_shapefile.zip  test_set.csv	train_set.csv


In [4]:
# --- Load data ---
# Replace with your paths or use from google.colab import files; files.upload()
EMBED_PATH = '/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/results_temp/embedding_px_old.csv'
TRAIN_PATH  = '/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/train_set.csv'      # your train_set (with Sample_num + label column)
TEST_PATH = '/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/test_set.csv'

df = pd.read_csv(EMBED_PATH)
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [5]:
print(train)

      Sample_num                FinalClass
0              2     Unconsolidated Barren
1              3     Unconsolidated Barren
2              4  Annual Crops (e.g wheat)
3              5  Annual Crops (e.g wheat)
4              6  Annual Crops (e.g wheat)
...          ...                       ...
1936       28460                  Wetlands
1937       28469   Mixed or Not Classified
1938       28480                  Wetlands
1939       28494                  Wetlands
1940       28497   Mixed or Not Classified

[1941 rows x 2 columns]


In [6]:
print(test)

     Sample_num                FinalClass
0             1     Unconsolidated Barren
1             7  Annual Crops (e.g wheat)
2             8  Annual Crops (e.g wheat)
3            12  Annual Crops (e.g wheat)
4            17  Annual Crops (e.g wheat)
..          ...                       ...
476       28310                  Wetlands
477       28374                  Wetlands
478       28413                  Wetlands
479       28496                  Wetlands
480       28499       Natural Wooded Land

[481 rows x 2 columns]


In [7]:
print(df)

                  system:index       A00       A01       A02       A03  \
0       000000000000000003f0_0  0.179377 -0.008858 -0.079723 -0.206936   
1       000000000000000003f0_1  0.179377 -0.008858 -0.079723 -0.206936   
2       000000000000000003f0_2  0.179377 -0.015748 -0.084214 -0.206936   
3       000000000000000003f0_3  0.179377 -0.027128 -0.084214 -0.199862   
4       000000000000000003f0_4  0.179377 -0.027128 -0.079723 -0.206936   
...                        ...       ...       ...       ...       ...   
73003  000000000000000003da_25 -0.022207 -0.084214  0.119093  0.010396   
73004  000000000000000003da_26 -0.038447 -0.088827  0.130165  0.035433   
73005  000000000000000003da_27 -0.038447 -0.088827  0.130165  0.035433   
73006  000000000000000003da_28 -0.048228 -0.088827  0.124567  0.038447   
73007  000000000000000003da_29 -0.075356 -0.113741  0.130165  0.051734   

            A04       A05       A06       A07       A08  ...  \
0     -0.071111 -0.160000 -0.084214  0.221453  

In [None]:
import pandas as pd

# Preserve first 65 fields (A0 - A64) & Smpl_nm
subset = pd.concat([df.iloc[:, :65], df.iloc[:, [72]]], axis=1)

# Rename 'smpl_num' to 'sample_num' if it exists
subset = subset.rename(columns={'Smpl_nm': 'Sample_num'})

# Save the updated CSV
subset.to_csv('embeddings.csv', index=False)

embeddings_df = subset
embeddings_df.to_csv('/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/results_temp/embeddings_concat.csv', index=False)

In [None]:
# Merge labels with training data
merged_df = pd.merge(embeddings_df, train, on='Sample_num', how='left')
filtered_df = merged_df[
    (merged_df['FinalClass'].notna()) &
    (merged_df['FinalClass'] != 'Mixed or Not Classified')
]

# Preview the result
print(filtered_df.head())

# Optionally save to CSV
filtered_df.to_csv('/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/results_temp/trainingdata_labeled.csv', index=False)

             system:index       A00       A01       A02       A03       A04  \
0  000000000000000003f0_0  0.179377 -0.008858 -0.079723 -0.206936 -0.071111   
1  000000000000000003f0_1  0.179377 -0.008858 -0.079723 -0.206936 -0.071111   
2  000000000000000003f0_2  0.179377 -0.015748 -0.084214 -0.206936 -0.071111   
3  000000000000000003f0_3  0.179377 -0.027128 -0.084214 -0.199862 -0.071111   
4  000000000000000003f0_4  0.179377 -0.027128 -0.079723 -0.206936 -0.075356   

        A05       A06       A07       A08  ...       A56       A57       A58  \
0 -0.160000 -0.084214  0.221453  0.044844  ... -0.124567  0.032541 -0.135886   
1 -0.160000 -0.084214  0.221453  0.044844  ... -0.124567  0.032541 -0.135886   
2 -0.160000 -0.084214  0.206936  0.048228  ... -0.135886  0.022207 -0.135886   
3 -0.186082 -0.108512  0.206936  0.059116  ... -0.130165  0.004983 -0.135886   
4 -0.192910 -0.103406  0.199862  0.062991  ... -0.130165  0.000984 -0.130165   

        A59       A60       A61       A62   

In [None]:
# Merge labels with testing data
merged_df_test = pd.merge(embeddings_df, test, on='Sample_num', how='left')
filtered_df_test = merged_df_test[
    (merged_df_test['FinalClass'].notna()) &
    (merged_df_test['FinalClass'] != 'Mixed or Not Classified')
]

# Preview the result
print(filtered_df_test.head())

# Optionally save to CSV
filtered_df_test.to_csv('/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/results_temp/testdata_labeled.csv', index=False)

               system:index       A00       A01       A02       A03       A04  \
150  000000000000000003b6_0  0.160000 -0.004983 -0.012057 -0.199862 -0.088827   
151  000000000000000003b6_1  0.166336  0.007443 -0.015748 -0.186082 -0.071111   
152  000000000000000003b6_2  0.166336  0.007443 -0.015748 -0.186082 -0.071111   
153  000000000000000003b6_3  0.160000  0.004983 -0.017778 -0.179377 -0.071111   
154  000000000000000003b6_4  0.153787  0.003014 -0.035433 -0.172795 -0.059116   

          A05       A06       A07       A08  ...       A56       A57  \
150 -0.199862 -0.066990  0.259900  0.103406  ... -0.038447 -0.027128   
151 -0.186082 -0.075356  0.251965  0.124567  ... -0.044844 -0.024606   
152 -0.186082 -0.075356  0.251965  0.124567  ... -0.044844 -0.024606   
153 -0.192910 -0.075356  0.259900  0.130165  ... -0.041584 -0.027128   
154 -0.206936 -0.093564  0.259900  0.130165  ... -0.035433 -0.027128   

          A58       A59       A60       A61       A62       A63  Sample_num  \
1

In [8]:

# --- your existing prep ---
# Preserve first 65 fields (A0 - A64) & Smpl_nm
subset = pd.concat([df.iloc[:, :65], df.iloc[:, [72]]], axis=1)

# Rename 'smpl_num' to 'sample_num' if it exists
subset = subset.rename(columns={'Smpl_nm': 'Sample_num'})

# Save the updated CSV
subset.to_csv('embeddings.csv', index=False)

embeddings_df = subset
# Merge labels with training data
merged_df = pd.merge(embeddings_df, train, on='Sample_num', how='left')
filtered_df = merged_df[
    (merged_df['FinalClass'].notna()) &
    (merged_df['FinalClass'] != 'Mixed or Not Classified')
].copy()

In [28]:
import re
import numpy as np
import pandas as pd

from sklearn.covariance import LedoitWolf
from sklearn.decomposition import PCA

# ---------------- CONFIG ----------------
ENABLE_PCA   = True    # set False to skip PCA
N_COMPONENTS = 8       # k << ~30 pixels per shape
ROBUST_Z     = 7     # robust “sigma” threshold for outliers in each shape
# ----------------------------------------

df_in = filtered_df.copy()

# Robust grouping key
df_in["Sample_num"] = df_in["Sample_num"].astype(str).str.replace(r"\.0$", "", regex=True)

# Identify embedding columns A00..A63
EMB_COLS = sorted([c for c in df_in.columns if re.fullmatch(r"A\d{2}", c)])
assert len(EMB_COLS) == 64, f"Expected 64 embedding dims, found {len(EMB_COLS)}"

# Matrix of all embeddings
X_all = df_in[EMB_COLS].to_numpy(dtype=float)

# Optional PCA for stability and noise reduction
if ENABLE_PCA:
    pca = PCA(n_components=N_COMPONENTS, svd_solver="auto", random_state=0)
    Z_all = pca.fit_transform(X_all)  # (N, k)
    k = Z_all.shape[1]
else:
    Z_all = X_all
    k = Z_all.shape[1]

# Attach Z to the frame to avoid re-transforming later
Z_cols = [f"Z{str(i).zfill(2)}" for i in range(k)]
dfZ = pd.DataFrame(Z_all, index=df_in.index, columns=Z_cols)
df_in = pd.concat([df_in, dfZ], axis=1)

# ---- Per-class covariance (LedoitWolf) in Z-space ----
precisions_class = {}
for cls, sub in df_in.groupby("FinalClass"):
    Z_sub = sub[Z_cols].to_numpy()
    # Guard against tiny classes
    if Z_sub.shape[0] < k + 2:
        # fall back on global later
        continue
    lw = LedoitWolf().fit(Z_sub)
    precisions_class[cls] = lw.precision_

# Global fallback covariance if a class is too small
lw_global = LedoitWolf().fit(Z_all)
Prec_global = lw_global.precision_


def mahal_d2_to_group_mean_Z(g: pd.DataFrame) -> pd.DataFrame:
    """
    For one shape (one Sample_num), compute squared Mahalanobis distances in Z-space
    to the shape's mean, using the per-class precision matrix when available.
    Drop only those pixels that are robust outliers within that shape.
    """
    # If the group is very small, don't try to be clever: keep everything
    if g.shape[0] < 5:
        out = g.copy()
        out["mahalanobis_sq"] = 0.0
        return out

    Zg = g[Z_cols].to_numpy()
    mu = Zg.mean(axis=0, keepdims=True)      # (1, k)
    Zc = Zg - mu                             # (n, k)

    cls = g["FinalClass"].iloc[0]
    Prec = precisions_class.get(cls, Prec_global)

    # d^2 = (z - μ)^T Prec (z - μ)
    d2 = np.einsum("ij,jk,ik->i", Zc, Prec, Zc)

    # Robust shape-level threshold: median + ROBUST_Z * MAD
    med = np.median(d2)
    mad = np.median(np.abs(d2 - med))
    # If MAD is zero (all nearly identical), don't drop anything
    if mad == 0 or not np.isfinite(mad):
        keep = np.ones_like(d2, dtype=bool)
    else:
        z_robust = np.abs(d2 - med) / (mad + 1e-12)
        keep = z_robust <= ROBUST_Z

    out = g.loc[keep].copy()
    out["mahalanobis_sq"] = d2[keep]
    return out


# ---- Apply per shape ----
clean_df = (
    df_in
    .groupby("Sample_num", group_keys=False)
    .apply(lambda g: mahal_d2_to_group_mean_Z(g).assign(Sample_num=g.name))
    .reset_index(drop=True)
)

# ---- QA summary (before vs after) ----
before = (
    df_in
    .groupby(["Sample_num", "FinalClass"])
    .size()
    .rename("n_before")
    .to_frame()
)

after = (
    clean_df
    .groupby(["Sample_num", "FinalClass"])
    .size()
    .rename("n_after")
    .to_frame()
)

qa_summary = (
    before.join(after, how="left")
    .fillna({"n_after": 0})
    .astype({"n_before": int, "n_after": int})
    .assign(
        dropped_pixels=lambda x: x["n_before"] - x["n_after"],
        kept_pixels=lambda x: x["n_after"],
        kept_ratio=lambda x: x["n_after"] / x["n_before"]
    )
    .reset_index()
)

# Per-shape mean after filtering (for downstream classification)
per_shape_mean = (
    clean_df
    .groupby(["Sample_num", "FinalClass"], as_index=False)[EMB_COLS]
    .mean()
)

# ---- Some quick prints ----
print(qa_summary.head())
total_before = qa_summary["n_before"].sum()
total_after  = qa_summary["n_after"].sum()
total_dropped = total_before - total_after

print("Total dropped pixels:", total_dropped)
num_boxes_2plus = (qa_summary["dropped_pixels"] >= 2).sum()
print("Number of boxes with 2 or more dropped pixels:", num_boxes_2plus)
num_boxes_3plus = (qa_summary["dropped_pixels"] >= 3).sum()
print("Number of boxes with 3 or more dropped pixels:", num_boxes_3plus)
num_boxes_4plus = (qa_summary["dropped_pixels"] >= 4).sum()
print("Number of boxes with 4 or more dropped pixels:", num_boxes_4plus)
num_boxes_5plus = (qa_summary["dropped_pixels"] >= 5).sum()
print("Number of boxes with 5 or more dropped pixels:", num_boxes_5plus)
boxes_5plus = qa_summary[qa_summary["dropped_pixels"] >= 5]
print(boxes_5plus)
print(f"Total before filtering: {total_before}")
print(f"Total after filtering:  {total_after}")
print(f"Total dropped pixels:   {total_dropped}")


  Sample_num                FinalClass  n_before  n_after  dropped_pixels  \
0         10  Annual Crops (e.g wheat)        30       30               0   
1      10046               Waterbodies        31       31               0   
2      10104               Waterbodies        30       29               1   
3      10163               Waterbodies        31       31               0   
4      10171               Waterbodies        30       30               0   

   kept_pixels  kept_ratio  
0           30    1.000000  
1           31    1.000000  
2           29    0.966667  
3           31    1.000000  
4           30    1.000000  
Total dropped pixels: 1065
Number of boxes with 2 or more dropped pixels: 285
Number of boxes with 3 or more dropped pixels: 136
Number of boxes with 4 or more dropped pixels: 61
Number of boxes with 5 or more dropped pixels: 20
     Sample_num                              FinalClass  n_before  n_after  \
65        10701                             Waterbodies 

  .apply(lambda g: mahal_d2_to_group_mean_Z(g).assign(Sample_num=g.name))


In [None]:
print(filtered_df)

                  system:index       A00       A01       A02       A03  \
0       000000000000000003f0_0  0.179377 -0.008858 -0.079723 -0.206936   
1       000000000000000003f0_1  0.179377 -0.008858 -0.079723 -0.206936   
2       000000000000000003f0_2  0.179377 -0.015748 -0.084214 -0.206936   
3       000000000000000003f0_3  0.179377 -0.027128 -0.084214 -0.199862   
4       000000000000000003f0_4  0.179377 -0.027128 -0.079723 -0.206936   
...                        ...       ...       ...       ...       ...   
72942  00000000000000000362_25 -0.017778 -0.130165  0.214133  0.027128   
72943  00000000000000000362_26 -0.048228 -0.147697  0.199862  0.041584   
72944  00000000000000000362_27 -0.048228 -0.147697  0.199862  0.041584   
72945  00000000000000000362_28 -0.088827 -0.179377  0.172795  0.098424   
72946  00000000000000000362_29 -0.108512 -0.179377  0.160000  0.103406   

            A04       A05       A06       A07       A08  ...       A56  \
0     -0.071111 -0.160000 -0.084214  

In [None]:
value_counts = filtered_df['Sample_num'].value_counts()

# Display the result
print(value_counts)

Sample_num
5524.0    34
4469.0    33
7301.0    33
4946.0    33
6746.0    33
          ..
4.0       27
33.0      27
7436.0    27
5493.0    27
46.0      27
Name: count, Length: 1483, dtype: int64


In [None]:
# --- (Training) Features / target / groups ---
band_cols = [c for c in filtered_df.columns if c.startswith('A') and len(c)==3]  # 'A00'..'A63'
if not band_cols:
    # fallback: select all A(XX) columns
    band_cols = [c for c in filtered_df.columns if c.startswith('A')]

X = filtered_df[band_cols].values.astype(np.float32)
y_text = filtered_df['FinalClass'].astype(str).values
groups = filtered_df['Sample_num'].values  # group by sample box

# Encode labels to integers
le = LabelEncoder()
y = le.fit_transform(y_text)
n_classes = len(le.classes_)
print("Classes mapping:", dict(zip(le.classes_, range(n_classes))))

Classes mapping: {'Annual Crops (e.g wheat)': 0, 'Built-up': 1, 'Consolidated Barren (rocks, salt pans)': 2, 'Natural Grassland': 3, 'Natural Wooded Land': 4, 'Permanent Crops (e.g., vineyard)': 5, 'Planted Forest': 6, 'Shrubs': 7, 'Unconsolidated Barren': 8, 'Waterbodies': 9, 'Wetlands': 10}


In [None]:
# --- (Testing) Features / targets / groups ---
band_cols_test = [c for c in filtered_df_test if c.startswith('A') and len(c)==3] # 'A00'..'A63'
if not band_cols_test:
  # fallback: select all A(XX) columns
  band_cols_test = [c for c in filtered_df_test.columns if c.startswith('A')]

X_test = filtered_df_test[band_cols_test].values.astype(np.float32)
y_text_test = filtered_df_test['FinalClass'].astype(str).values
groups_test = filtered_df_test['Sample_num'].values  # group by sample box

# Encode labels to integers
le = LabelEncoder()
y_test = le.fit_transform(y_text_test)
n_classes = len(le.classes_)
print("Classes mapping:", dict(zip(le.classes_, range(n_classes))))

Classes mapping: {'Annual Crops (e.g wheat)': 0, 'Built-up': 1, 'Consolidated Barren (rocks, salt pans)': 2, 'Natural Grassland': 3, 'Natural Wooded Land': 4, 'Permanent Crops (e.g., vineyard)': 5, 'Planted Forest': 6, 'Shrubs': 7, 'Unconsolidated Barren': 8, 'Waterbodies': 9, 'Wetlands': 10}


In [None]:
# --- K-fold setup (Stratified by label, grouped by Smpl_nm) ---
skf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

def build_mlp(input_dim):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.BatchNormalization(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
    ])
    return model

def build_cnn1d(input_dim, n_classes):
    model = models.Sequential([
        layers.Input(shape=(input_dim, 1)),
        layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(n_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
# OPTIONAL: a 1D-CNN variant (swap build_mlp with this if you want a CNN)
def build_cnn1d(input_dim, n_classes):
    model = models.Sequential([
        layers.Input(shape=(input_dim, 1)),
        layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(n_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

from sklearn.svm import SVC

# Fit training data
accs, f1s = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y, groups), 1):
    X_tr, X_va = X[train_idx], X[val_idx]
    y_tr, y_va = y[train_idx], y[val_idx]

    # Standardize per-fold (fit on train only)
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)

    # Build the MLP model for feature extraction
    feature_extractor = build_mlp(X.shape[1])

    # Extract features
    X_tr_features = feature_extractor.predict(X_tr)
    X_va_features = feature_extractor.predict(X_va)

    # Train SVM on extracted features
    svm_model = SVC(probability=True) # Set probability=True for class weight
    # Class weights (optional but helpful if imbalanced)
    classes = np.unique(y_tr)
    cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_tr)
    class_weight = {int(k): float(v) for k, v in zip(classes, cw)}
    svm_model.fit(X_tr_features, y_tr, sample_weight=compute_class_weight(class_weight='balanced', classes=np.unique(y_tr), y=y_tr)[y_tr])


    # Evaluate SVM
    y_pred = svm_model.predict(X_va_features)
    acc = accuracy_score(y_va, y_pred)
    f1 = f1_score(y_va, y_pred, average='macro')
    accs.append(acc); f1s.append(f1)

    print(f"[Fold {fold}]  acc={acc:.4f}  macroF1={f1:.4f}")
    print(classification_report(y_va, y_pred, target_names=le.classes_))

print("==== CV Summary ====")
print(f"Mean Acc: {np.mean(accs):.4f} ± {np.std(accs):.4f}")
print(f"Mean F1 : {np.mean(f1s):.4f} ± {np.std(f1s):.4f}")

[1m1118/1118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[Fold 1]  acc=0.7201  macroF1=0.7210
                                        precision    recall  f1-score   support

              Annual Crops (e.g wheat)       0.83      0.82      0.83      1228
                              Built-up       0.93      1.00      0.97       423
Consolidated Barren (rocks, salt pans)       0.80      0.76      0.78       302
                     Natural Grassland       0.27      0.47      0.34       601
                   Natural Wooded Land       0.77      0.61      0.68      1238
      Permanent Crops (e.g., vineyard)       0.83      0.69      0.75       629
                        Planted Forest       0.64      0.72      0.68       603
                                Shrubs       0.61      0.57      0.59      1562
                 Unconsolidated Barren       0.87      0.82      0.84       326
          

In [None]:
# Fit testing data
accs, f1s = [], []

for fold, (test_idx, val_idx) in enumerate(skf.split(X_test, y_test, groups_test), 1):
    X_ts, X_va = X_test[test_idx], X_test[val_idx]
    y, y_va = y_test[test_idx], y_test[val_idx]

    # Standardize per-fold (fit on train only)
    scaler = StandardScaler()
    X_ts = scaler.fit_transform(X_ts)
    X_va = scaler.transform(X_va)

    # Build the MLP model for feature extraction
    feature_extractor = build_mlp(X_test.shape[1])

    # Extract features
    X_ts_features = feature_extractor.predict(X_ts)
    X_va_features = feature_extractor.predict(X_va)

    # Train SVM on extracted features
    svm_model = SVC(probability=True) # Set probability=True for class weight
    # Class weights (optional but helpful if imbalanced)
    classes = np.unique(y)
    cw = compute_class_weight(class_weight='balanced', classes=classes, y=y)
    class_weight = {int(k): float(v) for k, v in zip(classes, cw)}
    svm_model.fit(X_ts_features, y, sample_weight=compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)[y])


    # Evaluate SVM
    y_pred = svm_model.predict(X_va_features)
    acc = accuracy_score(y_va, y_pred)
    f1 = f1_score(y_va, y_pred, average='macro')
    accs.append(acc); f1s.append(f1)

    print(f"[Fold {fold}]  acc={acc:.4f}  macroF1={f1:.4f}")
    print(classification_report(y_va, y_pred, target_names=le.classes_))

print("==== CV Summary ====")
print(f"Mean Acc: {np.mean(accs):.4f} ± {np.std(accs):.4f}")
print(f"Mean F1 : {np.mean(f1s):.4f} ± {np.std(f1s):.4f}")

[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[Fold 1]  acc=0.8257  macroF1=0.7130
                                        precision    recall  f1-score   support

              Annual Crops (e.g wheat)       0.99      0.82      0.90       274
                              Built-up       0.99      1.00      0.99       180
Consolidated Barren (rocks, salt pans)       0.00      0.00      0.00        30
                     Natural Grassland       0.36      0.28      0.31       121
                   Natural Wooded Land       0.51      0.91      0.65       151
      Permanent Crops (e.g., vineyard)       1.00      0.87      0.93       302
                        Planted Forest       0.85      0.46      0.60       152
                                Shrubs       0.71      0.86      0.77       327
                 Unconsolidated Barren       1.00      1.00      1.00       121
              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
[Fold 2]  acc=0.6292  macroF1=0.6258
                                        precision    recall  f1-score   support

              Annual Crops (e.g wheat)       0.79      0.74      0.76       240
                              Built-up       1.00      0.52      0.69        92
Consolidated Barren (rocks, salt pans)       1.00      0.52      0.68       121
                     Natural Grassland       0.04      0.02      0.02       332
                   Natural Wooded Land       0.44      0.70      0.54       210
      Permanent Crops (e.g., vineyard)       0.59      0.99      0.74       151
                        Planted Forest       0.61      0.30      0.41       151
                                Shrubs       0.56      0.80      0.66       479
                 Unconsolidated Barren       0.95      1.00      0.98       120
             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
[Fold 5]  acc=0.7025  macroF1=0.6795
                                        precision    recall  f1-score   support

              Annual Crops (e.g wheat)       1.00      0.85      0.92       390
                              Built-up       1.00      1.00      1.00       155
Consolidated Barren (rocks, salt pans)       1.00      0.44      0.61       120
                     Natural Grassland       0.07      0.30      0.12        60
                   Natural Wooded Land       0.73      0.45      0.55       304
      Permanent Crops (e.g., vineyard)       1.00      0.75      0.86       119
                        Planted Forest       0.58      0.70      0.63       151
                                Shrubs       0.66      0.73      0.70       572
                 Unconsolidated Barren       1.00      0.67      0.80        90
             

In [None]:
# --- Save final model trained on full data (optional) ---
# Refit scaler on all data, train for a few epochs:
scaler_full = StandardScaler().fit(X)
X_full = scaler_full.transform(X)
if USE_CNN:
    model_full = build_cnn1d(X.shape[1], n_classes)
    X_full_in = X_full[..., None]
else:
    model_full = build_mlp(X.shape[1], n_classes)
    X_full_in = X_full

model_full.fit(X_full_in, y, epochs=20, batch_size=256, verbose=0, class_weight=class_weight)
model_full.save('/content/embedding_mlp.h5')
import joblib, os
joblib.dump({'scaler': scaler_full, 'label_encoder': le}, '/content/embedding_mlp_assets.pkl')
print("Saved /content/embedding_mlp.h5 and /content/embedding_mlp_assets.pkl")



Saved /content/embedding_mlp.h5 and /content/embedding_mlp_assets.pkl
