In [None]:
# --- Setup ---
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls "/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025"

results_temp  sample_boxes_shapefile.zip  test_set.csv	train_set.csv


In [52]:
# --- Load data ---
# Replace with your paths or use from google.colab import files; files.upload()
EMBED_PATH = '/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/results_temp/embedding_px_old.csv'
TRAIN_PATH  = '/content/drive/Shareddrives/BioSCape_Landcover_CNN/capstone_fall_2025/train_set.csv'      # your train_set (with Sample_num + label column)

df = pd.read_csv(EMBED_PATH)
train = pd.read_csv(TRAIN_PATH)

In [57]:
print(train)

      Sample_num                FinalClass Sample_num_str
0              2     Unconsolidated Barren              2
1              3     Unconsolidated Barren              3
2              4  Annual Crops (e.g wheat)              4
3              5  Annual Crops (e.g wheat)              5
4              6  Annual Crops (e.g wheat)              6
...          ...                       ...            ...
1936       28460                  Wetlands          28460
1937       28469   Mixed or Not Classified          28469
1938       28480                  Wetlands          28480
1939       28494                  Wetlands          28494
1940       28497   Mixed or Not Classified          28497

[1941 rows x 3 columns]


In [54]:
print(df)

                  system:index       A00       A01       A02       A03  \
0       000000000000000003f0_0  0.179377 -0.008858 -0.079723 -0.206936   
1       000000000000000003f0_1  0.179377 -0.008858 -0.079723 -0.206936   
2       000000000000000003f0_2  0.179377 -0.015748 -0.084214 -0.206936   
3       000000000000000003f0_3  0.179377 -0.027128 -0.084214 -0.199862   
4       000000000000000003f0_4  0.179377 -0.027128 -0.079723 -0.206936   
...                        ...       ...       ...       ...       ...   
73003  000000000000000003da_25 -0.022207 -0.084214  0.119093  0.010396   
73004  000000000000000003da_26 -0.038447 -0.088827  0.130165  0.035433   
73005  000000000000000003da_27 -0.038447 -0.088827  0.130165  0.035433   
73006  000000000000000003da_28 -0.048228 -0.088827  0.124567  0.038447   
73007  000000000000000003da_29 -0.075356 -0.113741  0.130165  0.051734   

            A04       A05       A06       A07       A08  ...  \
0     -0.071111 -0.160000 -0.084214  0.221453  

In [None]:
import pandas as pd
import numpy as np
import re

# Fresh copies (avoid reusing previously filtered dfs)
df = pd.read_csv(EMBED_PATH)
train = pd.read_csv(TRAIN_PATH)

# --- 1) Build a common numeric key: sample_id ---
def make_sample_id(s):
    s = s.astype(str).str.strip()
    # keep digits only; handles "000123", "S-00123", etc. -> 123
    s = s.str.replace(r'\D+', '', regex=True)
    return pd.to_numeric(s, errors='coerce').astype('Int64')

df['sample_id'] = make_sample_id(df['Smpl_nm'])
train['sample_id'] = make_sample_id(train['Sample_num'])

# --- 2) Filter out Mixed / Not Classified (case-insensitive, robust) ---
lab = train['FinalClass'].astype(str).str.strip()
bad_mask = lab.str.lower().str.contains(r'\bmixed\b') | lab.str.lower().str.contains(r'\bnot\s*classified\b')
train_filt = train.loc[~bad_mask, ['sample_id', 'FinalClass']].copy()

# Quick diagnostics (optional but very helpful)
print("Embeddings unique sample_id:", df['sample_id'].nunique(), "rows:", len(df))
print("Train (kept) unique sample_id:", train_filt['sample_id'].nunique(), "rows:", len(train_filt))
inter = np.intersect1d(df['sample_id'].dropna().unique(),
                       train_filt['sample_id'].dropna().unique())
print("ID intersection size:", len(inter))

# --- 3) Merge on numeric key ---
# Changed merge type to 'left' to keep all rows from df
merged = df.merge(train_filt, on='sample_id', how='left').rename(columns={'FinalClass':'Label'})

print("Merged rows:", len(merged), " Unique samples:", merged['sample_id'].nunique())
print(merged['Label'].value_counts())

# The df dataframe still contains all original rows, so this print statement
# might not be what you expect after the merge.
# If you want to see the value counts of the 'Label' column in the merged
# dataframe, use 'merged' instead of 'df'.
print("df rows:", len(df), " Unique samples:", df['sample_id'].nunique())
# The line below caused an error because 'df' does not have a 'Label' column
# print(df['Label'].value_counts())

Embeddings unique sample_id: 2423 rows: 73008
Train (kept) unique sample_id: 1483 rows: 1483
ID intersection size: 20
Merged rows: 73008  Unique samples: 2423
Label
Shrubs                                    232
Annual Crops (e.g wheat)                   86
Wetlands                                   60
Natural Wooded Land                        59
Planted Forest                             31
Built-up                                   30
Natural Grassland                          29
Consolidated Barren (rocks, salt pans)     29
Permanent Crops (e.g., vineyard)           29
Name: count, dtype: int64
df rows: 73008  Unique samples: 2423


In [66]:
# --- Create a new dataframe with merged and filtered data ---

# 1. Normalize Smpl_nm and Sample_num for robust merging
#df['Smpl_nm_str'] = df['Smpl_nm'].astype(str).str.strip()
#train['Sample_num_str'] = train['Sample_num'].astype(str).str.strip()

display(df.head())
display(train.head())

# 2. Filter out "Mixed or Not Classified" from the train data
train_filtered = train[
    ~train['FinalClass'].str.contains('Mixed or Not Classified', case=False, na=False)
].copy()

display(train_filtered.head())

# 3. Merge df with the filtered train data
# Use an inner merge to keep only rows where Smpl_nm matches Sample_num
merged_filtered_df = df.merge(
    train_filtered[['Sample_num', 'FinalClass']],
    left_on='Smpl_nm',
    right_on='Sample_num',
    how='inner'
)
display(merged_filtered_df)

# 4. Drop the temporary merge columns and rename the label column
#merged_filtered_df = merged_filtered_df.drop(columns=['Smpl_nm_str', 'Sample_num_str'])
merged_filtered_df = merged_filtered_df.rename(columns={'FinalClass': 'Label'})

print("Rows in the new merged and filtered dataframe:", len(merged_filtered_df))
print("Unique sample_ids in the new dataframe:", merged_filtered_df['Smpl_nm'].nunique())
print("Label distribution in the new dataframe:")
print(merged_filtered_df['Label'].value_counts())

# Display the first few rows of the new dataframe
display(merged_filtered_df.head())

Unnamed: 0,system:index,A00,A01,A02,A03,A04,A05,A06,A07,A08,...,SALCC_1,SALCC_2,Shap_Ar,Shp_Lng,Smpl_nm,box_nr,instrmn,men_rsl,.geo,Smpl_nm_str
0,000000000000000003f0_0,0.179377,-0.008858,-0.079723,-0.206936,-0.071111,-0.16,-0.084214,0.221453,0.044844,...,Forested Land,Natural Wooded Land,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0
1,000000000000000003f0_1,0.179377,-0.008858,-0.079723,-0.206936,-0.071111,-0.16,-0.084214,0.221453,0.044844,...,Forested Land,Natural Wooded Land,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0
2,000000000000000003f0_2,0.179377,-0.015748,-0.084214,-0.206936,-0.071111,-0.16,-0.084214,0.206936,0.048228,...,Forested Land,Natural Wooded Land,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0
3,000000000000000003f0_3,0.179377,-0.027128,-0.084214,-0.199862,-0.071111,-0.186082,-0.108512,0.206936,0.059116,...,Forested Land,Natural Wooded Land,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0
4,000000000000000003f0_4,0.179377,-0.027128,-0.079723,-0.206936,-0.075356,-0.19291,-0.103406,0.199862,0.062991,...,Forested Land,Natural Wooded Land,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0


Unnamed: 0,Sample_num,FinalClass,Sample_num_str
0,2,Unconsolidated Barren,2
1,3,Unconsolidated Barren,3
2,4,Annual Crops (e.g wheat),4
3,5,Annual Crops (e.g wheat),5
4,6,Annual Crops (e.g wheat),6


Unnamed: 0,Sample_num,FinalClass,Sample_num_str
0,2,Unconsolidated Barren,2
1,3,Unconsolidated Barren,3
2,4,Annual Crops (e.g wheat),4
3,5,Annual Crops (e.g wheat),5
4,6,Annual Crops (e.g wheat),6


Unnamed: 0,system:index,A00,A01,A02,A03,A04,A05,A06,A07,A08,...,Shap_Ar,Shp_Lng,Smpl_nm,box_nr,instrmn,men_rsl,.geo,Smpl_nm_str,Sample_num,FinalClass
0,000000000000000003f0_0,0.179377,-0.008858,-0.079723,-0.206936,-0.071111,-0.160000,-0.084214,0.221453,0.044844,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
1,000000000000000003f0_1,0.179377,-0.008858,-0.079723,-0.206936,-0.071111,-0.160000,-0.084214,0.221453,0.044844,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
2,000000000000000003f0_2,0.179377,-0.015748,-0.084214,-0.206936,-0.071111,-0.160000,-0.084214,0.206936,0.048228,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
3,000000000000000003f0_3,0.179377,-0.027128,-0.084214,-0.199862,-0.071111,-0.186082,-0.108512,0.206936,0.059116,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
4,000000000000000003f0_4,0.179377,-0.027128,-0.079723,-0.206936,-0.075356,-0.192910,-0.103406,0.199862,0.062991,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44686,00000000000000000362_25,-0.017778,-0.130165,0.214133,0.027128,-0.147697,-0.153787,0.055363,0.228897,-0.166336,...,2500.000000,200.0,5741.0,1.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",5741.0,5741,Built-up
44687,00000000000000000362_26,-0.048228,-0.147697,0.199862,0.041584,-0.147697,-0.179377,0.048228,0.228897,-0.153787,...,2500.000000,200.0,5741.0,1.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",5741.0,5741,Built-up
44688,00000000000000000362_27,-0.048228,-0.147697,0.199862,0.041584,-0.147697,-0.179377,0.048228,0.228897,-0.153787,...,2500.000000,200.0,5741.0,1.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",5741.0,5741,Built-up
44689,00000000000000000362_28,-0.088827,-0.179377,0.172795,0.098424,-0.160000,-0.199862,0.032541,0.251965,-0.147697,...,2500.000000,200.0,5741.0,1.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",5741.0,5741,Built-up


Rows in the new merged and filtered dataframe: 44691
Unique sample_ids in the new dataframe: 1483
Label distribution in the new dataframe:
Label
Shrubs                                    8739
Annual Crops (e.g wheat)                  6681
Waterbodies                               5731
Natural Wooded Land                       5284
Natural Grassland                         3934
Permanent Crops (e.g., vineyard)          3466
Planted Forest                            2808
Built-up                                  2473
Wetlands                                  2173
Unconsolidated Barren                     1867
Consolidated Barren (rocks, salt pans)    1535
Name: count, dtype: int64


Unnamed: 0,system:index,A00,A01,A02,A03,A04,A05,A06,A07,A08,...,Shap_Ar,Shp_Lng,Smpl_nm,box_nr,instrmn,men_rsl,.geo,Smpl_nm_str,Sample_num,Label
0,000000000000000003f0_0,0.179377,-0.008858,-0.079723,-0.206936,-0.071111,-0.16,-0.084214,0.221453,0.044844,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
1,000000000000000003f0_1,0.179377,-0.008858,-0.079723,-0.206936,-0.071111,-0.16,-0.084214,0.221453,0.044844,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
2,000000000000000003f0_2,0.179377,-0.015748,-0.084214,-0.206936,-0.071111,-0.16,-0.084214,0.206936,0.048228,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
3,000000000000000003f0_3,0.179377,-0.027128,-0.084214,-0.199862,-0.071111,-0.186082,-0.108512,0.206936,0.059116,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land
4,000000000000000003f0_4,0.179377,-0.027128,-0.079723,-0.206936,-0.075356,-0.19291,-0.103406,0.199862,0.062991,...,2499.999999,200.0,6530.0,10.0,AVIRIS-NG,5 m,"{""geodesic"":false,""type"":""Point"",""coordinates""...",6530.0,6530,Natural Wooded Land


In [None]:
print(merged['Label'])

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
73003    NaN
73004    NaN
73005    NaN
73006    NaN
73007    NaN
Name: Label, Length: 73008, dtype: object


In [None]:
# --- Join on Smpl_nm == Sample_num ---
# Normalize types to string for a robust join
train['Sample_num'] = train['Sample_num'].astype(str)
df['Smpl_nm'] = df['Smpl_nm'].astype(str)

df = df.merge(train[['Sample_num', LABEL_COL]],
              left_on='Smpl_nm', right_on='Sample_num', how='inner')

In [None]:
# --- Filter out Mixed / Not Classified ---
bad = {'Mixed', 'Not Classified'}
df = df[~df[LABEL_COL].isin(bad)].copy()
df.reset_index(drop=True, inplace=True)

print("Rows after join/filter:", len(df))
print("Classes:", df[LABEL_COL].value_counts().to_dict())

Rows after join/filter: 0
Classes: {}


In [None]:
# --- Features / target / groups ---
# Embedding bands: A00..A63 (or however many you exported)
band_cols = [c for c in df.columns if c.startswith('A') and len(c)==3]  # 'A00'..'A63'
if not band_cols:
    # fallback: pick all Axx columns
    band_cols = [c for c in df.columns if c.startswith('A')]

X = df[band_cols].values.astype(np.float32)
y_text = df[LABEL_COL].astype(str).values
groups = df['Smpl_nm'].values  # group by sample box

# Encode labels to integers
le = LabelEncoder()
y = le.fit_transform(y_text)
n_classes = len(le.classes_)
print("Classes mapping:", dict(zip(le.classes_, range(n_classes))))

In [None]:
# --- K-fold setup (Stratified by label, grouped by Smpl_nm) ---
skf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

def build_mlp(input_dim, n_classes):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.BatchNormalization(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(n_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
# OPTIONAL: a 1D-CNN variant (swap build_mlp with this if you want a CNN)
def build_cnn1d(input_dim, n_classes):
    model = models.Sequential([
        layers.Input(shape=(input_dim, 1)),
        layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(n_classes, activation='softmax')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

accs, f1s = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y, groups), 1):
    X_tr, X_va = X[train_idx], X[val_idx]
    y_tr, y_va = y[train_idx], y[val_idx]

    # Standardize per-fold (fit on train only)
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)

    # Choose model: MLP or 1D-CNN
    USE_CNN = False  # set True to try the CNN
    if USE_CNN:
        X_tr_in = X_tr[..., None]
        X_va_in = X_va[..., None]
        model = build_cnn1d(X.shape[1], n_classes)
    else:
        X_tr_in = X_tr
        X_va_in = X_va
        model = build_mlp(X.shape[1], n_classes)

    # Class weights (optional but helpful if imbalanced)
    classes = np.unique(y_tr)
    cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_tr)
    class_weight = {int(k): float(v) for k, v in zip(classes, cw)}

    es = callbacks.EarlyStopping(monitor='val_accuracy', patience=8, restore_best_weights=True)
    rlrop = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-5)

    history = model.fit(
        X_tr_in, y_tr,
        validation_data=(X_va_in, y_va),
        epochs=60,
        batch_size=256,
        verbose=0,
        callbacks=[es, rlrop],
        class_weight=class_weight
    )

    # Evaluate
    y_pred = model.predict(X_va_in, verbose=0).argmax(axis=1)
    acc = accuracy_score(y_va, y_pred)
    f1 = f1_score(y_va, y_pred, average='macro')
    accs.append(acc); f1s.append(f1)

    print(f"[Fold {fold}]  acc={acc:.4f}  macroF1={f1:.4f}")
    print(classification_report(y_va, y_pred, target_names=le.classes_))

print("==== CV Summary ====")
print(f"Mean Acc: {np.mean(accs):.4f} ± {np.std(accs):.4f}")
print(f"Mean F1 : {np.mean(f1s):.4f} ± {np.std(f1s):.4f}")


In [None]:
# --- Save final model trained on full data (optional) ---
# Refit scaler on all data, train for a few epochs:
scaler_full = StandardScaler().fit(X)
X_full = scaler_full.transform(X)
if USE_CNN:
    model_full = build_cnn1d(X.shape[1], n_classes)
    X_full_in = X_full[..., None]
else:
    model_full = build_mlp(X.shape[1], n_classes)
    X_full_in = X_full

model_full.fit(X_full_in, y, epochs=20, batch_size=256, verbose=0, class_weight=class_weight)
model_full.save('/content/embedding_mlp.h5')
import joblib, os
joblib.dump({'scaler': scaler_full, 'label_encoder': le}, '/content/embedding_mlp_assets.pkl')
print("Saved /content/embedding_mlp.h5 and /content/embedding_mlp_assets.pkl")