In [None]:
import numpy as np
import pandas as pd
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc
import pickle

# Define the buffer size in meters
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
orig = pd.read_csv("../../data/RainySeason.csv")
river_100 = pd.read_csv("../data/Samples_100.csv")
river_100.drop(columns="Source", inplace=True)

drop_cols = ['Stations', 'River', 'Lat', 'Long', 'geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# --- IMPUTATION FIX: Fill NaN values with 0 before further processing ---
orig.fillna(0, inplace=True)
river_100.fillna(0, inplace=True)

# Train-test split
np.random.seed(42)
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")

print(f"Using {len(raster_paths)} raster layers for CNN input.")
for r in raster_paths:
    print("  -", os.path.basename(r))

# ==================== 3. Create a Custom Data Generator ==================== #
def extract_patch_for_generator(coords, raster_files, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height):
    """
    Extracts a batch of patches from rasters for a given set of coordinates.
    This function is optimized to be called by the data generator for each batch.
    """
    patches = []
    # Loop through each coordinate pair in the batch
    for lon, lat in coords:
        channels = []
        # Loop through each raster file to get a single patch for each raster
        for rfile in raster_files:
            with rasterio.open(rfile) as src:
                try:
                    row, col = src.index(lon, lat)
                    win = Window(col - buffer_pixels_x, row - buffer_pixels_y, patch_width, patch_height)
                    arr = src.read(1, window=win, boundless=True, fill_value=0)
                    arr = arr.astype(np.float32)

                    # --- NORMALIZATION FIX: Add a small epsilon to avoid division by zero ---
                    max_val = np.nanmax(arr)
                    if max_val != 0:
                        arr /= max_val + 1e-8 # Add epsilon for stability
                except Exception as e:
                    print(f"Error processing {rfile} for coordinates ({lon}, {lat}): {e}")
                    arr = np.zeros((patch_width, patch_height), dtype=np.float32)
            channels.append(arr)
        patches.append(np.stack(channels, axis=-1))
    
    return np.array(patches)

class DataGenerator(Sequence):
    def __init__(self, coords, mlp_data, gnn_data, y, raster_paths, batch_size=4, shuffle=True, buffer_meters=BUFFER_METERS, **kwargs):
        super().__init__(**kwargs)
        self.coords = coords
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.raster_paths = raster_paths
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.buffer_meters = buffer_meters

        # Pre-calculate patch size from the first raster
        with rasterio.open(raster_paths[0]) as src:
            res_x, res_y = src.res
            self.buffer_pixels_x = int(self.buffer_meters / res_x)
            self.buffer_pixels_y = int(self.buffer_meters / res_y)
            self.patch_width = 2 * self.buffer_pixels_x
            self.patch_height = 2 * self.buffer_pixels_y

        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_coords = self.coords[batch_indices]
        batch_mlp = self.mlp_data[batch_indices]
        
        # Slice the GNN adjacency matrix for the current batch
        batch_gnn = self.gnn_data[batch_indices, :]

        batch_y = self.y[batch_indices]

        # Extract CNN patches for the current batch
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            self.raster_paths,
            self.buffer_pixels_x,
            self.buffer_pixels_y,
            self.patch_width,
            self.patch_height
        )

        # Return a tuple of inputs and the target, which Keras expects
        return (batch_cnn, batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long', 'Lat']].values
coords_test = test_orig[['Long', 'Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
# --- IMPUTATION FIX: Fill NaN in raw MLP data before scaling ---
train_combined.fillna(0, inplace=True)
test_orig.fillna(0, inplace=True)
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define Enhanced CNN–GNN–MLP Model ==================== #
def build_fusion_model(patch_shape, gnn_dim, mlp_dim):
    # CNN branch (for raster data)
    cnn_input = Input(shape=patch_shape, name="cnn_input")
    x = Conv2D(32, (3,3), activation="relu")(cnn_input)
    x = MaxPooling2D((2,2))(x)
    x = Conv2D(64, (3,3), activation="relu")(x)
    x = MaxPooling2D((2,2))(x)
    x = Flatten()(x)
    cnn_out = Dense(128, activation="relu", name="cnn_out")(x)

    # MLP branch (for numerical site features)
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    m = Dense(64, activation="relu")(mlp_input)
    mlp_out = Dense(32, activation="relu", name="mlp_out")(m)

    # GNN branch (for spatial connectivity)
    # The GNN input dimension is now the number of training samples
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    g = Dense(64, activation="relu")(gnn_input)
    gnn_out = Dense(32, activation="relu", name="gnn_out")(g)

    # Fusion Layer
    combined = Concatenate()([cnn_out, mlp_out, gnn_out])
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    model = Model(inputs=[cnn_input, mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

# We need to determine the final GNN input dimension for the model
# It's the total number of training samples
batch_size = 4
gnn_input_dim = len(coords_train)

# Helper function to get CNN patch shape from rasters
def get_cnn_patch_shape(raster_paths, buffer_meters):
    with rasterio.open(raster_paths[0]) as src:
        res_x, _ = src.res
        buffer_pixels = int(buffer_meters / res_x)
        return (2 * buffer_pixels, 2 * buffer_pixels, len(raster_paths))

cnn_patch_shape = get_cnn_patch_shape(raster_paths, BUFFER_METERS)
model = build_fusion_model(cnn_patch_shape, gnn_input_dim, mlp_train.shape[1])
model.summary()

# ==================== 6. Create Data Generators ==================== #
# We create a separate generator for the validation data.
train_generator = DataGenerator(
    coords=coords_train,
    mlp_data=mlp_train,
    gnn_data=gnn_train,
    y=y_train,
    raster_paths=raster_paths,
    batch_size=batch_size,
    shuffle=True,
    buffer_meters=BUFFER_METERS
)

# Function to evaluate the model on the test set
def evaluate_model(model, coords_test, mlp_test, gnn_test_matrix, y_test, raster_paths, buffer_meters=BUFFER_METERS, batch_size=4, return_preds=False):
    num_samples = len(y_test)
    y_pred_list = []
    
    with rasterio.open(raster_paths[0]) as src:
        res_x, res_y = src.res
        buffer_pixels_x = int(buffer_meters / res_x)
        buffer_pixels_y = int(buffer_meters / res_y)
        patch_width = 2 * buffer_pixels_x
        patch_height = 2 * buffer_pixels_y

    for i in range(0, num_samples, batch_size):
        batch_coords = coords_test[i:i+batch_size]
        batch_mlp = mlp_test[i:i+batch_size]
        
        batch_gnn = gnn_test_matrix[i:i+batch_size, :]
        batch_y = y_test[i:i+batch_size]

        batch_cnn = extract_patch_for_generator(
            batch_coords,
            raster_paths,
            buffer_pixels_x,
            buffer_pixels_y,
            patch_width,
            patch_height
        )
        
        y_pred_list.append(model.predict((batch_cnn, batch_mlp, batch_gnn), verbose=0).flatten())
    
    y_pred = np.concatenate(y_pred_list)
    
    if return_preds:
        return y_pred
    else:
        # --- NaN FIX: Ensure y_pred has no NaNs before calculating metrics ---
        y_pred[np.isnan(y_pred)] = 0
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        return r2, rmse


# ==================== 7. Train Model ==================== #
print("\n" + "="*80)
print(f"Analyzing with CNN–GNN–MLP Model ({BUFFER_METERS}m)")
print("="*80)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=train_generator # Using the same generator for validation for this example
)


# ==================== 8. Evaluate ==================== #
# Re-create a data generator without shuffling for evaluation on the training set
train_eval_generator = DataGenerator(
    coords=coords_train,
    mlp_data=mlp_train,
    gnn_data=gnn_train,
    y=y_train,
    raster_paths=raster_paths,
    batch_size=batch_size,
    shuffle=False,
    buffer_meters=BUFFER_METERS
)

y_pred_train = model.predict(train_eval_generator, verbose=0).flatten()
# --- NaN FIX: Ensure y_pred has no NaNs before calculating metrics ---
y_pred_train[np.isnan(y_pred_train)] = 0
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))

r2_test, rmse_test = evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size)

print(f"\n✅ CNN–GNN–MLP Model Performance ({BUFFER_METERS}m):")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f}")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f}")

# ==================== 9. Feature Importance Analysis ==================== #
print("\n" + "-"*50)
print(f"Feature Importance Analysis for {BUFFER_METERS}m")
print("-"*50)

# --- 9.1 Combined Feature Importance (by Model Branch) ---
y_pred_baseline = evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, return_preds=True)
y_pred_baseline[np.isnan(y_pred_baseline)] = 0
baseline_r2 = r2_score(y_test, y_pred_baseline)
print(f"\nBaseline Performance on Test Set: R² = {baseline_r2:.4f}")

# Ablate CNN branch
with rasterio.open(raster_paths[0]) as src:
    res_x, res_y = src.res
    buffer_pixels_x = int(BUFFER_METERS / res_x)
    buffer_pixels_y = int(BUFFER_METERS / res_y)
    patch_width = 2 * buffer_pixels_x
    patch_height = 2 * buffer_pixels_y

cnn_test_ablated = np.zeros_like(extract_patch_for_generator(
    coords_test, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
))
y_pred_cnn_ablated = model.predict((cnn_test_ablated, mlp_test, gnn_test), verbose=0).flatten()
y_pred_cnn_ablated[np.isnan(y_pred_cnn_ablated)] = 0
r2_cnn_ablated = r2_score(y_test, y_pred_cnn_ablated)
importance_cnn = baseline_r2 - r2_cnn_ablated

# Ablate MLP branch
mlp_test_ablated = np.zeros_like(mlp_test)
y_pred_mlp_ablated = model.predict((extract_patch_for_generator(
    coords_test, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
), mlp_test_ablated, gnn_test), verbose=0).flatten()
y_pred_mlp_ablated[np.isnan(y_pred_mlp_ablated)] = 0
r2_mlp_ablated = r2_score(y_test, y_pred_mlp_ablated)
importance_mlp = baseline_r2 - r2_mlp_ablated

# Ablate GNN branch
gnn_test_ablated = np.zeros_like(gnn_test)
y_pred_gnn_ablated = model.predict((extract_patch_for_generator(
    coords_test, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
), mlp_test, gnn_test_ablated), verbose=0).flatten()
y_pred_gnn_ablated[np.isnan(y_pred_gnn_ablated)] = 0
r2_gnn_ablated = r2_score(y_test, y_pred_gnn_ablated)
importance_gnn = baseline_r2 - r2_gnn_ablated

print("\n--- Combined Feature Importance (by Model Branch) ---")
print(f"CNN Branch Importance (R² drop): {importance_cnn:.4f}")
print(f"MLP Branch Importance (R² drop): {importance_mlp:.4f}")
print(f"GNN Branch Importance (R² drop): {importance_gnn:.4f}")

# --- 9.2 MLP Feature Importance (Permutation-based) ---
mlp_feature_importance = {}
mlp_data_test_raw = test_orig[numeric_cols]
for i, feature_name in enumerate(mlp_data_test_raw.columns):
    mlp_test_shuffled = np.copy(mlp_test)
    np.random.shuffle(mlp_test_shuffled[:, i])
    
    y_pred_shuffled = model.predict((extract_patch_for_generator(
        coords_test, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
    ), mlp_test_shuffled, gnn_test), verbose=0).flatten()
    y_pred_shuffled[np.isnan(y_pred_shuffled)] = 0
    r2_shuffled = r2_score(y_test, y_pred_shuffled)
    
    importance = baseline_r2 - r2_shuffled
    mlp_feature_importance[feature_name] = importance

print("\n--- MLP Feature Importance (Permutation-based) ---")
sorted_importance = sorted(mlp_feature_importance.items(), key=lambda item: item[1], reverse=True)
for feature, importance in sorted_importance:
    print(f"{feature:<20}: {importance:.4f}")
    
# ==================== 10. Save Model and Data for Reproducibility ==================== #
print("\n" + "="*80)
print("Saving Model, Data, and Feature Importance Results")
print("="*80)

# Create the single output directory
output_dir = "cnn_gnn_mlp"
os.makedirs(output_dir, exist_ok=True)

# Save the trained model in the Keras native format
model_filename = os.path.join(output_dir, f"fusion_model_{BUFFER_METERS}m.keras")
model.save(model_filename)
print(f"✅ Model saved to '{model_filename}'")

# Save the training history using pickle
history_filename = os.path.join(output_dir, "training_history.pkl")
with open(history_filename, 'wb') as f:
    pickle.dump(history.history, f)
print(f"✅ Training history saved to '{history_filename}'")

# --- New: Save Feature Importance Results ---
feature_importance_results = {
    "mlp_feature_names": test_orig[numeric_cols].columns.tolist(),
    "mlp_permutation_importance": mlp_feature_importance,
    "cnn_ablation_importance": importance_cnn,
    "mlp_ablation_importance": importance_mlp,
    "gnn_ablation_importance": importance_gnn
}
importance_filename = os.path.join(output_dir, "feature_importance.pkl")
with open(importance_filename, 'wb') as f:
    pickle.dump(feature_importance_results, f)
print(f"✅ Feature importance results saved to '{importance_filename}'")

# Save processed NumPy arrays for later use
np.savez_compressed(
    os.path.join(output_dir, "processed_train_data.npz"),
    coords=coords_train,
    mlp=mlp_train,
    y=y_train
)
np.savez_compressed(
    os.path.join(output_dir, "processed_test_data.npz"),
    coords=coords_test,
    mlp=mlp_test,
    y=y_test
)
np.savez_compressed(
    os.path.join(output_dir, "gnn_data.npz"),
    gnn_train=gnn_train,
    gnn_test=gnn_test
)
print(f"✅ Processed data arrays saved to '{output_dir}'")

# Save the raw dataframes to CSV for easy inspection
train_combined.to_csv(os.path.join(output_dir, "train_combined.csv"), index=False)
test_orig.to_csv(os.path.join(output_dir, "test_orig.csv"), index=False)
print(f"✅ Raw dataframes saved to '{output_dir}'")

# Garbage collect to free up memory
del model, history, train_generator
gc.collect()

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import pickle

# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
orig = pd.read_csv("../../data/WinterSeason1.csv")
river_100 = pd.read_csv("../data/Samples_100W.csv")
# Remove 'Source' column if it exists in river_100 dataframe
if 'Source' in river_100.columns:
    river_100.drop(columns="Source", inplace=True)

drop_cols = ['Stations', 'River', 'Lat', 'Long', 'geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# --- IMPUTATION FIX: Fill NaN values with 0 before further processing ---
orig.fillna(0, inplace=True)
river_100.fillna(0, inplace=True)

# --- Use an 80/20 train-test split for a larger test set ---
np.random.seed(42)
train_orig, test_orig = train_test_split(orig, test_size=0.2, random_state=42)

# Combine the river data with the new training set
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")

print(f"Using {len(raster_paths)} raster layers for CNN input.")
for r in raster_paths:
    print("  -", os.path.basename(r))

# ==================== 3. Create a Custom Data Generator ==================== #
def extract_patch_for_generator(coords, raster_files, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height):
    """
    Extracts a batch of patches from rasters for a given set of coordinates.
    This function is optimized to be called by the data generator for each batch.
    """
    patches = []
    # Loop through each coordinate pair in the batch
    for lon, lat in coords:
        channels = []
        # Loop through each raster file to get a single patch for each raster
        for rfile in raster_files:
            with rasterio.open(rfile) as src:
                try:
                    row, col = src.index(lon, lat)
                    win = Window(col - buffer_pixels_x, row - buffer_pixels_y, patch_width, patch_height)
                    arr = src.read(1, window=win, boundless=True, fill_value=0)
                    arr = arr.astype(np.float32)

                    # --- NORMALIZATION FIX: Add a small epsilon to avoid division by zero ---
                    max_val = np.nanmax(arr)
                    if max_val != 0:
                        arr /= max_val + 1e-8 # Add epsilon for stability
                except Exception as e:
                    print(f"Error processing {rfile} for coordinates ({lon}, {lat}): {e}")
                    arr = np.zeros((patch_width, patch_height), dtype=np.float32)
            channels.append(arr)
        patches.append(np.stack(channels, axis=-1))
    
    return np.array(patches)

class DataGenerator(Sequence):
    def __init__(self, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.coords = coords
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.raster_paths = raster_paths
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.buffer_meters = buffer_meters

        # Pre-calculate patch size from the first raster
        with rasterio.open(raster_paths[0]) as src:
            res_x, res_y = src.res
            self.buffer_pixels_x = int(self.buffer_meters / res_x)
            self.buffer_pixels_y = int(self.buffer_meters / res_y)
            self.patch_width = 2 * self.buffer_pixels_x
            self.patch_height = 2 * self.buffer_pixels_y

        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_coords = self.coords[batch_indices]
        batch_mlp = self.mlp_data[batch_indices]
        
        # Slice the GNN adjacency matrix for the current batch
        batch_gnn = self.gnn_data[batch_indices, :]

        batch_y = self.y[batch_indices]

        # Extract CNN patches for the current batch
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            self.raster_paths,
            self.buffer_pixels_x,
            self.buffer_pixels_y,
            self.patch_width,
            self.patch_height
        )

        # Return a tuple of inputs and the target, which Keras expects
        return (batch_cnn, batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long', 'Lat']].values
coords_test = test_orig[['Long', 'Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
# --- IMPUTATION FIX: Fill NaN in raw MLP data before scaling ---
train_combined.fillna(0, inplace=True)
test_orig.fillna(0, inplace=True)
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define Enhanced CNN–GNN–MLP Model ==================== #
def build_fusion_model(patch_shape, gnn_dim, mlp_dim):
    # CNN branch (for raster data)
    cnn_input = Input(shape=patch_shape, name="cnn_input")
    x = Conv2D(32, (3,3), activation="relu")(cnn_input)
    x = MaxPooling2D((2,2))(x)
    x = Conv2D(64, (3,3), activation="relu")(x)
    x = MaxPooling2D((2,2))(x)
    x = Flatten()(x)
    cnn_out = Dense(128, activation="relu", name="cnn_out")(x)

    # MLP branch (for numerical site features)
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    m = Dense(64, activation="relu")(mlp_input)
    mlp_out = Dense(32, activation="relu", name="mlp_out")(m)

    # GNN branch (for spatial connectivity)
    # The GNN input dimension is now the number of training samples
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    g = Dense(64, activation="relu")(gnn_input)
    gnn_out = Dense(32, activation="relu", name="gnn_out")(g)

    # Fusion Layer
    combined = Concatenate()([cnn_out, mlp_out, gnn_out])
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    model = Model(inputs=[cnn_input, mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

# We need to determine the final GNN input dimension for the model
# It's the total number of training samples
batch_size = 4
gnn_input_dim = len(coords_train)

# Helper function to get CNN patch shape from rasters
def get_cnn_patch_shape(raster_paths, buffer_meters):
    with rasterio.open(raster_paths[0]) as src:
        res_x, _ = src.res
        buffer_pixels = int(buffer_meters / res_x)
        return (2 * buffer_pixels, 2 * buffer_pixels, len(raster_paths))

cnn_patch_shape = get_cnn_patch_shape(raster_paths, BUFFER_METERS)


def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE)
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    
    # Handle the case where both y_true and y_pred are zero to avoid division by zero
    return np.mean(numerator / (denominator + 1e-8)) * 100


def evaluate_model(model, coords, mlp_data, gnn_data, y_true, raster_paths, buffer_meters, batch_size=4, return_preds=False):
    """
    Evaluates the model on a given dataset and returns the metrics or predictions.
    """
    num_samples = len(y_true)
    y_pred_list = []
    
    with rasterio.open(raster_paths[0]) as src:
        res_x, res_y = src.res
        buffer_pixels_x = int(buffer_meters / res_x)
        buffer_pixels_y = int(buffer_meters / res_y)
        patch_width = 2 * buffer_pixels_x
        patch_height = 2 * buffer_pixels_y

    for i in range(0, num_samples, batch_size):
        batch_coords = coords[i:i+batch_size]
        batch_mlp = mlp_data[i:i+batch_size]
        batch_gnn = gnn_data[i:i+batch_size, :]
        
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            raster_paths,
            buffer_pixels_x,
            buffer_pixels_y,
            patch_width,
            patch_height
        )
        
        y_pred_list.append(model.predict((batch_cnn, batch_mlp, batch_gnn), verbose=0).flatten())
        
    y_pred = np.concatenate(y_pred_list)
    
    # --- NaN FIX: Ensure y_pred has no NaNs before calculating metrics ---
    y_pred[np.isnan(y_pred)] = 0
    
    if return_preds:
        return y_pred
    else:
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        smap = smape(y_true, y_pred)
        return r2, mae, rmse, smap


# ==================== 6. K-Fold Cross-Validation and Model Saving ==================== #
print("\n" + "="*80)
print(f"Starting 5-Fold Cross-Validation for CNN–GNN–MLP Model ({BUFFER_METERS}m)")
print("="*80)

# Create the directory to save the models
output_dir = "models/cnn_gnn_mlp"
os.makedirs(output_dir, exist_ok=True)

# Combine all training data for K-Fold splitting
all_coords = coords_train
all_mlp = mlp_train
all_gnn = gnn_train
all_y = y_train

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []
best_r2_val = -np.inf
best_model = None
best_fold = -1
fold_num = 1

for train_index, val_index in kf.split(all_y):
    print(f"\n--- Fold {fold_num}/{kf.n_splits} ---")
    
    # Split the data for the current fold
    fold_train_coords, fold_val_coords = all_coords[train_index], all_coords[val_index]
    fold_train_mlp, fold_val_mlp = all_mlp[train_index], all_mlp[val_index]
    # GNN matrix slicing needs to be handled carefully. The adjacency matrix depends on the training data.
    fold_train_gnn = all_gnn[train_index, :]
    fold_val_gnn = all_gnn[val_index, :]
    fold_train_y, fold_val_y = all_y[train_index], all_y[val_index]
    
    # Create generators for the current fold's data
    fold_train_generator = DataGenerator(
        coords=fold_train_coords,
        mlp_data=fold_train_mlp,
        gnn_data=fold_train_gnn,
        y=fold_train_y,
        raster_paths=raster_paths,
        buffer_meters=BUFFER_METERS,
        batch_size=batch_size,
        shuffle=True
    )
    
    # Validation generator for evaluation
    fold_val_generator = DataGenerator(
        coords=fold_val_coords,
        mlp_data=fold_val_mlp,
        gnn_data=fold_val_gnn,
        y=fold_val_y,
        raster_paths=raster_paths,
        buffer_meters=BUFFER_METERS,
        batch_size=batch_size,
        shuffle=False
    )
    
    # Build and compile a new model for each fold
    model = build_fusion_model(cnn_patch_shape, gnn_input_dim, mlp_train.shape[1])
    
    # Define a unique filename for each fold's best model
    checkpoint_filepath = os.path.join(output_dir, f'best_model_fold_{fold_num}.keras')
    model_checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False,
        monitor='val_loss',
        mode='min',
        save_best_only=True
    )
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    history = model.fit(
        fold_train_generator,
        epochs=100,
        verbose=1,
        callbacks=[early_stopping, model_checkpoint_callback],
        validation_data=fold_val_generator
    )
    
    # Evaluate on the validation set
    val_r2, val_mae, val_rmse, val_smape = evaluate_model(
        model, 
        fold_val_coords, 
        fold_val_mlp, 
        fold_val_gnn, 
        fold_val_y, 
        raster_paths, 
        BUFFER_METERS, 
        batch_size
    )

    print(f"Fold {fold_num} - R²: {val_r2:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, SMAPE: {val_smape:.4f}")
    fold_results.append({
        'fold': fold_num,
        'val_r2': val_r2,
        'val_mae': val_mae,
        'val_rmse': val_rmse,
        'val_smape': val_smape
    })
    
    # Check if this fold produced the best model so far
    if val_r2 > best_r2_val:
        best_r2_val = val_r2
        best_fold = fold_num
        best_model = tf.keras.models.load_model(checkpoint_filepath)
        print(f"   -> New best model found in Fold {fold_num}")
    
    fold_num += 1
    
print("\n" + "="*80)
print("Cross-Validation Complete")
print(f"Best model from Fold {best_fold} with Validation R²: {best_r2_val:.4f}")
print("Loading the best model for final evaluation.")
model = best_model
print("="*80)

# ==================== 7. Final Evaluation on Test Set ==================== #
r2_test, mae_test, rmse_test, smape_test = evaluate_model(
    model, 
    coords_test, 
    mlp_test, 
    gnn_test, 
    y_test, 
    raster_paths, 
    buffer_meters=BUFFER_METERS, 
    batch_size=batch_size
)

print(f"\n✅ CNN–GNN–MLP Model Final Performance on Test Set ({BUFFER_METERS}m):")
print(f"R² Test: {r2_test:.4f} | MAE Test: {mae_test:.4f} | RMSE Test: {rmse_test:.4f} | SMAPE Test: {smape_test:.4f}")

Using 26 raster layers for CNN input.
  - bui.tif
  - ndsi.tif
  - savi.tif
  - ndbsi.tif
  - ui.tif
  - ndwi.tif
  - ndbi.tif
  - awei.tif
  - evi.tif
  - mndwi.tif
  - ndvi.tif
  - LULC2020.tif
  - LULC2021.tif
  - LULC2022.tif
  - LULC2019.tif
  - LULC2018.tif
  - LULC2017.tif
  - Pb_R.tif
  - ClayR.tif
  - SandR.tif
  - CdR.tif
  - CrR.tif
  - AsR.tif
  - SiltR.tif
  - CuR.tif
  - NiR.tif

Starting 5-Fold Cross-Validation for CNN–GNN–MLP Model (500m)

--- Fold 1/5 ---
Epoch 1/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 267ms/step - loss: 38174.6875 - val_loss: 30134.4414
Epoch 2/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 258ms/step - loss: 14420.9473 - val_loss: 5465.9009
Epoch 3/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 321ms/step - loss: 9699.5615 - val_loss: 4578.1162
Epoch 4/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 268ms/step - loss: 5740.7710 - val_loss: 7109.7998
Epoch 5/100


In [None]:
# ==================== 8. Save Final Results ==================== #
print("\n" + "="*80)
print("Saving Training and Evaluation Results")
print("="*80)

# Save the fold results and final metrics
results_filename = os.path.join(output_dir, "training_results.pkl")
final_metrics = {
    'test_r2': r2_test,
    'test_rmse': rmse_test,
    'kfold_results': fold_results
}
with open(results_filename, 'wb') as f:
    pickle.dump(final_metrics, f)
print(f"✅ Training results saved to '{results_filename}'")

# Save processed NumPy arrays for later use
np.savez_compressed(
    os.path.join(output_dir, "processed_train_data.npz"),
    coords=coords_train,
    mlp=mlp_train,
    y=y_train
)
np.savez_compressed(
    os.path.join(output_dir, "processed_test_data.npz"),
    coords=coords_test,
    mlp=mlp_test,
    y=y_test
)
np.savez_compressed(
    os.path.join(output_dir, "gnn_data.npz"),
    gnn_train=gnn_train,
    gnn_test=gnn_test
)
print(f"✅ Processed data arrays saved to '{output_dir}'")

# Save the raw dataframes to CSV for easy inspection
train_combined.to_csv(os.path.join(output_dir, "train_combined.csv"), index=False)
test_orig.to_csv(os.path.join(output_dir, "test_orig.csv"), index=False)
print(f"✅ Raw dataframes saved to '{output_dir}'")

# Garbage collect to free up memory
del model
gc.collect()


In [7]:
import numpy as np
import pandas as pd
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc
import pickle

# --- New Imports for LIME and plotting ---
try:
    from lime import lime_image
    from lime import lime_tabular
    from skimage.segmentation import mark_boundaries, slic
except ImportError:
    print("LIME and/or scikit-image not found. Please install with `pip install lime scikit-image`.")
    lime_image = None
    lime_tabular = None

import matplotlib.pyplot as plt

# Define the buffer size in meters
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
orig = pd.read_csv("../../data/WinterSeason1.csv")
river_100 = pd.read_csv("../data/Samples_100W.csv")
river_100.drop(columns="Source", inplace=True)

drop_cols = ['Stations', 'River', 'Lat', 'Long', 'geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# --- IMPUTATION FIX: Fill NaN values with 0 before further processing ---
orig.fillna(0, inplace=True)
river_100.fillna(0, inplace=True)

# Train-test split
np.random.seed(42)
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDWW/*.tif")

print(f"Using {len(raster_paths)} raster layers for CNN input.")
for r in raster_paths:
    print("  -", os.path.basename(r))

# ==================== 3. Global Patch Dimension Calculation ==================== #
# We calculate these globally so they are accessible by all functions and code blocks.
with rasterio.open(raster_paths[0]) as src:
    res_x, res_y = src.res
    buffer_pixels_x = int(BUFFER_METERS / res_x)
    buffer_pixels_y = int(BUFFER_METERS / res_y)
    patch_width = 2 * buffer_pixels_x
    patch_height = 2 * buffer_pixels_y
    print(f"\nCalculated patch size: {patch_width}x{patch_height} pixels.")


# ==================== 4. Create a Custom Data Generator ==================== #
def extract_patch_for_generator(coords, raster_files):
    """
    Extracts a batch of patches from rasters for a given set of coordinates.
    """
    patches = []
    # Loop through each coordinate pair in the batch
    for lon, lat in coords:
        channels = []
        # Loop through each raster file to get a single patch for each raster
        for rfile in raster_files:
            with rasterio.open(rfile) as src:
                try:
                    row, col = src.index(lon, lat)
                    win = Window(col - buffer_pixels_x, row - buffer_pixels_y, patch_width, patch_height)
                    arr = src.read(1, window=win, boundless=True, fill_value=0)
                    arr = arr.astype(np.float32)

                    # --- NORMALIZATION FIX: Add a small epsilon to avoid division by zero ---
                    max_val = np.nanmax(arr)
                    if max_val != 0:
                        arr /= max_val + 1e-8 # Add epsilon for stability
                except Exception as e:
                    print(f"Error processing {rfile} for coordinates ({lon}, {lat}): {e}")
                    arr = np.zeros((patch_width, patch_height), dtype=np.float32)
            channels.append(arr)
        patches.append(np.stack(channels, axis=-1))
    
    return np.array(patches)

class DataGenerator(Sequence):
    def __init__(self, coords, mlp_data, gnn_data, y, raster_paths, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.coords = coords
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.raster_paths = raster_paths
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_coords = self.coords[batch_indices]
        batch_mlp = self.mlp_data[batch_indices]
        
        # Slice the GNN adjacency matrix for the current batch
        batch_gnn = self.gnn_data[batch_indices, :]

        batch_y = self.y[batch_indices]

        # Extract CNN patches for the current batch
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            self.raster_paths
        )

        # Return a tuple of inputs and the target, which Keras expects
        return (batch_cnn, batch_mlp, batch_gnn), batch_y

# ==================== 5. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long', 'Lat']].values
coords_test = test_orig[['Long', 'Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
# --- IMPUTATION FIX: Fill NaN in raw MLP data before scaling ---
train_combined.fillna(0, inplace=True)
test_orig.fillna(0, inplace=True)
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 6. Define Enhanced CNN–GNN–MLP Model ==================== #
def build_fusion_model(patch_shape, gnn_dim, mlp_dim):
    # CNN branch (for raster data)
    cnn_input = Input(shape=patch_shape, name="cnn_input")
    x = Conv2D(32, (3,3), activation="relu")(cnn_input)
    x = MaxPooling2D((2,2))(x)
    x = Conv2D(64, (3,3), activation="relu")(x)
    x = MaxPooling2D((2,2))(x)
    x = Flatten()(x)
    cnn_out = Dense(128, activation="relu", name="cnn_out")(x)

    # MLP branch (for numerical site features)
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    m = Dense(64, activation="relu")(mlp_input)
    mlp_out = Dense(32, activation="relu", name="mlp_out")(m)

    # GNN branch (for spatial connectivity)
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    g = Dense(64, activation="relu")(gnn_input)
    gnn_out = Dense(32, activation="relu", name="gnn_out")(g)

    # Fusion Layer
    combined = Concatenate()([cnn_out, mlp_out, gnn_out])
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    model = Model(inputs=[cnn_input, mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

# We need to determine the final GNN input dimension for the model
# It's the total number of training samples
batch_size = 4
gnn_input_dim = len(coords_train)

cnn_patch_shape = (patch_width, patch_height, len(raster_paths))
model = build_fusion_model(cnn_patch_shape, gnn_input_dim, mlp_train.shape[1])
model.summary()

# ==================== 7. Create Data Generators ==================== #
train_generator = DataGenerator(
    coords=coords_train,
    mlp_data=mlp_train,
    gnn_data=gnn_train,
    y=y_train,
    raster_paths=raster_paths,
    batch_size=batch_size,
    shuffle=True
)

# --- NEW: Function to calculate SMAPE ---
def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE).
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # Avoid division by zero
    return np.mean(numerator / (denominator + 1e-8)) * 100

# Function to evaluate the model on the test set
def evaluate_model(model, coords_test, mlp_test, gnn_test_matrix, y_test, raster_paths, batch_size=4, return_preds=False):
    num_samples = len(y_test)
    y_pred_list = []
    
    for i in range(0, num_samples, batch_size):
        batch_coords = coords_test[i:i+batch_size]
        batch_mlp = mlp_test[i:i+batch_size]
        
        batch_gnn = gnn_test_matrix[i:i+batch_size, :]
        batch_y = y_test[i:i+batch_size]

        batch_cnn = extract_patch_for_generator(
            batch_coords,
            raster_paths
        )
        
        y_pred_list.append(model.predict((batch_cnn, batch_mlp, batch_gnn), verbose=0).flatten())
    
    y_pred = np.concatenate(y_pred_list)
    
    if return_preds:
        return y_pred
    else:
        # --- NaN FIX: Ensure y_pred has no NaNs before calculating metrics ---
        y_pred[np.isnan(y_pred)] = 0
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        # --- NEW: Calculate MAE and SMAPE ---
        mae = mean_absolute_error(y_test, y_pred)
        smap_val = smape(y_test, y_pred)
        return r2, rmse, mae, smap_val


# ==================== 8. Train Model ==================== #
print("\n" + "="*80)
print(f"Analyzing with CNN–GNN–MLP Model ({BUFFER_METERS}m)")
print("="*80)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=train_generator # Using the same generator for validation for this example
)


# ==================== 9. Evaluate ==================== #
# Re-create a data generator without shuffling for evaluation on the training set
train_eval_generator = DataGenerator(
    coords=coords_train,
    mlp_data=mlp_train,
    gnn_data=gnn_train,
    y=y_train,
    raster_paths=raster_paths,
    batch_size=batch_size,
    shuffle=False
)

y_pred_train = model.predict(train_eval_generator, verbose=0).flatten()
# --- NaN FIX: Ensure y_pred has no NaNs before calculating metrics ---
y_pred_train[np.isnan(y_pred_train)] = 0
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))
# --- NEW: Calculate MAE and SMAPE for training data ---
mae_train = mean_absolute_error(y_train[:len(y_pred_train)], y_pred_train)
smape_train = smape(y_train[:len(y_pred_train)], y_pred_train)

# --- NEW: Get all metrics for the test set ---
r2_test, rmse_test, mae_test, smape_test = evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, batch_size=batch_size)

print(f"\n✅ CNN–GNN–MLP Model Performance ({BUFFER_METERS}m):")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f} | MAE Train: {mae_train:.4f} | SMAPE Train: {smape_train:.4f}%")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f} | MAE Test: {mae_test:.4f} | SMAPE Test: {smape_test:.4f}%")



Using 26 raster layers for CNN input.
  - bui.tif
  - ndsi.tif
  - savi.tif
  - ndbsi.tif
  - ui.tif
  - ndwi.tif
  - ndbi.tif
  - awei.tif
  - evi.tif
  - mndwi.tif
  - ndvi.tif
  - LULC2020.tif
  - LULC2021.tif
  - LULC2022.tif
  - LULC2019.tif
  - LULC2018.tif
  - LULC2017.tif
  - ClayW.tif
  - CdW.tif
  - SandW.tif
  - SiltW.tif
  - AsW.tif
  - CrW.tif
  - NiW.tif
  - PbW.tif
  - CuW.tif

Calculated patch size: 100x100 pixels.



Analyzing with CNN–GNN–MLP Model (500m)
Epoch 1/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 200ms/step - loss: 32034.9219 - val_loss: 6699.8760
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 190ms/step - loss: 6382.7202 - val_loss: 6123.4521
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 215ms/step - loss: 5196.4819 - val_loss: 4933.7158
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 199ms/step - loss: 5357.5098 - val_loss: 7279.1553
Epoch 5/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 201ms/step - loss: 5834.1704 - val_loss: 5486.8887
Epoch 6/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 196ms/step - loss: 6644.4551 - val_loss: 5760.2104
Epoch 7/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 189ms/step - loss: 6641.4653 - val_loss: 4195.2559
Epoch 8/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [None]:
# ==================== 10. Feature Importance Analysis ==================== #
print("\n" + "-"*50)
print(f"Feature Importance Analysis for {BUFFER_METERS}m")
print("-"*50)

# --- 10.1 Permutation-based Feature Importance for ALL Branches ---
y_pred_baseline = evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, batch_size=4, return_preds=True)
y_pred_baseline[np.isnan(y_pred_baseline)] = 0
baseline_r2 = r2_score(y_test, y_pred_baseline)
print(f"\nBaseline Performance on Test Set: R² = {baseline_r2:.4f}\n")

# Permutation for MLP features
mlp_feature_importance = {}
mlp_data_test_raw = test_orig[numeric_cols]
for i, feature_name in enumerate(mlp_data_test_raw.columns):
    mlp_test_shuffled = np.copy(mlp_test)
    np.random.shuffle(mlp_test_shuffled[:, i])
    
    y_pred_shuffled = model.predict((extract_patch_for_generator(
        coords_test, raster_paths
    ), mlp_test_shuffled, gnn_test), verbose=0).flatten()
    y_pred_shuffled[np.isnan(y_pred_shuffled)] = 0
    r2_shuffled = r2_score(y_test, y_pred_shuffled)
    
    importance = baseline_r2 - r2_shuffled
    mlp_feature_importance[feature_name] = importance

print("--- MLP Feature Importance (Permutation-based) ---")
sorted_importance_mlp = sorted(mlp_feature_importance.items(), key=lambda item: item[1], reverse=True)
for feature, importance in sorted_importance_mlp:
    print(f"{feature:<20}: {importance:.4f}")
    
# Permutation for CNN features
print("\n--- CNN Feature Importance (Permutation-based) ---")
cnn_feature_importance = {}
patch_test = extract_patch_for_generator(
    coords_test, raster_paths
)
for i, raster_path in enumerate(raster_paths):
    raster_name = os.path.basename(raster_path)
    
    # Create a shuffled copy of the CNN patches, but only for the current channel (raster)
    cnn_test_shuffled = np.copy(patch_test)
    np.random.shuffle(cnn_test_shuffled[:, :, :, i])
    
    y_pred_shuffled = model.predict((cnn_test_shuffled, mlp_test, gnn_test), verbose=0).flatten()
    y_pred_shuffled[np.isnan(y_pred_shuffled)] = 0
    r2_shuffled = r2_score(y_test, y_pred_shuffled)
    
    importance = baseline_r2 - r2_shuffled
    cnn_feature_importance[raster_name] = importance

sorted_importance_cnn = sorted(cnn_feature_importance.items(), key=lambda item: item[1], reverse=True)
for feature, importance in sorted_importance_cnn:
    print(f"{feature:<20}: {importance:.4f}")

# Permutation for GNN branch
print("\n--- GNN Feature Importance (Permutation-based) ---")
gnn_test_shuffled = np.copy(gnn_test)
np.random.shuffle(gnn_test_shuffled) # Shuffle the rows of the adjacency matrix
y_pred_shuffled_gnn = model.predict((patch_test, mlp_test, gnn_test_shuffled), verbose=0).flatten()
y_pred_shuffled_gnn[np.isnan(y_pred_shuffled_gnn)] = 0
r2_shuffled_gnn = r2_score(y_test, y_pred_shuffled_gnn)
importance_gnn = baseline_r2 - r2_shuffled_gnn
print(f"GNN Branch Importance: {importance_gnn:.4f}")

# --- 10.2 Intrinsic Feature Importance (from model weights/gradients) ---
print("\n--- Intrinsic Feature Importance ---")
# Get weights from the first MLP layer
mlp_weights = model.get_layer('mlp_out').get_weights()[0]
# Use the L1 norm of the weights to approximate importance for each MLP feature
mlp_intrinsic_importance = np.sum(np.abs(mlp_weights), axis=1)
mlp_intrinsic_dict = dict(zip(numeric_cols, mlp_intrinsic_importance))
sorted_intrinsic_mlp = sorted(mlp_intrinsic_dict.items(), key=lambda item: item[1], reverse=True)
print("\nMLP Intrinsic Importance (First Layer Weights):")
for feature, importance in sorted_intrinsic_mlp:
    print(f"{feature:<20}: {importance:.4f}")

# --- 10.3 LIME (Local Interpretable Model-agnostic Explanations) ---
if lime_image and lime_tabular:
    print("\n" + "="*50)
    print("LIME Explanations for a Test Sample")
    print("="*50)

    # Pick a sample to explain (e.g., the first test sample)
    sample_index = 0
    test_sample_coords = coords_test[sample_index:sample_index+1]
    test_sample_mlp = mlp_test[sample_index:sample_index+1]
    test_sample_gnn = gnn_test[sample_index:sample_index+1]
    test_sample_cnn = extract_patch_for_generator(
        test_sample_coords, raster_paths
    )
    
    # --- LIME for the MLP (Tabular) features ---
    def mlp_predict_fn(x):
        num_samples_lime = x.shape[0]
        cnn_input_batch = np.tile(test_sample_cnn, (num_samples_lime, 1, 1, 1))
        gnn_input_batch = np.tile(test_sample_gnn, (num_samples_lime, 1))
        
        return model.predict((cnn_input_batch, x, gnn_input_batch), verbose=0)

    explainer_mlp = lime_tabular.LimeTabularExplainer(
        training_data=mlp_train,
        feature_names=numeric_cols.tolist(),
        class_names=['RI'],
        mode='regression'
    )
    
    print("\nLocal Explanation for MLP features:")
    explanation_mlp = explainer_mlp.explain_instance(
        data_row=test_sample_mlp[0],
        predict_fn=mlp_predict_fn,
        num_features=len(numeric_cols)
    )
    # Corrected way to access the prediction for regression
    print(f"Prediction: {explanation_mlp.local_pred[0]:.4f}")
    print("Feature contributions:")
    for feature, weight in explanation_mlp.as_list():
        print(f"  - {feature}: {weight:.4f}")

    # --- LIME for the CNN (Image) features ---
    def cnn_predict_fn(images):
        num_samples_lime = images.shape[0]
        
        # LIME operates on the image, so we just need to pass the images and tile the other inputs
        # Normalization is important here, as LIME generates small, random values
        images_rescaled = images * np.nanmax(test_sample_cnn)

        mlp_input_batch = np.tile(test_sample_mlp, (num_samples_lime, 1))
        gnn_input_batch = np.tile(test_sample_gnn, (num_samples_lime, 1))
        
        # --- FIX: Return the 2D prediction array so LIME can correctly index it ---
        return model.predict((images_rescaled, mlp_input_batch, gnn_input_batch), verbose=0)
    
    # Create a new segmentation function that works with multi-channel images
    def custom_slic_segmentation(image):
        return slic(image, n_segments=50, compactness=10, start_label=1, channel_axis=-1)

    explainer_cnn = lime_image.LimeImageExplainer()

    print("\nLocal Explanation for CNN features (this may take a moment)...")
    explanation_cnn = explainer_cnn.explain_instance(
        image=test_sample_cnn[0],
        classifier_fn=cnn_predict_fn,
        top_labels=1,
        hide_color=0,
        num_samples=1000,
        # Pass the custom segmentation function here
        segmentation_fn=custom_slic_segmentation
    )
    
    temp, mask = explanation_cnn.get_image_and_mask(
        explanation_cnn.top_labels[0],
        positive_only=True,
        num_features=5,
        hide_rest=True
    )
    
    # --- FIX: Extract and process a single channel for plotting ---
    # We will use the first channel (index 0) of the multi-channel raster for visualization.
    # You can change this index to see explanations for other rasters in the stack.
    single_channel_image = temp[:, :, 0]
    
    # Use mark_boundaries to show the segments on the original single-channel image
    temp_with_boundaries = mark_boundaries(single_channel_image / np.max(single_channel_image), explanation_cnn.segments, mode='thick', color=(1, 0, 0))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
    ax1.imshow(temp_with_boundaries)
    ax1.set_title("Original Raster with Boundaries (Channel 0)")
    ax2.imshow(temp_with_boundaries, interpolation='none')
    ax2.imshow(mask, cmap='viridis', alpha=0.5)
    ax2.set_title("LIME Explanation Overlay (Channel 0)")
    plt.tight_layout()
    plt.show()
    print("\nLIME explanation plot has been generated.")


# ==================== 11. Save Model and Data for Reproducibility ==================== #
print("\n" + "="*80)
print("Saving Model, Data, and Feature Importance Results")
print("="*80)

# --- Save Feature Importance Results ---
feature_importance_results = {
    "mlp_feature_names": test_orig[numeric_cols].columns.tolist(),
    "mlp_permutation_importance": mlp_feature_importance,
    "cnn_permutation_importance": cnn_feature_importance,
    "gnn_permutation_importance": importance_gnn,
    "mlp_intrinsic_importance": mlp_intrinsic_dict,
    "lime_analysis_performed": True 
}

In [None]:
# Permutation for GNN branch (new)
print("\n--- GNN Feature Importance (Permutation-based) ---")
gnn_test_shuffled = np.copy(gnn_test)
np.random.shuffle(gnn_test_shuffled) # Shuffle the rows of the adjacency matrix
y_pred_shuffled_gnn = model.predict((patch_test, mlp_test, gnn_test_shuffled), verbose=0).flatten()
y_pred_shuffled_gnn[np.isnan(y_pred_shuffled_gnn)] = 0
r2_shuffled_gnn = r2_score(y_test, y_pred_shuffled_gnn)
importance_gnn = baseline_r2 - r2_shuffled_gnn
print(f"GNN Branch Importance: {importance_gnn:.4f}")

# --- 9.2 Intrinsic Feature Importance (from model weights/gradients) ---
# This method approximates importance by analyzing the model's internal structure.
print("\n--- Intrinsic Feature Importance ---")
# Get weights from the first MLP layer
mlp_weights = model.get_layer('mlp_out').get_weights()[0]
# Use the L1 norm of the weights to approximate importance for each MLP feature
mlp_intrinsic_importance = np.sum(np.abs(mlp_weights), axis=1)
mlp_intrinsic_dict = dict(zip(numeric_cols, mlp_intrinsic_importance))
sorted_intrinsic_mlp = sorted(mlp_intrinsic_dict.items(), key=lambda item: item[1], reverse=True)
print("\nMLP Intrinsic Importance (First Layer Weights):")
for feature, importance in sorted_intrinsic_mlp:
    print(f"{feature:<20}: {importance:.4f}")

# --- 9.3 LIME (Local Interpretable Model-agnostic Explanations) ---
# LIME explains a single prediction by perturbing a single sample and fitting a simple, local model.
# We'll use two LIME explainers: one for the image data (CNN) and one for the tabular data (MLP).
if lime_image and lime_tabular:
    print("\n" + "="*50)
    print("LIME Explanations for a Test Sample")
    print("="*50)

    # Pick a sample to explain (e.g., the first test sample)
    sample_index = 0
    test_sample_coords = coords_test[sample_index:sample_index+1]
    test_sample_mlp = mlp_test[sample_index:sample_index+1]
    test_sample_gnn = gnn_test[sample_index:sample_index+1]
    test_sample_cnn = extract_patch_for_generator(
        test_sample_coords, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
    )
    
    # --- LIME for the MLP (Tabular) features ---
    # The predict_fn for LIME needs to return the model's prediction for a given input batch.
    # We must keep the CNN and GNN inputs constant for this explanation.
    def mlp_predict_fn(x):
        # The LIME explainer perturbs the tabular data (x)
        # We need to reshape x to have the correct number of samples.
        num_samples_lime = x.shape[0]
        # Replicate the CNN and GNN inputs for each LIME sample
        cnn_input_batch = np.tile(test_sample_cnn, (num_samples_lime, 1, 1, 1))
        gnn_input_batch = np.tile(test_sample_gnn, (num_samples_lime, 1))
        
        return model.predict((cnn_input_batch, x, gnn_input_batch), verbose=0).flatten()

    explainer_mlp = lime_tabular.LimeTabularExplainer(
        training_data=mlp_train,
        feature_names=numeric_cols.tolist(),
        class_names=['RI'],
        mode='regression'
    )
    
    print("\nLocal Explanation for MLP features:")
    explanation_mlp = explainer_mlp.explain_instance(
        data_row=test_sample_mlp[0],
        predict_fn=mlp_predict_fn,
        num_features=len(numeric_cols)
    )
    explanation_mlp.as_list()
    # Print a textual explanation of the MLP features
    print(f"Prediction: {explanation_mlp.predict_proba[0]:.4f}")
    print("Feature contributions:")
    for feature, weight in explanation_mlp.as_list():
        print(f"  - {feature}: {weight:.4f}")

    # --- LIME for the CNN (Image) features ---
    # The predict_fn for LIME needs to return the model's prediction for a given image batch.
    # We must keep the MLP and GNN inputs constant for this explanation.
    def cnn_predict_fn(images):
        # The LIME explainer perturbs the image data.
        # It expects a predict_fn that takes a batch of images and returns an array of predictions.
        # The shape of the images passed by LIME is (num_samples, width, height, channels).
        num_samples_lime = images.shape[0]
        
        # LIME generates images with values between 0 and 1, we must scale them back to match our model's input.
        images_rescaled = images * np.nanmax(test_sample_cnn)

        # Replicate the MLP and GNN inputs for each LIME sample
        mlp_input_batch = np.tile(test_sample_mlp, (num_samples_lime, 1))
        gnn_input_batch = np.tile(test_sample_gnn, (num_samples_lime, 1))
        
        return model.predict((images_rescaled, mlp_input_batch, gnn_input_batch), verbose=0).flatten()

    explainer_cnn = lime_image.LimeImageExplainer()

    # The explanation is computationally expensive, so we'll do it for one image.
    print("\nLocal Explanation for CNN features (this may take a moment)...")
    explanation_cnn = explainer_cnn.explain_instance(
        image=test_sample_cnn[0],
        classifier_fn=cnn_predict_fn,
        top_labels=1, # We only have one output class (RI)
        hide_color=0, # Color to use for masked pixels
        num_samples=1000 # Number of perturbed samples to generate for the explanation
    )
    
    # Get the explanation for the top label (our single RI output)
    temp, mask = explanation_cnn.get_image_and_mask(
        explanation_cnn.top_labels[0],
        positive_only=True,
        num_features=5, # Show the top 5 contributing features
        hide_rest=True
    )

    # Plot the explanation
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    ax1.imshow(test_sample_cnn[0, :, :, 0], cmap='viridis') # Show one channel of the original image
    ax1.set_title("Original Raster (Channel 0)")
    ax2.imshow(test_sample_cnn[0, :, :, 0] / np.max(test_sample_cnn[0, :, :, 0]), cmap='viridis', interpolation='none')
    ax2.imshow(mask, cmap='gray', alpha=0.5)
    ax2.set_title("LIME Explanation Overlay")
    plt.tight_layout()
    plt.show()
    print("\nLIME explanation plot has been generated.")


# ==================== 10. Save Model and Data for Reproducibility ==================== #
print("\n" + "="*80)
print("Saving Model, Data, and Feature Importance Results")
print("="*80)

# --- Save Feature Importance Results ---
feature_importance_results = {
    "mlp_feature_names": test_orig[numeric_cols].columns.tolist(),
    "mlp_permutation_importance": mlp_feature_importance,
    "cnn_permutation_importance": cnn_feature_importance,
    "gnn_permutation_importance": importance_gnn,
    "mlp_intrinsic_importance": mlp_intrinsic_dict,
    # LIME results are for a single sample and are best analyzed interactively,
    # so we'll just note their presence.
    "lime_analysis_performed": True 
}

