In [1]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input,
    Conv2D,
    MaxPooling2D,
    Flatten,
    Dense,
    Concatenate,
    Dropout,
    Layer,
    Lambda,
    GlobalAveragePooling2D,
    Reshape,
    Multiply
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import sys
from io import StringIO
import pickle

# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
# NOTE: The data loading logic remains the same.
# Replace with your actual data paths if needed
orig = pd.read_csv("../../data/WinterSeason1.csv")
river_100 = pd.read_csv("../data/Samples_100W.csv")

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDWW/*.tif")

print(f"Using {len(raster_paths)} raster layers for CNN input.")
for r in raster_paths:
    print("  -", os.path.basename(r))

# ==================== 3. Create a Custom Data Generator ==================== #
def extract_patch_for_generator(coords, raster_files, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height):
    """
    Extracts a batch of patches from rasters for a given set of coordinates.
    This function is optimized to be called by the data generator for each batch.
    """
    patches = []
    # Loop through each coordinate pair in the batch
    for lon, lat in coords:
        channels = []
        # Loop through each raster file to get a single patch for each raster
        for rfile in raster_files:
            with rasterio.open(rfile) as src:
                try:
                    row, col = src.index(lon, lat)
                    win = Window(col - buffer_pixels_x, row - buffer_pixels_y, patch_width, patch_height)
                    arr = src.read(1, window=win, boundless=True, fill_value=0)
                    arr = arr.astype(np.float32)

                    if np.nanmax(arr) != 0:
                        arr /= np.nanmax(arr)
                except Exception as e:
                    print(f"Error processing {rfile} for coordinates ({lon}, {lat}): {e}")
                    arr = np.zeros((patch_width, patch_height), dtype=np.float32)
            channels.append(arr)
        patches.append(np.stack(channels, axis=-1))
    
    return np.array(patches)

class DataGenerator(Sequence):
    def __init__(self, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.coords = coords
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.raster_paths = raster_paths
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.buffer_meters = buffer_meters

        # Pre-calculate patch size from the first raster
        with rasterio.open(raster_paths[0]) as src:
            res_x, res_y = src.res
            self.buffer_pixels_x = int(self.buffer_meters / res_x)
            self.buffer_pixels_y = int(self.buffer_meters / res_y)
            self.patch_width = 2 * self.buffer_pixels_x
            self.patch_height = 2 * self.buffer_pixels_y

        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_coords = self.coords[batch_indices]
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]

        # Extract CNN patches for the current batch
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            self.raster_paths,
            self.buffer_pixels_x,
            self.buffer_pixels_y,
            self.patch_width,
            self.patch_height
        )

        return (batch_cnn, batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long','Lat']].values
coords_test = test_orig[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define Base Models ==================== #
def build_cnn_mlp_model(patch_shape, mlp_dim):
    cnn_input = Input(shape=patch_shape, name="cnn_input")
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")

    # CNN branch
    cnn_branch = Conv2D(32, (3,3), activation="relu", padding="same")(cnn_input)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_branch = Conv2D(64, (3,3), activation="relu", padding="same")(cnn_branch)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_embedding = Flatten()(cnn_branch)

    # MLP branch
    mlp_embedding = Dense(64, activation="relu")(mlp_input)
    mlp_embedding = Dense(32, activation="relu")(mlp_embedding)

    # Combine
    combined = Concatenate()([cnn_embedding, mlp_embedding])
    f = Dense(128, activation="relu")(combined)
    output = Dense(1, activation="linear", name="cnn_mlp_output")(f)
    
    model = Model(inputs=[cnn_input, mlp_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def build_gnn_mlp_model(gnn_dim, mlp_dim):
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")

    # GNN branch
    gnn_embedding = Dense(64, activation="relu")(gnn_input)
    gnn_embedding = Dense(32, activation="relu")(gnn_embedding)

    # MLP branch
    mlp_embedding = Dense(64, activation="relu")(mlp_input)
    mlp_embedding = Dense(32, activation="relu")(mlp_embedding)

    # Combine
    combined = Concatenate()([gnn_embedding, mlp_embedding])
    f = Dense(64, activation="relu")(combined)
    output = Dense(1, activation="linear", name="gnn_mlp_output")(f)
    
    model = Model(inputs=[gnn_input, mlp_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def build_cnn_gnn_model(patch_shape, gnn_dim):
    cnn_input = Input(shape=patch_shape, name="cnn_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")

    # CNN branch
    cnn_branch = Conv2D(32, (3,3), activation="relu", padding="same")(cnn_input)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_branch = Conv2D(64, (3,3), activation="relu", padding="same")(cnn_branch)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_embedding = Flatten()(cnn_branch)
    
    # GNN branch
    gnn_embedding = Dense(64, activation="relu")(gnn_input)
    gnn_embedding = Dense(32, activation="relu")(gnn_embedding)

    # Combine
    combined = Concatenate()([cnn_embedding, gnn_embedding])
    f = Dense(128, activation="relu")(combined)
    output = Dense(1, activation="linear", name="cnn_gnn_output")(f)
    
    model = Model(inputs=[cnn_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def build_meta_learner_model():
    # Takes predictions from the 3 base models as input
    pred1_input = Input(shape=(1,), name="pred1_input")
    pred2_input = Input(shape=(1,), name="pred2_input")
    pred3_input = Input(shape=(1,), name="pred3_input")

    # Concatenate the predictions
    combined = Concatenate()([pred1_input, pred2_input, pred3_input])
    
    # Simple MLP as the meta-learner
    f = Dense(32, activation="relu")(combined)
    f = Dense(16, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)
    
    model = Model(inputs=[pred1_input, pred2_input, pred3_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

# ==================== 6. Create Data Generators for Base Models ==================== #
# NOTE: We create generators that provide only the necessary inputs for each base model.
class CNNDropoutGenerator(DataGenerator):
    def __getitem__(self, index):
        (batch_cnn, batch_mlp, batch_gnn), batch_y = super().__getitem__(index)
        return (batch_cnn, batch_mlp), batch_y

class GNNDropoutGenerator(DataGenerator):
    def __getitem__(self, index):
        (batch_cnn, batch_mlp, batch_gnn), batch_y = super().__getitem__(index)
        return (batch_gnn, batch_mlp), batch_y

class MLPDropoutGenerator(DataGenerator):
    def __getitem__(self, index):
        (batch_cnn, batch_mlp, batch_gnn), batch_y = super().__getitem__(index)
        return (batch_cnn, batch_gnn), batch_y

def get_base_model_predictions(model, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size):
    num_samples = len(y)
    y_pred_list = []
    
    with rasterio.open(raster_paths[0]) as src:
        res_x, res_y = src.res
        buffer_pixels_x = int(buffer_meters / res_x)
        buffer_pixels_y = int(buffer_meters / res_y)
        patch_width = 2 * buffer_pixels_x
        patch_height = 2 * buffer_pixels_y

    for i in range(0, num_samples, batch_size):
        batch_coords = coords[i:i+batch_size]
        batch_mlp = mlp_data[i:i+batch_size]
        batch_gnn = gnn_data[i:i+batch_size, :]
        
        batch_cnn = extract_patch_for_generator(
            batch_coords, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
        )
        
        # Check which inputs the model expects and provide them
        input_names = [inp.name for inp in model.inputs]
        input_dict = {}
        if 'cnn_input' in input_names:
            input_dict['cnn_input'] = batch_cnn
        if 'mlp_input' in input_names:
            input_dict['mlp_input'] = batch_mlp
        if 'gnn_input' in input_names:
            input_dict['gnn_input'] = batch_gnn
            
        y_pred_list.append(model.predict(input_dict).flatten())
            
    return np.concatenate(y_pred_list)



print("\n" + "="*80)
print(f"Analyzing Stacked Deep Ensemble for BUFFER_METERS = {BUFFER_METERS}m")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)

# Calculate CNN patch shape based on the current buffer size
with rasterio.open(raster_paths[0]) as src:
    res_x, res_y = src.res
    buffer_pixels_x = int(BUFFER_METERS / res_x)
    patch_width = 2 * buffer_pixels_x
    cnn_patch_shape = (patch_width, patch_width, len(raster_paths))

mlp_input_dim = mlp_train.shape[1]

# --- Train Base Models ---
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

print("\n--- Training CNN-MLP Base Model ---")
cnn_mlp_model = build_cnn_mlp_model(cnn_patch_shape, mlp_input_dim)
cnn_mlp_train_gen = CNNDropoutGenerator(
    coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
)
cnn_mlp_model.fit(cnn_mlp_train_gen, epochs=1, verbose=1, callbacks=[early_stopping], validation_data=cnn_mlp_train_gen)

print("\n--- Training GNN-MLP Base Model ---")
gnn_mlp_model = build_gnn_mlp_model(gnn_input_dim, mlp_input_dim)
gnn_mlp_train_gen = GNNDropoutGenerator(
    coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
)
gnn_mlp_model.fit(gnn_mlp_train_gen, epochs=1, verbose=1, callbacks=[early_stopping], validation_data=gnn_mlp_train_gen)

print("\n--- Training CNN-GNN Base Model ---")
cnn_gnn_model = build_cnn_gnn_model(cnn_patch_shape, gnn_input_dim)
cnn_gnn_train_gen = MLPDropoutGenerator(
    coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
)
cnn_gnn_model.fit(cnn_gnn_train_gen, epochs=1, verbose=1, callbacks=[early_stopping], validation_data=cnn_gnn_train_gen)

# --- Generate predictions for meta-learner ---
# Get predictions from base models on training data
preds1_train = get_base_model_predictions(cnn_mlp_model, coords_train, mlp_train, gnn_train, y_train, raster_paths, BUFFER_METERS, batch_size)
preds2_train = get_base_model_predictions(gnn_mlp_model, coords_train, mlp_train, gnn_train, y_train, raster_paths, BUFFER_METERS, batch_size)
preds3_train = get_base_model_predictions(cnn_gnn_model, coords_train, mlp_train, gnn_train, y_train, raster_paths, BUFFER_METERS, batch_size)

meta_train_inputs = (preds1_train.reshape(-1, 1), preds2_train.reshape(-1, 1), preds3_train.reshape(-1, 1))

# --- Train Meta-Learner ---
print("\n--- Training Meta-Learner Model ---")
meta_model = build_meta_learner_model()
meta_model.fit(meta_train_inputs, y_train, epochs=1, verbose=1, callbacks=[early_stopping], validation_split=0.2)

# --- Get predictions from base models on test data ---
preds1_test = get_base_model_predictions(cnn_mlp_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
preds2_test = get_base_model_predictions(gnn_mlp_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
preds3_test = get_base_model_predictions(cnn_gnn_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)

meta_test_inputs = (preds1_test.reshape(-1, 1), preds2_test.reshape(-1, 1), preds3_test.reshape(-1, 1))

# --- Evaluate with Meta-Learner ---
y_pred = meta_model.predict(meta_test_inputs).flatten()
r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"\n Stacked Deep Ensemble Model Performance ({BUFFER_METERS}m):")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f}")

# --- NEW: Feature Importance for Meta-Learner ---
print("\n" + "-"*50)
print(f"Meta-Learner Feature Importance (Permutation-based)")
print("-"*50)
baseline_r2 = r2_test

# Importance for CNN-MLP predictions
preds1_test_shuffled = np.copy(preds1_test)
np.random.shuffle(preds1_test_shuffled)
shuffled_test_inputs = (preds1_test_shuffled.reshape(-1, 1), preds2_test.reshape(-1, 1), preds3_test.reshape(-1, 1))
y_pred_shuffled = meta_model.predict(shuffled_test_inputs).flatten()
r2_shuffled = r2_score(y_test, y_pred_shuffled)
importance_cnn_mlp = baseline_r2 - r2_shuffled
print(f"Importance of CNN-MLP predictions (R² drop): {importance_cnn_mlp:.4f}")

# Importance for GNN-MLP predictions
preds2_test_shuffled = np.copy(preds2_test)
np.random.shuffle(preds2_test_shuffled)
shuffled_test_inputs = (preds1_test.reshape(-1, 1), preds2_test_shuffled.reshape(-1, 1), preds3_test.reshape(-1, 1))
y_pred_shuffled = meta_model.predict(shuffled_test_inputs).flatten()
r2_shuffled = r2_score(y_test, y_pred_shuffled)
importance_gnn_mlp = baseline_r2 - r2_shuffled
print(f"Importance of GNN-MLP predictions (R² drop): {importance_gnn_mlp:.4f}")

# Importance for CNN-GNN predictions
preds3_test_shuffled = np.copy(preds3_test)
np.random.shuffle(preds3_test_shuffled)
shuffled_test_inputs = (preds1_test.reshape(-1, 1), preds2_test.reshape(-1, 1), preds3_test_shuffled.reshape(-1, 1))
y_pred_shuffled = meta_model.predict(shuffled_test_inputs).flatten()
r2_shuffled = r2_score(y_test, y_pred_shuffled)
importance_cnn_gnn = baseline_r2 - r2_shuffled
print(f"Importance of CNN-GNN predictions (R² drop): {importance_cnn_gnn:.4f}")

Using 26 raster layers for CNN input.
  - bui.tif
  - ndsi.tif
  - savi.tif
  - ndbsi.tif
  - ui.tif
  - ndwi.tif
  - ndbi.tif
  - awei.tif
  - evi.tif
  - mndwi.tif
  - ndvi.tif
  - LULC2020.tif
  - LULC2021.tif
  - LULC2022.tif
  - LULC2019.tif
  - LULC2018.tif
  - LULC2017.tif
  - ClayW.tif
  - CdW.tif
  - SandW.tif
  - SiltW.tif
  - AsW.tif
  - CrW.tif
  - NiW.tif
  - PbW.tif
  - CuW.tif

Analyzing Stacked Deep Ensemble for BUFFER_METERS = 500m

--- Training CNN-MLP Base Model ---
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 196ms/step - loss: 118237.6172 - val_loss: 17236.2461

--- Training GNN-MLP Base Model ---
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 172ms/step - loss: 29643.0469 - val_loss: 32812.8906

--- Training CNN-GNN Base Model ---
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 192ms/step - loss: 23887.0762 - val_loss: 10435.9639
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m 

In [4]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input,
    Conv2D,
    MaxPooling2D,
    Flatten,
    Dense,
    Concatenate,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import sys
from io import StringIO
import pickle

# --- New Imports for Feature Importance ---
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model # Used to save and load models for a clean pipeline


# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
# NOTE: The data loading logic remains the same.
# Replace with your actual data paths if needed
orig = pd.read_csv("../../data/WinterSeason1.csv")
river_100 = pd.read_csv("../data/Samples_100W.csv")

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDWW/*.tif")

print(f"Using {len(raster_paths)} raster layers for CNN input.")
for r in raster_paths:
    print("  -", os.path.basename(r))

# ==================== 3. Create a Custom Data Generator ==================== #
def extract_patch_for_generator(coords, raster_files, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height, skip_raster_idx=None):
    """
    Extracts a batch of patches from rasters for a given set of coordinates.
    This function is optimized to be called by the data generator for each batch.
    
    Args:
        coords (np.ndarray): Array of (lon, lat) coordinates.
        raster_files (list): List of file paths to the raster layers.
        buffer_pixels_x (int): Number of pixels for x-dimension of buffer.
        buffer_pixels_y (int): Number of pixels for y-dimension of buffer.
        patch_width (int): Width of the patch in pixels.
        patch_height (int): Height of the patch in pixels.
        skip_raster_idx (int, optional): Index of the raster channel to zero out. Used for permutation importance.
    
    Returns:
        np.ndarray: A batch of image patches.
    """
    patches = []
    # Loop through each coordinate pair in the batch
    for lon, lat in coords:
        channels = []
        # Loop through each raster file to get a single patch for each raster
        for i, rfile in enumerate(raster_files):
            # Check if this raster channel should be zeroed out
            if i == skip_raster_idx:
                arr = np.zeros((patch_width, patch_height), dtype=np.float32)
            else:
                try:
                    with rasterio.open(rfile) as src:
                        row, col = src.index(lon, lat)
                        win = Window(col - buffer_pixels_x, row - buffer_pixels_y, patch_width, patch_height)
                        arr = src.read(1, window=win, boundless=True, fill_value=0)
                        arr = arr.astype(np.float32)

                        # Normalize the array if it has non-zero values
                        if np.nanmax(arr) != 0:
                            arr /= np.nanmax(arr)
                except Exception as e:
                    print(f"Error processing {rfile} for coordinates ({lon}, {lat}): {e}")
                    arr = np.zeros((patch_width, patch_height), dtype=np.float32)
            channels.append(arr)
        patches.append(np.stack(channels, axis=-1))
    
    return np.array(patches)

class DataGenerator(Sequence):
    def __init__(self, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.coords = coords
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.raster_paths = raster_paths
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.buffer_meters = buffer_meters
        
        # Pre-calculate patch size from the first raster
        with rasterio.open(raster_paths[0]) as src:
            res_x, res_y = src.res
            self.buffer_pixels_x = int(self.buffer_meters / res_x)
            self.buffer_pixels_y = int(self.buffer_meters / res_y)
            self.patch_width = 2 * self.buffer_pixels_x
            self.patch_height = 2 * self.buffer_pixels_y

        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_coords = self.coords[batch_indices]
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]

        # Extract CNN patches for the current batch
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            self.raster_paths,
            self.buffer_pixels_x,
            self.buffer_pixels_y,
            self.patch_width,
            self.patch_height
        )

        return (batch_cnn, batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long','Lat']].values
coords_test = test_orig[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define Base Models ==================== #
def build_cnn_mlp_model(patch_shape, mlp_dim):
    cnn_input = Input(shape=patch_shape, name="cnn_input")
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")

    # CNN branch
    cnn_branch = Conv2D(32, (3,3), activation="relu", padding="same")(cnn_input)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_branch = Conv2D(64, (3,3), activation="relu", padding="same")(cnn_branch)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_embedding = Flatten()(cnn_branch)

    # MLP branch
    mlp_embedding = Dense(64, activation="relu")(mlp_input)
    mlp_embedding = Dense(32, activation="relu")(mlp_embedding)

    # Combine
    combined = Concatenate()([cnn_embedding, mlp_embedding])
    f = Dense(128, activation="relu")(combined)
    output = Dense(1, activation="linear", name="cnn_mlp_output")(f)
    
    model = Model(inputs=[cnn_input, mlp_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def build_gnn_mlp_model(gnn_dim, mlp_dim):
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")

    # GNN branch
    gnn_embedding = Dense(64, activation="relu")(gnn_input)
    gnn_embedding = Dense(32, activation="relu")(gnn_embedding)

    # MLP branch
    mlp_embedding = Dense(64, activation="relu")(mlp_input)
    mlp_embedding = Dense(32, activation="relu")(mlp_embedding)

    # Combine
    combined = Concatenate()([gnn_embedding, mlp_embedding])
    f = Dense(64, activation="relu")(combined)
    output = Dense(1, activation="linear", name="gnn_mlp_output")(f)
    
    model = Model(inputs=[gnn_input, mlp_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def build_cnn_gnn_model(patch_shape, gnn_dim):
    cnn_input = Input(shape=patch_shape, name="cnn_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")

    # CNN branch
    cnn_branch = Conv2D(32, (3,3), activation="relu", padding="same")(cnn_input)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_branch = Conv2D(64, (3,3), activation="relu", padding="same")(cnn_branch)
    cnn_branch = MaxPooling2D((2,2))(cnn_branch)
    cnn_embedding = Flatten()(cnn_branch)
    
    # GNN branch
    gnn_embedding = Dense(64, activation="relu")(gnn_input)
    gnn_embedding = Dense(32, activation="relu")(gnn_embedding)

    # Combine
    combined = Concatenate()([cnn_embedding, gnn_embedding])
    f = Dense(128, activation="relu")(combined)
    output = Dense(1, activation="linear", name="cnn_gnn_output")(f)
    
    model = Model(inputs=[cnn_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def build_meta_learner_model():
    # Takes predictions from the 3 base models as input
    pred1_input = Input(shape=(1,), name="pred1_input")
    pred2_input = Input(shape=(1,), name="pred2_input")
    pred3_input = Input(shape=(1,), name="pred3_input")

    # Concatenate the predictions
    combined = Concatenate()([pred1_input, pred2_input, pred3_input])
    
    # Simple MLP as the meta-learner
    f = Dense(32, activation="relu")(combined)
    f = Dense(16, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)
    
    model = Model(inputs=[pred1_input, pred2_input, pred3_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

# ==================== 6. Create Data Generators for Base Models ==================== #
# NOTE: We create generators that provide only the necessary inputs for each base model.
class CNNDropoutGenerator(DataGenerator):
    def __getitem__(self, index):
        (batch_cnn, batch_mlp, batch_gnn), batch_y = super().__getitem__(index)
        return (batch_cnn, batch_mlp), batch_y

class GNNDropoutGenerator(DataGenerator):
    def __getitem__(self, index):
        (batch_cnn, batch_mlp, batch_gnn), batch_y = super().__getitem__(index)
        return (batch_gnn, batch_mlp), batch_y

class MLPDropoutGenerator(DataGenerator):
    def __getitem__(self, index):
        (batch_cnn, batch_mlp, batch_gnn), batch_y = super().__getitem__(index)
        return (batch_cnn, batch_gnn), batch_y

def get_base_model_predictions(model, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size, skip_raster_idx=None):
    """
    Generates predictions from a base model using a generator-like approach.
    Now includes an option to skip a raster channel for permutation importance.
    """
    num_samples = len(y)
    y_pred_list = []
    
    # Re-calculate these values locally to ensure they are available
    with rasterio.open(raster_paths[0]) as src:
        res_x, res_y = src.res
        buffer_pixels_x = int(buffer_meters / res_x)
        buffer_pixels_y = int(buffer_meters / res_y)
        patch_width = 2 * buffer_pixels_x
        patch_height = 2 * buffer_pixels_y

    for i in range(0, num_samples, batch_size):
        batch_coords = coords[i:i+batch_size]
        batch_mlp = mlp_data[i:i+batch_size]
        batch_gnn = gnn_data[i:i+batch_size, :]
        
        batch_cnn = extract_patch_for_generator(
            batch_coords, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height, skip_raster_idx=skip_raster_idx
        )
        
        input_names = [inp.name for inp in model.inputs]
        input_dict = {}
        if 'cnn_input' in input_names:
            input_dict['cnn_input'] = batch_cnn
        if 'mlp_input' in input_names:
            input_dict['mlp_input'] = batch_mlp
        if 'gnn_input' in input_names:
            input_dict['gnn_input'] = batch_gnn
            
        y_pred_list.append(model.predict(input_dict, verbose=0).flatten())
            
    return np.concatenate(y_pred_list)

def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (sMAPE).
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

def get_full_model_predictions(base_models, meta_model, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size):
    """
    A helper function to get predictions from the full stacked ensemble model.
    """
    preds1_test = get_base_model_predictions(base_models['cnn_mlp'], coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size)
    preds2_test = get_base_model_predictions(base_models['gnn_mlp'], coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size)
    preds3_test = get_base_model_predictions(base_models['cnn_gnn'], coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size)
    
    meta_test_inputs = (preds1_test.reshape(-1, 1), preds2_test.reshape(-1, 1), preds3_test.reshape(-1, 1))
    
    return meta_model.predict(meta_test_inputs, verbose=0).flatten()

# ==================== NEW: Global Variables for LIME ====================
# These must be defined before the LIME wrapper function is defined.
with rasterio.open(raster_paths[0]) as src:
    res_x, res_y = src.res
    buffer_pixels_x = int(BUFFER_METERS / res_x)
    buffer_pixels_y = int(BUFFER_METERS / res_y)
    patch_width = 2 * buffer_pixels_x
    patch_height = 2 * buffer_pixels_y
    cnn_patch_shape = (patch_width, patch_height, len(raster_paths))
mlp_input_dim = mlp_train.shape[1]

# LIME requires a single, flat input. We'll explain the MLP features.
# A wrapper function to handle the complex model prediction for LIME
def predict_fn_for_lime(lime_data, sample_idx=0):
    """
    A wrapper function that takes LIME's perturbed data, reconstructs the
    inputs for the full model, and returns a prediction.
    It holds CNN and GNN inputs constant for a single sample while LIME perturbs MLP features.
    """
    # Create the full data inputs for the model for each LIME perturbation
    # We use a fixed CNN and GNN input, only varying the MLP features
    num_samples = lime_data.shape[0]
    
    # Get a single CNN and GNN input from the test set to use for all perturbations
    single_cnn_patch = extract_patch_for_generator(coords_test[sample_idx:sample_idx+1], raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height)
    single_gnn_input = gnn_test[sample_idx:sample_idx+1]

    # Replicate the fixed inputs to match the number of perturbations
    cnn_input_batch = np.tile(single_cnn_patch, (num_samples, 1, 1, 1))
    gnn_input_batch = np.tile(single_gnn_input, (num_samples, 1))
    
    # Predict with base models
    preds1 = cnn_mlp_model.predict({'cnn_input': cnn_input_batch, 'mlp_input': lime_data}, verbose=0)
    preds2 = gnn_mlp_model.predict({'gnn_input': gnn_input_batch, 'mlp_input': lime_data}, verbose=0)
    preds3 = cnn_gnn_model.predict({'cnn_input': cnn_input_batch, 'gnn_input': gnn_input_batch}, verbose=0)
    
    meta_inputs = (preds1, preds2, preds3)
    final_pred = meta_model.predict(meta_inputs, verbose=0)
    return final_pred.flatten()


# Main execution block
print("\n" + "="*80)
print(f"Analyzing Stacked Deep Ensemble for BUFFER_METERS = {BUFFER_METERS}m")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)

# --- Train Base Models ---
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

print("\n--- Training CNN-MLP Base Model ---")
cnn_mlp_model = build_cnn_mlp_model(cnn_patch_shape, mlp_input_dim)
cnn_mlp_train_gen = CNNDropoutGenerator(
    coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
)
cnn_mlp_model.fit(cnn_mlp_train_gen, epochs=100, verbose=1, callbacks=[early_stopping], validation_data=cnn_mlp_train_gen)

print("\n--- Training GNN-MLP Base Model ---")
gnn_mlp_model = build_gnn_mlp_model(gnn_input_dim, mlp_input_dim)
gnn_mlp_train_gen = GNNDropoutGenerator(
    coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
)
gnn_mlp_model.fit(gnn_mlp_train_gen, epochs=100, verbose=1, callbacks=[early_stopping], validation_data=gnn_mlp_train_gen)

print("\n--- Training CNN-GNN Base Model ---")
cnn_gnn_model = build_cnn_gnn_model(cnn_patch_shape, gnn_input_dim)
cnn_gnn_train_gen = MLPDropoutGenerator(
    coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
)
cnn_gnn_model.fit(cnn_gnn_train_gen, epochs=100, verbose=1, callbacks=[early_stopping], validation_data=cnn_gnn_train_gen)

# --- Generate predictions for meta-learner ---
preds1_train = get_base_model_predictions(cnn_mlp_model, coords_train, mlp_train, gnn_train, y_train, raster_paths, BUFFER_METERS, batch_size)
preds2_train = get_base_model_predictions(gnn_mlp_model, coords_train, mlp_train, gnn_train, y_train, raster_paths, BUFFER_METERS, batch_size)
preds3_train = get_base_model_predictions(cnn_gnn_model, coords_train, mlp_train, gnn_train, y_train, raster_paths, BUFFER_METERS, batch_size)
meta_train_inputs = (preds1_train.reshape(-1, 1), preds2_train.reshape(-1, 1), preds3_train.reshape(-1, 1))

# --- Train Meta-Learner ---
print("\n--- Training Meta-Learner Model ---")
meta_model = build_meta_learner_model()
meta_model.fit(meta_train_inputs, y_train, epochs=100, verbose=1, callbacks=[early_stopping], validation_split=0.2)

# Save models to disk to ensure they are available for LIME explainer
# This is a robust practice to avoid generator/thread issues
cnn_mlp_model.save('cnn_mlp_model.keras')
gnn_mlp_model.save('gnn_mlp_model.keras')
cnn_gnn_model.save('cnn_gnn_model.keras')
meta_model.save('meta_model.keras')

del cnn_mlp_model, gnn_mlp_model, cnn_gnn_model, meta_model
gc.collect()

# Reload the models for a clean evaluation environment
cnn_mlp_model = load_model('cnn_mlp_model.keras')
gnn_mlp_model = load_model('gnn_mlp_model.keras')
cnn_gnn_model = load_model('cnn_gnn_model.keras')
meta_model = load_model('meta_model.keras')

base_models = {'cnn_mlp': cnn_mlp_model, 'gnn_mlp': gnn_mlp_model, 'cnn_gnn': cnn_gnn_model}

# --- Evaluate with Meta-Learner ---
y_pred = get_full_model_predictions(base_models, meta_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
smape_test = smape(y_test, y_pred)

print(f"\n Stacked Deep Ensemble Model Performance ({BUFFER_METERS}m):")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f}")
print(f"MAE Test: {mae_test:.4f} | sMAPE Test: {smape_test:.4f}%")

Using 26 raster layers for CNN input.
  - bui.tif
  - ndsi.tif
  - savi.tif
  - ndbsi.tif
  - ui.tif
  - ndwi.tif
  - ndbi.tif
  - awei.tif
  - evi.tif
  - mndwi.tif
  - ndvi.tif
  - LULC2020.tif
  - LULC2021.tif
  - LULC2022.tif
  - LULC2019.tif
  - LULC2018.tif
  - LULC2017.tif
  - ClayW.tif
  - CdW.tif
  - SandW.tif
  - SiltW.tif
  - AsW.tif
  - CrW.tif
  - NiW.tif
  - PbW.tif
  - CuW.tif

Analyzing Stacked Deep Ensemble for BUFFER_METERS = 500m

--- Training CNN-MLP Base Model ---
Epoch 1/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 201ms/step - loss: 45272.9375 - val_loss: 12972.9746
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 208ms/step - loss: 10761.8857 - val_loss: 7885.7622
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 193ms/step - loss: 5899.0371 - val_loss: 5576.6270
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 204ms/step - loss: 5391.0962 - val_loss: 5643.5581

# AlphaEarth Integration Enabled

This notebook has been enhanced with AlphaEarth satellite embeddings.

## Integration Options:
- **Option A**: Replace indices with AlphaEarth (64 bands)
- **Option B**: Add AlphaEarth to features (RECOMMENDED)
- **Option C**: PCA-reduced AlphaEarth (20 components)
- **Option D**: MLP enhancement only

Expected improvement: +0.5% to +0.8% in R²

In [None]:
# ==================== ALPHAEARTH CONFIGURATION ====================import pandas as pdimport numpy as npimport os# Select which AlphaEarth option to useALPHA_EARTH_OPTION = 'B'  # Options: A, B (recommended), C, DUSE_ALPHA_EARTH = True# Paths to AlphaEarth data files (created by 00_AlphaEarth_Data_Preparation.ipynb)option_file = f'Option_{ALPHA_EARTH_OPTION}_RainyAE.csv'  # or WinterAE# Load AlphaEarth dataif os.path.exists(option_file):    ae_data = pd.read_csv(option_file)    print(f'Loaded AlphaEarth Option {ALPHA_EARTH_OPTION}')    print(f'Shape: {ae_data.shape}')else:    print(f'WARNING: {option_file} not found')    print('Please run 00_AlphaEarth_Data_Preparation.ipynb first')    USE_ALPHA_EARTH = False

In [5]:
# ==================== NEW: Feature Importance Analysis ==================== #
print("\n" + "="*80)
print(f"Feature Importance (Permutation-based & LIME)")
print("="*80)

# ----------------- 1. Permutation-based Feature Importance -----------------
# Permutation importance for the final meta-learner predictions
print("\n--- Permutation Importance for Ensemble Inputs ---")
baseline_r2 = r2_test

# Importance of CNN-MLP predictions
preds1_test = get_base_model_predictions(cnn_mlp_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
preds2_test = get_base_model_predictions(gnn_mlp_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
preds3_test = get_base_model_predictions(cnn_gnn_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
preds1_test_shuffled = np.copy(preds1_test)
np.random.shuffle(preds1_test_shuffled)
shuffled_test_inputs = (preds1_test_shuffled.reshape(-1, 1), preds2_test.reshape(-1, 1), preds3_test.reshape(-1, 1))
y_pred_shuffled = meta_model.predict(shuffled_test_inputs, verbose=0).flatten()
r2_shuffled = r2_score(y_test, y_pred_shuffled)
importance_cnn_mlp = baseline_r2 - r2_shuffled
print(f"Importance of CNN-MLP predictions (R² drop): {importance_cnn_mlp:.4f}")

# Importance of GNN-MLP predictions
preds2_test_shuffled = np.copy(preds2_test)
np.random.shuffle(preds2_test_shuffled)
shuffled_test_inputs = (preds1_test.reshape(-1, 1), preds2_test_shuffled.reshape(-1, 1), preds3_test.reshape(-1, 1))
y_pred_shuffled = meta_model.predict(shuffled_test_inputs, verbose=0).flatten()
r2_shuffled = r2_score(y_test, y_pred_shuffled)
importance_gnn_mlp = baseline_r2 - r2_shuffled
print(f"Importance of GNN-MLP predictions (R² drop): {importance_gnn_mlp:.4f}")

# Importance of CNN-GNN predictions
preds3_test_shuffled = np.copy(preds3_test)
np.random.shuffle(preds3_test_shuffled)
shuffled_test_inputs = (preds1_test.reshape(-1, 1), preds2_test.reshape(-1, 1), preds3_test_shuffled.reshape(-1, 1))
y_pred_shuffled = meta_model.predict(shuffled_test_inputs, verbose=0).flatten()
r2_shuffled = r2_score(y_test, y_pred_shuffled)
importance_cnn_gnn = baseline_r2 - r2_shuffled
print(f"Importance of CNN-GNN predictions (R² drop): {importance_cnn_gnn:.4f}")

# Permutation importance for original features (MLP and Rasters)
print("\n--- Permutation Importance for Original Features ---")

# MLP Feature Importance
mlp_importance = {}
mlp_test_copy = mlp_test.copy()
for i, col in enumerate(numeric_cols):
    mlp_test_shuffled = mlp_test_copy.copy()
    np.random.shuffle(mlp_test_shuffled[:, i])
    
    # Calculate predictions with the shuffled feature
    preds1_test_shuffled = get_base_model_predictions(cnn_mlp_model, coords_test, mlp_test_shuffled, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
    preds2_test_shuffled = get_base_model_predictions(gnn_mlp_model, coords_test, mlp_test_shuffled, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
    preds3_test_shuffled = get_base_model_predictions(cnn_gnn_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)

    meta_test_inputs_shuffled = (preds1_test_shuffled.reshape(-1, 1), preds2_test_shuffled.reshape(-1, 1), preds3_test_shuffled.reshape(-1, 1))
    y_pred_shuffled = meta_model.predict(meta_test_inputs_shuffled, verbose=0).flatten()
    r2_shuffled = r2_score(y_test, y_pred_shuffled)
    importance = baseline_r2 - r2_shuffled
    mlp_importance[col] = importance
    print(f"Importance of MLP feature '{col}' (R² drop): {importance:.4f}")

# Raster Layer Importance (CNN)
raster_importance = {}
print("\n--- Permutation Importance for Raster Layers ---")
for i, r_path in enumerate(raster_paths):
    # Calculate predictions by zeroing out one raster channel
    preds1_test_shuffled = get_base_model_predictions(cnn_mlp_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size, skip_raster_idx=i)
    preds2_test_shuffled = get_base_model_predictions(gnn_mlp_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size)
    preds3_test_shuffled = get_base_model_predictions(cnn_gnn_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, batch_size, skip_raster_idx=i)
    
    meta_test_inputs_shuffled = (preds1_test_shuffled.reshape(-1, 1), preds2_test_shuffled.reshape(-1, 1), preds3_test_shuffled.reshape(-1, 1))
    y_pred_shuffled = meta_model.predict(meta_test_inputs_shuffled, verbose=0).flatten()
    r2_shuffled = r2_score(y_test, y_pred_shuffled)
    importance = baseline_r2 - r2_shuffled
    raster_name = os.path.basename(r_path)
    raster_importance[raster_name] = importance
    print(f"Importance of Raster '{raster_name}' (R² drop): {importance:.4f}")

# ----------------- 2. LIME Feature Importance (Top 10) -----------------
print("\n--- LIME Feature Importance (Top 10) ---")

# Prepare data and explainer for LIME
# The feature names are the numeric columns from the MLP branch
feature_names = list(numeric_cols)
# The data for the explainer is the MLP test data
lime_data = mlp_test
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=lime_data,
    feature_names=feature_names,
    class_names=['RI'],
    mode='regression',
    discretize_continuous=False
)

# Explain a single random sample from the test set
random_idx = np.random.randint(0, len(y_test))
print(f"Explaining prediction for sample index {random_idx}:")
exp = explainer.explain_instance(
    data_row=lime_data[random_idx],
    predict_fn=predict_fn_for_lime,
    num_features=20
)

# Print the top 10 features and their importance
print("Top 10 features for this prediction:")
for feature, weight in exp.as_list():
    print(f" - {feature}: {weight:.4f}")

# To visualize the explanation (optional, requires a display)
# exp.show_in_notebook(show_table=True, show_all=False)



Feature Importance (Permutation-based & LIME)

--- Permutation Importance for Ensemble Inputs ---
Importance of CNN-MLP predictions (R² drop): 0.3140
Importance of GNN-MLP predictions (R² drop): 0.0532
Importance of CNN-GNN predictions (R² drop): 0.7711

--- Permutation Importance for Original Features ---
Importance of MLP feature 'hydro_dist_brick' (R² drop): 0.0000
Importance of MLP feature 'num_brick_field' (R² drop): -0.0043
Importance of MLP feature 'hydro_dist_ind' (R² drop): -0.0000
Importance of MLP feature 'num_industry' (R² drop): 0.0079
Importance of MLP feature 'CrW' (R² drop): -0.0002
Importance of MLP feature 'NiW' (R² drop): 0.0059
Importance of MLP feature 'CuW' (R² drop): 0.0070
Importance of MLP feature 'AsW' (R² drop): 0.0000
Importance of MLP feature 'CdW' (R² drop): -0.0078
Importance of MLP feature 'PbW' (R² drop): -0.0038
Importance of MLP feature 'MW' (R² drop): 0.0069
Importance of MLP feature 'SandW' (R² drop): 0.0093
Importance of MLP feature 'SiltW' (R² dr