In [None]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout, Layer, MultiHeadAttention, LayerNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import sys
from io import StringIO
import pickle # Import the pickle library for saving objects

# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
orig = pd.read_csv("../../data/RainySeason.csv")
river_100 = pd.read_csv("../data/Samples_100.csv")

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
# We are not using rasters in this GNN-MLP model, but the paths are still
# defined for consistency with previous versions.
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")

print("Note: Raster data is not used in this GNN-MLP model.")

# ==================== 3. Create a Custom Data Generator ==================== #
class DataGenerator(Sequence):
    def __init__(self, mlp_data, gnn_data, y, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]
        
        return (batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long','Lat']].values
coords_test = test_orig[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define GNN-MLP Fusion Model ==================== #
def build_gnn_mlp_model(mlp_dim, gnn_dim):
    # Inputs for all branches
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    
    # --- MLP Branch ---
    mlp_embedding = Dense(128, activation="relu")(mlp_input)
    mlp_embedding = Dense(64, activation="relu", name="mlp_embedding")(mlp_embedding)

    # --- GNN Branch ---
    gnn_embedding = Dense(128, activation="relu")(gnn_input)
    gnn_embedding = Dense(64, activation="relu", name="gnn_embedding")(gnn_embedding)

    # --- Concatenate Embeddings ---
    combined = Concatenate()([mlp_embedding, gnn_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    # Build and compile the model
    model = Model(inputs=[mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def evaluate_model(model, mlp_test, gnn_test_matrix, y_test, return_preds=False):
    """
    Evaluates the model on given data and returns R², RMSE, and predictions.
    """
    y_pred = model.predict((mlp_test, gnn_test_matrix)).flatten()
    
    if return_preds:
        return y_pred
    else:
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        return r2, rmse

def calculate_permutation_importance(model, mlp_data, gnn_data, y_true):
    """
    Calculates permutation feature importance for the MLP and GNN branches.
    """
    print("\nStarting Permutation Feature Importance Analysis...")
    # Get baseline R² on the unshuffled data
    baseline_r2, _ = evaluate_model(model, mlp_data, gnn_data, y_true)
    print(f"Baseline R² on test set: {baseline_r2:.4f}")

    importance = {}
    
    # Permute MLP input
    shuffled_mlp_data = mlp_data.copy()
    np.random.shuffle(shuffled_mlp_data)
    shuffled_r2, _ = evaluate_model(model, shuffled_mlp_data, gnn_data, y_true)
    importance['MLP'] = baseline_r2 - shuffled_r2

    # Permute GNN input
    shuffled_gnn_data = gnn_data.copy()
    np.random.shuffle(shuffled_gnn_data)
    shuffled_r2, _ = evaluate_model(model, mlp_data, shuffled_gnn_data, y_true)
    importance['GNN'] = baseline_r2 - shuffled_r2

    return importance
        
# ==================== Run the Analysis ==================== #
# Redirect output to a string for later saving
old_stdout = sys.stdout
sys.stdout = captured_output = StringIO()

print("\n" + "="*80)
print(f"Analyzing GNN-MLP Fusion Model")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)
mlp_input_dim = mlp_train.shape[1]

model = build_gnn_mlp_model(mlp_input_dim, gnn_input_dim)
model.summary()

# ==================== 6. Create Data Generators ==================== #
train_generator = DataGenerator(
    mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    batch_size=batch_size, shuffle=True
)

# ==================== 7. Train Model ==================== #
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=train_generator
)

# ==================== 8. Evaluate & Perform Feature Importance ==================== #
# Predict on the training data using the generator
y_pred_train = model.predict(train_generator).flatten()
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))

# Evaluate on the test data using the updated function
r2_test, rmse_test = evaluate_model(model, mlp_test, gnn_test, y_test)
y_pred_test = evaluate_model(model, mlp_test, gnn_test, y_test, return_preds=True)

print(f"\n GNN-MLP Fusion Model Performance:")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f}")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f}")

# Calculate and print feature importance
feature_importance = calculate_permutation_importance(model, mlp_test, gnn_test, y_test)
print("\n--- Feature Importance (Permutation) ---")
sorted_importance = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
for feature, score in sorted_importance:
    print(f"{feature}: {score:.4f}")

# ==================== 9. Save all info to a folder ==================== #
# Restore standard output
sys.stdout = old_stdout
printed_output = captured_output.getvalue()

output_folder = "gnn_mlp"
os.makedirs(output_folder, exist_ok=True)
print(f"\nCreating folder: '{output_folder}' and saving results...")

# Save the model
model_path = os.path.join(output_folder, "gnn_mlp_model.keras")
model.save(model_path)
print(f"Model saved to: {model_path}")

# Save the predictions and true labels
np.save(os.path.join(output_folder, "y_train.npy"), y_train)
np.save(os.path.join(output_folder, "y_test.npy"), y_test)
np.save(os.path.join(output_folder, "y_pred_train.npy"), y_pred_train)
np.save(os.path.join(output_folder, "y_pred_test.npy"), y_pred_test)
print(f"Predictions and true labels saved as .npy files.")

# Save the printed output to a text file
output_path = os.path.join(output_folder, "analysis_output.txt")
with open(output_path, "w") as f:
    f.write(printed_output)
print(f"Analysis results saved to: {output_path}")

# Save the feature importance dictionary as a .pkl file
importance_path = os.path.join(output_folder, "feature_importance.pkl")
with open(importance_path, 'wb') as f:
    pickle.dump(feature_importance, f)
print(f"Feature importance results saved to: {importance_path}")

print("\nAll information successfully saved.")

# Garbage collect to free up memory now that everything is saved
del model, history, train_generator
gc.collect()


# AlphaEarth Integration Enabled

This notebook has been enhanced with AlphaEarth satellite embeddings.

## Integration Options:
- **Option A**: Replace indices with AlphaEarth (64 bands)
- **Option B**: Add AlphaEarth to features (RECOMMENDED)
- **Option C**: PCA-reduced AlphaEarth (20 components)
- **Option D**: MLP enhancement only

Expected improvement: +0.5% to +0.8% in R²

In [None]:
# ==================== ALPHAEARTH CONFIGURATION ====================
import pandas as pd
import numpy as np
import os

# Select which AlphaEarth option to use
ALPHA_EARTH_OPTION = 'B'  # Options: A, B (recommended), C, D
USE_ALPHA_EARTH = True

# Paths to AlphaEarth data files (created by 00_AlphaEarth_Data_Preparation.ipynb)
option_file = f'Option_{ALPHA_EARTH_OPTION}_RainyAE.csv'  # or WinterAE

# Load AlphaEarth data
if os.path.exists(option_file):
    ae_data = pd.read_csv(option_file)
    print(f'Loaded AlphaEarth Option {ALPHA_EARTH_OPTION}')
    print(f'Shape: {ae_data.shape}')
else:
    print(f'WARNING: {option_file} not found')
    print('Please run 00_AlphaEarth_Data_Preparation.ipynb first')
    USE_ALPHA_EARTH = False


In [6]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout, Layer, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import sys
import pickle # Import the pickle library for saving objects
# Set a consistent seed for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
# ==================== 1. Load Data ==================== #
# NOTE: This script assumes the following file paths are correct.
try:
    orig = pd.read_csv("../../data/RainySeason.csv")
    river_100 = pd.read_csv("../data/Samples_100.csv")
except FileNotFoundError as e:
    print(f"Error: Required data file not found. Please check your file paths.")
    print(f"Details: {e}")
    sys.exit()
drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')
# ==================== 2. Collect ALL Rasters and Metadata ==================== #
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")
# Get the pixel resolution from the first raster to set a uniform patch size
try:
    with rasterio.open(raster_paths[0]) as src:
        pixel_size = src.transform.a
except IndexError:
    print("Error: No raster files found in the specified directories.")
    sys.exit()
# Create a dictionary to store raster metadata for fast access
raster_metadata = {}
for path in raster_paths:
    with rasterio.open(path) as src:
        raster_metadata[path] = {
            'transform': src.transform,
            'crs': src.crs,
            'width': src.width,
            'height': src.height
        }
# ==================== 3. Define a Custom Data Generator ==================== #
class DataGenerator(Sequence):
    """
    Custom Keras Sequence for generating batches of data.
    Handles three different input types: MLP features, GNN features,
    and raster image patches, loading rasters on-the-fly to save memory.
    """
    def __init__(self, mlp_data, gnn_data, y, coords, raster_paths, buffer_radius_m, pixel_size, batch_size=4, shuffle=True):
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.coords = coords
        self.raster_paths = raster_paths
        # Calculate the uniform patch size in pixels based on the buffer radius and pixel size
        # We need a square patch, so the size is 2 * radius / pixel_size
        self.patch_size = int(round((2 * buffer_radius_m) / pixel_size))
        # Ensure patch size is at least 1 and is an even number for easy centering
        if self.patch_size % 2 != 0:
            self.patch_size += 1
        self.patch_size = max(self.patch_size, 2)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.on_epoch_end()
    
    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))
        
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def get_raster_patches(self, coords_batch):
        """
        Extracts a patch of raster data for each coordinate in the batch.
        Loads rasters on-the-fly to save memory and robustly handles boundaries.
        """
        patches_for_rasters = []
        for path in self.raster_paths:
            patches_for_this_raster = []
            try:
                with rasterio.open(path) as src:
                    for lon, lat in coords_batch:
                        # Get pixel coordinates
                        row, col = src.index(lon, lat)
                        
                        # Define a window to read around the pixel, handling boundaries
                        half_patch = self.patch_size // 2
                        left = int(col - half_patch)
                        top = int(row - half_patch)
                        right = int(col + half_patch)
                        bottom = int(row + half_patch)
                        # Create a new, empty array for the final padded patch
                        padded_patch = np.zeros((self.patch_size, self.patch_size), dtype='float32')
                        # Calculate the window in the raster's coordinate space to read from
                        # And the offset in the padded_patch to write to
                        read_left = max(0, left)
                        read_top = max(0, top)
                        read_right = min(src.width, right)
                        read_bottom = min(src.height, bottom)
                        # Check if the calculated window has a valid size
                        read_width = read_right - read_left
                        read_height = read_bottom - read_top
                        
                        if read_width > 0 and read_height > 0:
                            write_left = read_left - left
                            write_top = read_top - top
                            write_right = write_left + read_width
                            write_bottom = write_top + read_height
                            # Create the window object for rasterio to read from
                            window = Window(read_left, read_top, read_width, read_height)
                            # Read the data from the raster
                            patch_data = src.read(1, window=window)
                            # Place the read data into the padded patch
                            padded_patch[write_top:write_bottom, write_left:write_right] = patch_data
                        
                        patches_for_this_raster.append(padded_patch)
            
                # Stack the patches for this raster
                patches_for_rasters.append(np.stack(patches_for_this_raster, axis=0))
            except Exception as e:
                # This handles cases where a raster file might be missing or corrupted
                patches_for_rasters.append(np.zeros((len(coords_batch), self.patch_size, self.patch_size), dtype='float32'))
        # Stack all raster patches together
        final_patches = np.stack(patches_for_rasters, axis=-1)
        return final_patches
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        # Get batch data
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]
        batch_coords = self.coords[batch_indices]
        
        # Get raster data for the current batch
        batch_rasters = self.get_raster_patches(batch_coords)
        
        # Return a dictionary of inputs and the output
        return {"mlp_input": batch_mlp, "gnn_input": batch_gnn, "raster_input": batch_rasters}, batch_y
# ==================== 4. Define GNN-MLP-Raster Fusion Model ==================== #
def build_fusion_model(mlp_dim, gnn_dim, raster_patch_size, num_rasters):
    """
    Builds the multi-input Keras model with branches for MLP, GNN, and Rasters.
    """
    # Inputs for all branches
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    raster_input = Input(shape=(raster_patch_size, raster_patch_size, num_rasters), name="raster_input")
    # --- MLP Branch ---
    mlp_embedding = Dense(128, activation="relu")(mlp_input)
    mlp_embedding = Dense(64, activation="relu", name="mlp_embedding")(mlp_embedding)
    # --- GNN Branch ---
    gnn_embedding = Dense(128, activation="relu")(gnn_input)
    gnn_embedding = Dense(64, activation="relu", name="gnn_embedding")(gnn_embedding)
    
    # --- Raster Branch (using a simple CNN) ---
    raster_conv = Conv2D(32, (3, 3), activation="relu")(raster_input)
    raster_pool = MaxPooling2D((2, 2))(raster_conv)
    raster_conv = Conv2D(64, (3, 3), activation="relu")(raster_pool)
    raster_pool = MaxPooling2D((2, 2))(raster_conv)
    raster_flatten = Flatten()(raster_pool)
    raster_embedding = Dense(64, activation="relu", name="raster_embedding")(raster_flatten)
    # --- Concatenate Embeddings ---
    combined = Concatenate()([mlp_embedding, gnn_embedding, raster_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)
    # Build and compile the model
    model = Model(inputs=[mlp_input, gnn_input, raster_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model
# ==================== 5. Define Evaluation & Importance Functions ==================== #
def calculate_smape(y_true, y_pred):
    """Calculates Symmetric Mean Absolute Percentage Error (SMAPE)."""
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Avoid division by zero
    mask = denominator == 0
    smape_val = np.where(mask, 0, numerator / denominator)
    return 100 * np.mean(smape_val)
def evaluate_model(model, data_inputs, y_test, return_preds=False):
    """
    Evaluates the model on given data and returns R², RMSE, MAE, and SMAPE.
    Handles both Keras Generators and direct numpy arrays.
    """
    if isinstance(data_inputs, DataGenerator):
        y_pred = model.predict(data_inputs, verbose=0).flatten()
    else:
        y_pred = model.predict(data_inputs, verbose=0).flatten()
    
    if return_preds:
        return y_pred
    else:
        # Align true labels with predictions if using a generator
        y_true_aligned = y_test[:len(y_pred)]
        r2 = r2_score(y_true_aligned, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true_aligned, y_pred))
        mae = mean_absolute_error(y_true_aligned, y_pred)
        smape = calculate_smape(y_true_aligned, y_pred)
        return r2, rmse, mae, smape
def calculate_permutation_importance(model, mlp_data, gnn_data, raster_data, y_true, mlp_features, raster_features):
    """
    Calculates permutation feature importance for all individual features.
    """
    print("\nStarting Permutation Feature Importance Analysis...")
    
    # Create the combined input for the model
    initial_inputs = {"mlp_input": mlp_data, "gnn_input": gnn_data, "raster_input": raster_data}
    
    # Get baseline R² on the unshuffled data
    baseline_r2, _, _, _ = evaluate_model(model, initial_inputs, y_true)
    print(f"Baseline R²: {baseline_r2:.4f}")
    
    importance = {}
    
    # 1. Permute individual MLP features
    print("Permuting MLP features...")
    for i, feature in enumerate(mlp_features):
        shuffled_mlp_data = mlp_data.copy()
        np.random.shuffle(shuffled_mlp_data[:, i])
        shuffled_inputs = {"mlp_input": shuffled_mlp_data, "gnn_input": gnn_data, "raster_input": raster_data}
        shuffled_r2, _, _, _ = evaluate_model(model, shuffled_inputs, y_true)
        importance[f'MLP_{feature}'] = baseline_r2 - shuffled_r2
    
    # 2. Permute GNN input
    print("Permuting GNN features...")
    shuffled_gnn_data = gnn_data.copy()
    np.random.shuffle(shuffled_gnn_data)
    shuffled_inputs = {"mlp_input": mlp_data, "gnn_input": shuffled_gnn_data, "raster_input": raster_data}
    shuffled_r2, _, _, _ = evaluate_model(model, shuffled_inputs, y_true)
    importance['GNN'] = baseline_r2 - shuffled_r2
    
    # 3. Permute Raster inputs
    print("Permuting Raster features...")
    for i, feature in enumerate(raster_features):
        shuffled_raster_data = raster_data.copy()
        # Shuffle a single channel (raster band)
        shuffled_raster_data[:, :, :, i] = np.random.permutation(shuffled_raster_data[:, :, :, i].flatten()).reshape(shuffled_raster_data.shape[0], shuffled_raster_data.shape[1], shuffled_raster_data.shape[2])
        shuffled_inputs = {"mlp_input": mlp_data, "gnn_input": gnn_data, "raster_input": shuffled_raster_data}
        shuffled_r2, _, _, _ = evaluate_model(model, shuffled_inputs, y_true)
        importance[f'Raster_{os.path.basename(feature)}'] = baseline_r2 - shuffled_r2
        
    return importance
# ==================== 6. Main Analysis with Train/Test CV ==================== #
print("\n" + "="*80)
print(f"Analyzing GNN-MLP-Raster Fusion Model with 5-Fold Single Split")
print(f"Using a uniform patch size of {int(round((2 * 500) / pixel_size))} pixels for a 500m buffer.")
print("="*80)
# Combine all data for Train/Test splitting
full_data = pd.concat([orig, river_100], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
full_coords = full_data[['Long','Lat']].values
full_y = full_data['RI'].values
full_mlp_data = full_data[numeric_cols].values
full_raster_data = full_coords # This will be processed by the generator
# Pre-process MLP data with StandardScaler
scaler = StandardScaler()
full_mlp_data = scaler.fit_transform(full_mlp_data)
# Train/Test setup
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_results = []
all_feature_importances = {}
buffer_radius_m = 500
raster_patch_size = int(round((2 * buffer_radius_m) / pixel_size))
if raster_patch_size % 2 != 0:
    raster_patch_size += 1
raster_patch_size = max(raster_patch_size, 2)
num_rasters = len(raster_paths)
for fold, (train_index, test_index) in enumerate(kf.split(full_data)):
    print(f"\n--- Starting Fold {fold+1}/{n_splits} ---")
    
    # Get train and test data for this fold
    train_mlp, test_mlp = full_mlp_data[train_index], full_mlp_data[test_index]
    train_coords, test_coords = full_coords[train_index], full_coords[test_index]
    y_train, y_test = full_y[train_index], full_y[test_index]
    
    # Prepare GNN input (adjacency matrix based on distances)
    dist_mat_train = distance_matrix(train_coords, train_coords)
    gnn_train = np.exp(-dist_mat_train / 10)
    
    dist_mat_test_train = distance_matrix(test_coords, train_coords)
    gnn_test = np.exp(-dist_mat_test_train / 10)
    # Clean up memory
    del dist_mat_train, dist_mat_test_train
    gc.collect()
    # Re-build and compile the model for each fold
    model = build_fusion_model(mlp_dim=train_mlp.shape[1], gnn_dim=gnn_train.shape[1], 
                               raster_patch_size=raster_patch_size, num_rasters=num_rasters)
    
    if fold == 0:
        model.summary()
    
    # Create data generators
    train_generator = DataGenerator(
        mlp_data=train_mlp, gnn_data=gnn_train, y=y_train, coords=train_coords,
        raster_paths=raster_paths, buffer_radius_m=buffer_radius_m, pixel_size=pixel_size, batch_size=4, shuffle=True
    )
    test_generator = DataGenerator(
        mlp_data=test_mlp, gnn_data=gnn_test, y=y_test, coords=test_coords,
        raster_paths=raster_paths, buffer_radius_m=buffer_radius_m, pixel_size=pixel_size, batch_size=4, shuffle=False
    )
    
    # Train the model
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )
    history = model.fit(
        train_generator,
        epochs=100,
        verbose=1,
        callbacks=[early_stopping],
        validation_data=test_generator
    )
    # Evaluate on the test data
    r2_test, rmse_test, mae_test, smape_test = evaluate_model(model, test_generator, y_test)
    fold_results.append({'R2': r2_test, 'RMSE': rmse_test, 'MAE': mae_test, 'SMAPE': smape_test})
    
    print(f"Fold {fold+1} Test Metrics:")
    print(f"R²: {r2_test:.4f} | RMSE: {rmse_test:.4f} | MAE: {mae_test:.4f} | SMAPE: {smape_test:.4f}%")
    # Calculate and store feature importance for this fold
    # Get all test data as numpy arrays for importance calculation
    test_mlp_full = test_generator.mlp_data
    test_gnn_full = test_generator.gnn_data
    test_y_full = test_generator.y
    test_coords_full = test_generator.coords
    
    # Create a single batch for raster data
    test_rasters_full = test_generator.get_raster_patches(test_coords_full)
    
    importance = calculate_permutation_importance(model, test_mlp_full, test_gnn_full, test_rasters_full, test_y_full, numeric_cols, raster_paths)
    for feature, score in importance.items():
        if feature not in all_feature_importances:
            all_feature_importances[feature] = []
        all_feature_importances[feature].append(score)
    del model, history, train_generator, test_generator
    gc.collect()
# Calculate and print final averages
avg_results = pd.DataFrame(fold_results).mean()
print("\n" + "="*80)
print(f"Final Single Split Results (Averaged over {n_splits} folds):")
print("="*80)
print(f"Average R²: {avg_results['R2']:.4f}")
print(f"Average RMSE: {avg_results['RMSE']:.4f}")
print(f"Average MAE: {avg_results['MAE']:.4f}")
print(f"Average SMAPE: {avg_results['SMAPE']:.4f}%")
# Calculate and print average feature importance
print("\n--- Average Feature Importance (Permutation) ---")
avg_importance = {k: np.mean(v) for k, v in all_feature_importances.items()}
sorted_importance = sorted(avg_importance.items(), key=lambda item: item[1], reverse=True)
for feature, score in sorted_importance:
    print(f"{feature}: {score:.4f}")
# ==================== 7. Save all info to a folder ==================== #
# NOTE: Removed the file saving functionality as requested. The output is now
# printed directly to the console.
print("\nAnalysis complete. Results are printed above.")



Analyzing GNN-MLP-Raster Fusion Model with 5-Fold Cross-Validation
Using a uniform patch size of 100 pixels for a 500m buffer.

--- Starting Fold 1/5 ---


Epoch 1/100


  self._warn_if_super_not_called()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - loss: 888437.6875 - val_loss: 24513.5371
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 28460.4863 - val_loss: 8236.4229
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 73215.8203 - val_loss: 3990.4648
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 16451.0371 - val_loss: 3824.1907
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 9662.1064 - val_loss: 2485.3337
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 17893.7188 - val_loss: 1866.4838
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 8753.3789 - val_loss: 3065.6008
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 7144.3447 - val_loss: 2154.11

  self._warn_if_super_not_called()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - loss: 143420.5781 - val_loss: 462871.4062
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 88884.4453 - val_loss: 12024.0039
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 21645.1719 - val_loss: 10728.4385
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 8766.5488 - val_loss: 3293.3503
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 15319.0732 - val_loss: 1891.7793
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - loss: 4551.8501 - val_loss: 2138.1482
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - loss: 3043.9766 - val_loss: 1711.3531
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - loss: 5444.2852 - val_loss: 1492.

  self._warn_if_super_not_called()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - loss: 594712.0000 - val_loss: 67780.6719
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 256101.7969 - val_loss: 41270.5977
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 22654.4727 - val_loss: 24764.8750
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 15564.7314 - val_loss: 20555.1797
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - loss: 5839.1016 - val_loss: 17465.2812
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 6442.4800 - val_loss: 13395.2500
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 7179.7832 - val_loss: 8981.3965
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 13172.3975 - val_loss: 4

  self._warn_if_super_not_called()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - loss: 227258.7500 - val_loss: 302643.7500
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 60661.6797 - val_loss: 13965.1123
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 25852.3594 - val_loss: 3237.5366
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 6948.4531 - val_loss: 2608.9937
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 18352.6289 - val_loss: 2660.5239
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 4965.4980 - val_loss: 1824.3904
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - loss: 4126.8726 - val_loss: 1419.0815
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 5822.2954 - val_loss: 1475.3

  self._warn_if_super_not_called()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - loss: 145525.4531 - val_loss: 19765.9160
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 132560.4062 - val_loss: 7110.1416
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 33191.6094 - val_loss: 7673.8955
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 14137.4346 - val_loss: 3024.1177
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 6926.7793 - val_loss: 3003.2993
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - loss: 6690.0371 - val_loss: 1931.4629
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 4265.2925 - val_loss: 1015.4894
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 6501.6504 - val_loss: 1078.91

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout, Layer, MultiHeadAttention, LayerNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import sys
from io import StringIO
import pickle # Import the pickle library for saving objects

# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
orig = pd.read_csv("../../data/RainySeason.csv")
river_100 = pd.read_csv("../data/Samples_100.csv")

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
# We are not using rasters in this GNN-MLP model, but the paths are still
# defined for consistency with previous versions.
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")

print("Note: Raster data is not used in this GNN-MLP model.")

# ==================== 3. Create a Custom Data Generator ==================== #
class DataGenerator(Sequence):
    def __init__(self, mlp_data, gnn_data, y, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]
        
        return (batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long','Lat']].values
coords_test = test_orig[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define GNN-MLP Fusion Model ==================== #
def build_gnn_mlp_model(mlp_dim, gnn_dim):
    # Inputs for all branches
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    
    # --- MLP Branch ---
    mlp_embedding = Dense(128, activation="relu")(mlp_input)
    mlp_embedding = Dense(64, activation="relu", name="mlp_embedding")(mlp_embedding)

    # --- GNN Branch ---
    gnn_embedding = Dense(128, activation="relu")(gnn_input)
    gnn_embedding = Dense(64, activation="relu", name="gnn_embedding")(gnn_embedding)

    # --- Concatenate Embeddings ---
    combined = Concatenate()([mlp_embedding, gnn_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    # Build and compile the model
    model = Model(inputs=[mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def evaluate_model(model, mlp_test, gnn_test_matrix, y_test, return_preds=False):
    """
    Evaluates the model on given data and returns R², RMSE, MAE, SMAPE, and predictions.
    """
    y_pred = model.predict((mlp_test, gnn_test_matrix)).flatten()
    
    if return_preds:
        return y_pred
    else:
        # Calculate R-squared and RMSE
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Calculate Mean Absolute Error (MAE)
        mae = mean_absolute_error(y_test, y_pred)
        
        # Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
        # Add a small epsilon to the denominator to avoid division by zero
        denominator = np.abs(y_test) + np.abs(y_pred)
        smape = np.mean(2 * np.abs(y_pred - y_test) / (denominator + 1e-8)) * 100
        
        return r2, rmse, mae, smape

def calculate_permutation_importance(model, mlp_data, gnn_data, y_true):
    """
    Calculates permutation feature importance for the MLP and GNN branches.
    """
    print("\nStarting Permutation Feature Importance Analysis...")
    # Get baseline R² on the unshuffled data
    baseline_r2, _, _, _ = evaluate_model(model, mlp_data, gnn_data, y_true)
    print(f"Baseline R² on test set: {baseline_r2:.4f}")

    importance = {}
    
    # Permute MLP input
    shuffled_mlp_data = mlp_data.copy()
    np.random.shuffle(shuffled_mlp_data)
    shuffled_r2, _, _, _ = evaluate_model(model, shuffled_mlp_data, gnn_data, y_true)
    importance['MLP'] = baseline_r2 - shuffled_r2

    # Permute GNN input
    shuffled_gnn_data = gnn_data.copy()
    np.random.shuffle(shuffled_gnn_data)
    shuffled_r2, _, _, _ = evaluate_model(model, mlp_data, shuffled_gnn_data, y_true)
    importance['GNN'] = baseline_r2 - shuffled_r2

    return importance
        

print("\n" + "="*80)
print(f"Analyzing GNN-MLP Fusion Model")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)
mlp_input_dim = mlp_train.shape[1]

model = build_gnn_mlp_model(mlp_input_dim, gnn_input_dim)
model.summary()

# ==================== 6. Create Data Generators ==================== #
train_generator = DataGenerator(
    mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    batch_size=batch_size, shuffle=True
)

# ==================== 7. Train Model ==================== #
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=train_generator
)

# ==================== 8. Evaluate & Perform Feature Importance ==================== #
y_pred_train = model.predict(train_generator).flatten()
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))

mae_train = mean_absolute_error(y_train[:len(y_pred_train)], y_pred_train)
denominator_train = np.abs(y_train[:len(y_pred_train)]) + np.abs(y_pred_train)
smape_train = np.mean(2 * np.abs(y_pred_train - y_train[:len(y_pred_train)]) / (denominator_train + 1e-8)) * 100

# Evaluate on the test data using the updated function
r2_test, rmse_test, mae_test, smape_test = evaluate_model(model, mlp_test, gnn_test, y_test)
y_pred_test = evaluate_model(model, mlp_test, gnn_test, y_test, return_preds=True)

print(f"\n GNN-MLP Fusion Model Performance:")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f} | MAE Train: {mae_train:.4f} | SMAPE Train: {smape_train:.4f}")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f} | MAE Test: {mae_test:.4f} | SMAPE Test: {smape_test:.4f}")

feature_importance = calculate_permutation_importance(model, mlp_test, gnn_test, y_test)
print("\n--- Feature Importance (Permutation) ---")
sorted_importance = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
for feature, score in sorted_importance:
    print(f"{feature}: {score:.4f}")


Note: Raster data is not used in this GNN-MLP model.

Analyzing GNN-MLP Fusion Model


Epoch 1/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 34869.8086 - val_loss: 27263.4980
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 22140.6328 - val_loss: 4379.2148
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 6086.4219 - val_loss: 3959.1904
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 4269.3799 - val_loss: 2795.8623
Epoch 5/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 3444.9170 - val_loss: 1546.6180
Epoch 6/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1516.3749 - val_loss: 739.7513
Epoch 7/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 984.8546 - val_loss: 492.2373
Epoch 8/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 1120.9398 - val_loss: 432.6407


#### GNN MLP is the fastest runned code

In [2]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout, Layer, MultiHeadAttention, LayerNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor # For intrinsic importance
import lime # For LIME importance
from lime import lime_tabular # For LIME importance
import gc
import sys
from io import StringIO
import pickle

# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
orig = pd.read_csv("../../data/RainySeason.csv")
river_100 = pd.read_csv("../data/Samples_100.csv")

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
# We are not using rasters in this GNN-MLP model, but the paths are still
# defined for consistency with previous versions.
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")

print("Note: Raster data is not used in this GNN-MLP model.")

# ==================== 3. Create a Custom Data Generator ==================== #
class DataGenerator(Sequence):
    def __init__(self, mlp_data, gnn_data, y, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]
        
        return (batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long','Lat']].values
coords_test = test_orig[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define GNN-MLP Fusion Model ==================== #
def build_gnn_mlp_model(mlp_dim, gnn_dim):
    # Inputs for all branches
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    
    # --- MLP Branch ---
    mlp_embedding = Dense(128, activation="relu")(mlp_input)
    mlp_embedding = Dense(64, activation="relu", name="mlp_embedding")(mlp_embedding)

    # --- GNN Branch ---
    gnn_embedding = Dense(128, activation="relu")(gnn_input)
    gnn_embedding = Dense(64, activation="relu", name="gnn_embedding")(gnn_embedding)

    # --- Concatenate Embeddings ---
    combined = Concatenate()([mlp_embedding, gnn_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    # Build and compile the model
    model = Model(inputs=[mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def evaluate_model(model, mlp_test, gnn_test_matrix, y_test, return_preds=False):
    """
    Evaluates the model on given data and returns R², RMSE, MAE, SMAPE, and predictions.
    """
    y_pred = model.predict((mlp_test, gnn_test_matrix)).flatten()
    
    if return_preds:
        return y_pred
    else:
        # Calculate R-squared and RMSE
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Calculate Mean Absolute Error (MAE)
        mae = mean_absolute_error(y_test, y_pred)
        
        # Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
        # Add a small epsilon to the denominator to avoid division by zero
        denominator = np.abs(y_test) + np.abs(y_pred)
        smape = np.mean(2 * np.abs(y_pred - y_test) / (denominator + 1e-8)) * 100
        
        return r2, rmse, mae, smape

def calculate_permutation_importance(model, mlp_data, gnn_data, y_true):
    """
    Calculates permutation feature importance for the MLP and GNN branches.
    """
    print("\nStarting Permutation Feature Importance Analysis...")
    # Get baseline R² on the unshuffled data
    baseline_r2, _, _, _ = evaluate_model(model, mlp_data, gnn_data, y_true)
    print(f"Baseline R² on test set: {baseline_r2:.4f}")

    importance = {}
    
    # Permute MLP input
    shuffled_mlp_data = mlp_data.copy()
    np.random.shuffle(shuffled_mlp_data)
    shuffled_r2, _, _, _ = evaluate_model(model, shuffled_mlp_data, gnn_data, y_true)
    importance['MLP'] = baseline_r2 - shuffled_r2

    # Permute GNN input
    shuffled_gnn_data = gnn_data.copy()
    np.random.shuffle(shuffled_gnn_data)
    shuffled_r2, _, _, _ = evaluate_model(model, mlp_data, shuffled_gnn_data, y_true)
    importance['GNN'] = baseline_r2 - shuffled_r2

    return importance

def calculate_intrinsic_importance(mlp_data, y_true, feature_names):
    """
    Calculates intrinsic feature importance using a tree-based model.
    """
    print("\nTraining a Gradient Boosting Regressor on the MLP features...")
    gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gbr.fit(mlp_data, y_true)
    
    importance = {}
    for i, name in enumerate(feature_names):
        importance[name] = gbr.feature_importances_[i]
    return importance

def get_lime_explanation(model, mlp_data, gnn_data, feature_names, sample_index):
    """
    Generates a LIME explanation for a single data point.
    """
    print("\nGenerating LIME explanation for a single data point...")
    
    # Create a wrapper prediction function for LIME
    # The wrapper takes only the MLP data and combines it with the GNN data
    # for the main model's prediction.
    def predict_fn(x):
        # We need to reshape x to be a 2D array if it's not already
        if x.ndim == 1:
            x = x.reshape(1, -1)
        # Create the full input for the GNN-MLP model
        gnn_input_for_lime = np.tile(gnn_data[sample_index:sample_index+1, :], (x.shape[0], 1))
        # Predict using the full model
        return model.predict([x, gnn_input_for_lime])
    
    # Initialize the LIME explainer
    explainer = lime_tabular.LimeTabularExplainer(
        training_data=mlp_data,
        feature_names=list(feature_names),
        mode='regression',
        verbose=False
    )
    
    # Explain the selected instance
    explanation = explainer.explain_instance(
        data_row=mlp_data[sample_index],
        predict_fn=predict_fn,
        num_features=len(feature_names),
        num_samples=5000 # Increase samples for better stability
    )
    
    # Extract the feature weights
    lime_weights = explanation.as_list()
    return lime_weights


# ==================== Run the Analysis ==================== #

print("\n" + "="*80)
print(f"Analyzing GNN-MLP Fusion Model")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)
mlp_input_dim = mlp_train.shape[1]

model = build_gnn_mlp_model(mlp_input_dim, gnn_input_dim)

# ==================== 6. Create Data Generators ==================== #
train_generator = DataGenerator(
    mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    batch_size=batch_size, shuffle=True
)

# ==================== 7. Train Model ==================== #
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=100,
    verbose=0,
    callbacks=[early_stopping],
    validation_data=train_generator
)

# ==================== 8. Evaluate & Perform Feature Importance ==================== #
y_pred_train = model.predict(train_generator).flatten()
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))

mae_train = mean_absolute_error(y_train[:len(y_pred_train)], y_pred_train)
denominator_train = np.abs(y_train[:len(y_pred_train)]) + np.abs(y_pred_train)
smape_train = np.mean(2 * np.abs(y_pred_train - y_train[:len(y_pred_train)]) / (denominator_train + 1e-8)) * 100

# Evaluate on the test data using the updated function
r2_test, rmse_test, mae_test, smape_test = evaluate_model(model, mlp_test, gnn_test, y_test)
y_pred_test = evaluate_model(model, mlp_test, gnn_test, y_test, return_preds=True)

print(f"\n GNN-MLP Fusion Model Performance:")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f} | MAE Train: {mae_train:.4f} | SMAPE Train: {smape_train:.4f}")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f} | MAE Test: {mae_test:.4f} | SMAPE Test: {smape_test:.4f}")


# Calculate and print feature importance
print("\n--- Permutation-based Feature Importance ---")
feature_importance_perm = calculate_permutation_importance(model, mlp_test, gnn_test, y_test)
sorted_importance_perm = sorted(feature_importance_perm.items(), key=lambda item: item[1], reverse=True)
for feature, score in sorted_importance_perm:
    print(f"{feature}: {score:.4f}")

print("\n--- Intrinsic Feature Importance (Gradient Boosting) ---")
feature_importance_intrinsic = calculate_intrinsic_importance(mlp_train, y_train, numeric_cols)
sorted_importance_intrinsic = sorted(feature_importance_intrinsic.items(), key=lambda item: item[1], reverse=True)
for feature, score in sorted_importance_intrinsic:
    print(f"{feature}: {score:.4f}")
    
# Get LIME explanation for a single test point
lime_sample_index = np.random.randint(0, len(mlp_test))
print(f"\n--- LIME Explanation for a single test point (index {lime_sample_index}) ---")
lime_explanation = get_lime_explanation(model, mlp_test, gnn_test, numeric_cols, lime_sample_index)
for feature, weight in lime_explanation:
    print(f"Feature: {feature} | Weight: {weight:.4f}")


Note: Raster data is not used in this GNN-MLP model.

Analyzing GNN-MLP Fusion Model
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 543us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

 GNN-MLP Fusion Model Performance:
R² Train: -1.1912 | RMSE Train: 101.0737 | MAE Train: 79.0743 | SMAPE Train: 42.1135
R² Test: 0.9687 | RMSE Test: 13.9777 | MAE Test: 11.3563 | SMAPE Test: 7.1694

--- Permutation-based Feature Importance ---

Starting Permutation Feature Importance Analysis...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Baseline R² on test set: 0.9687
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
MLP: 1.7868
GNN: 0.0019

--- Intrinsic Feature Importance (Gradient Boosting) ---

Training a Gradient Boosting Regressor on the MLP features...
PbR: 

In [4]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout, Layer, MultiHeadAttention, LayerNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor # For intrinsic importance
import lime # For LIME importance
from lime import lime_tabular # For LIME importance
import gc
import sys
from io import StringIO
import pickle

# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
# We assume the data files are in the correct relative paths.
try:
    orig = pd.read_csv("../../data/RainySeason.csv")
    river_100 = pd.read_csv("../data/Samples_100.csv")
except FileNotFoundError as e:
    print(f"Error: Required data file not found. Please ensure the files are at the correct paths.")
    print(f"Details: {e}")
    sys.exit()

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI').tolist()

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
raster_paths = []
# Use os.path.join for robust path handling
base_dir = "../"
raster_paths += glob.glob(os.path.join(base_dir, "CalIndices", "*.tif"))
raster_paths += glob.glob(os.path.join(base_dir, "LULCMerged", "*.tif"))
raster_paths += glob.glob(os.path.join(base_dir, "IDW", "*.tif"))

print("Note: Raster data will now be integrated into the MLP input.")
print(f"Found {len(raster_paths)} raster files.")

# ==================== 3. Function to extract raster values using a buffer ==================== #
def sample_rasters_with_buffer(df, raster_paths, buffer_meters):
    """
    Extracts raster values by calculating the mean within a circular buffer
    around each (Lat, Long) point.

    Args:
        df (pd.DataFrame): DataFrame with 'Lat' and 'Long' columns.
        raster_paths (list): List of paths to raster files.
        buffer_meters (int): The radius of the buffer in meters.

    Returns:
        pd.DataFrame: DataFrame with a new column for each raster's aggregated value.
    """
    raster_values = pd.DataFrame(index=df.index)
    
    for path in raster_paths:
        try:
            with rasterio.open(path) as src:
                # Get the pixel size (resolution)
                pixel_size_x, pixel_size_y = src.res
                
                # Convert the buffer in meters to pixels
                buffer_pixels_x = int(np.ceil(buffer_meters / pixel_size_x))
                buffer_pixels_y = int(np.ceil(buffer_meters / pixel_size_y))
                
                aggregated_data = []
                for _, row in df.iterrows():
                    long, lat = row['Long'], row['Lat']
                    
                    # Convert coordinates to raster indices (row, col)
                    row_idx, col_idx = src.index(long, lat)
                    
                    # Define the window to read from the raster
                    window = Window(
                        col_off=col_idx - buffer_pixels_x,
                        row_off=row_idx - buffer_pixels_y,
                        width=2 * buffer_pixels_x + 1,
                        height=2 * buffer_pixels_y + 1
                    )
                    
                    # Read the data from the defined window
                    try:
                        data = src.read(1, window=window)
                        # Calculate the mean of the pixels within the window
                        # Use np.nanmean to handle NoData values
                        aggregated_value = np.nanmean(data)
                        aggregated_data.append(aggregated_value)
                    except rasterio.errors.WindowError:
                        # If the window goes outside the raster bounds, append NaN
                        aggregated_data.append(np.nan)
                
                # Get a clean name for the new column
                raster_name = os.path.splitext(os.path.basename(path))[0]
                raster_values[raster_name] = aggregated_data

        except rasterio.RasterioIOError:
            print(f"Warning: Could not open or read raster file at {path}. Skipping.")
        except Exception as e:
            print(f"An error occurred while processing {path}: {e}")
            
    return raster_values

# ==================== 4. Extract Raster Data and Merge with Main Data ==================== #
# Extract raster data for both training and testing sets
train_raster_data = sample_rasters_with_buffer(train_combined, raster_paths, BUFFER_METERS)
test_raster_data = sample_rasters_with_buffer(test_orig, raster_paths, BUFFER_METERS)

# Now, we combine the original numeric features with the new raster features
# Note: We must handle NaNs, as they can occur if a point is outside a raster's bounds.
# A simple fillna(0) is used here, but a more sophisticated imputation might be needed.
train_combined_with_rasters = pd.concat([train_combined, train_raster_data], axis=1).fillna(0)
test_orig_with_rasters = pd.concat([test_orig, test_raster_data], axis=1).fillna(0)

# Update the list of numeric columns to include the new raster features
raster_cols = train_raster_data.columns.tolist()
all_numeric_cols = numeric_cols + raster_cols

# ==================== 5. Create a Custom Data Generator ==================== #
class DataGenerator(Sequence):
    def __init__(self, mlp_data, gnn_data, y, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]
        
        return (batch_mlp, batch_gnn), batch_y

# ==================== 6. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined_with_rasters[['Long','Lat']].values
coords_test = test_orig_with_rasters[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
# The scaler now fits on the combined original + raster data
mlp_train = scaler.fit_transform(train_combined_with_rasters[all_numeric_cols])
mlp_test = scaler.transform(test_orig_with_rasters[all_numeric_cols])
y_train = train_combined_with_rasters['RI'].values
y_test = test_orig_with_rasters['RI'].values

# ==================== 7. Define GNN-MLP Fusion Model ==================== #
def build_gnn_mlp_model(mlp_dim, gnn_dim):
    # Inputs for all branches
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    
    # --- MLP Branch ---
    mlp_embedding = Dense(128, activation="relu")(mlp_input)
    mlp_embedding = Dense(64, activation="relu", name="mlp_embedding")(mlp_embedding)

    # --- GNN Branch ---
    gnn_embedding = Dense(128, activation="relu")(gnn_input)
    gnn_embedding = Dense(64, activation="relu", name="gnn_embedding")(gnn_embedding)

    # --- Concatenate Embeddings ---
    combined = Concatenate()([mlp_embedding, gnn_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    # Build and compile the model
    model = Model(inputs=[mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def evaluate_model(model, mlp_test, gnn_test_matrix, y_test, return_preds=False):
    """
    Evaluates the model on given data and returns R², RMSE, MAE, SMAPE, and predictions.
    """
    y_pred = model.predict((mlp_test, gnn_test_matrix)).flatten()
    
    if return_preds:
        return y_pred
    else:
        # Calculate R-squared and RMSE
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Calculate Mean Absolute Error (MAE)
        mae = mean_absolute_error(y_test, y_pred)
        
        # Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
        # Add a small epsilon to the denominator to avoid division by zero
        denominator = np.abs(y_test) + np.abs(y_pred)
        smape = np.mean(2 * np.abs(y_pred - y_test) / (denominator + 1e-8)) * 100
        
        return r2, rmse, mae, smape

def calculate_permutation_importance(model, mlp_data, gnn_data, y_true):
    """
    Calculates permutation feature importance for the MLP and GNN branches.
    """
    print("\nStarting Permutation Feature Importance Analysis...")
    # Get baseline R² on the unshuffled data
    baseline_r2, _, _, _ = evaluate_model(model, mlp_data, gnn_data, y_true)
    print(f"Baseline R² on test set: {baseline_r2:.4f}")

    importance = {}
    
    # Permute MLP input
    shuffled_mlp_data = mlp_data.copy()
    np.random.shuffle(shuffled_mlp_data)
    shuffled_r2, _, _, _ = evaluate_model(model, shuffled_mlp_data, gnn_data, y_true)
    importance['MLP'] = baseline_r2 - shuffled_r2

    # Permute GNN input
    shuffled_gnn_data = gnn_data.copy()
    np.random.shuffle(shuffled_gnn_data)
    shuffled_r2, _, _, _ = evaluate_model(model, mlp_data, shuffled_gnn_data, y_true)
    importance['GNN'] = baseline_r2 - shuffled_r2

    return importance

def calculate_intrinsic_importance(mlp_data, y_true, feature_names):
    """
    Calculates intrinsic feature importance using a tree-based model.
    """
    print("\nTraining a Gradient Boosting Regressor on the MLP features...")
    gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gbr.fit(mlp_data, y_true)
    
    importance = {}
    for i, name in enumerate(feature_names):
        importance[name] = gbr.feature_importances_[i]
    return importance

def get_lime_explanation(model, mlp_data, gnn_data, feature_names, sample_index):
    """
    Generates a LIME explanation for a single data point.
    """
    print("\nGenerating LIME explanation for a single data point...")
    
    # Create a wrapper prediction function for LIME
    # The wrapper takes only the MLP data and combines it with the GNN data
    # for the main model's prediction.
    def predict_fn(x):
        # We need to reshape x to be a 2D array if it's not already
        if x.ndim == 1:
            x = x.reshape(1, -1)
        # Create the full input for the GNN-MLP model
        gnn_input_for_lime = np.tile(gnn_data[sample_index:sample_index+1, :], (x.shape[0], 1))
        # Predict using the full model
        return model.predict([x, gnn_input_for_lime])
    
    # Initialize the LIME explainer
    explainer = lime_tabular.LimeTabularExplainer(
        training_data=mlp_data,
        feature_names=list(feature_names),
        mode='regression',
        verbose=False
    )
    
    # Explain the selected instance
    explanation = explainer.explain_instance(
        data_row=mlp_data[sample_index],
        predict_fn=predict_fn,
        num_features=len(feature_names),
        num_samples=5000 # Increase samples for better stability
    )
    
    # Extract the feature weights
    lime_weights = explanation.as_list()
    return lime_weights


# ==================== Run the Analysis ==================== #

print("\n" + "="*80)
print(f"Analyzing GNN-MLP Fusion Model with Raster Data")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)
mlp_input_dim = mlp_train.shape[1]

model = build_gnn_mlp_model(mlp_input_dim, gnn_input_dim)

# ==================== 8. Create Data Generators ==================== #
train_generator = DataGenerator(
    mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    batch_size=batch_size, shuffle=True
)

# ==================== 9. Train Model ==================== #
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=train_generator
)

# ==================== 10. Evaluate & Perform Feature Importance ==================== #
# Evaluate on the training data using the updated function
y_pred_train = model.predict(train_generator).flatten()
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))
mae_train = mean_absolute_error(y_train[:len(y_pred_train)], y_pred_train)
denominator_train = np.abs(y_train[:len(y_pred_train)]) + np.abs(y_pred_train)
smape_train = np.mean(2 * np.abs(y_pred_train - y_train[:len(y_pred_train)]) / (denominator_train + 1e-8)) * 100

# Evaluate on the test data using the updated function
r2_test, rmse_test, mae_test, smape_test = evaluate_model(model, mlp_test, gnn_test, y_test)
y_pred_test = evaluate_model(model, mlp_test, gnn_test, y_test, return_preds=True)

print(f"\n GNN-MLP Fusion Model Performance:")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f} | MAE Train: {mae_train:.4f} | SMAPE Train: {smape_train:.4f}")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f} | MAE Test: {mae_test:.4f} | SMAPE Test: {smape_test:.4f}")


Note: Raster data will now be integrated into the MLP input.
Found 26 raster files.


  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Analyzing GNN-MLP Fusion Model with Raster Data
Epoch 1/100


  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 33249.9336 - val_loss: 30478.7344
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 27342.2598 - val_loss: 8082.3784
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6094.9326 - val_loss: 3694.0234
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3800.7034 - val_loss: 2338.7400
Epoch 5/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2502.9082 - val_loss: 1227.0741
Epoch 6/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1752.1050 - val_loss: 657.3976
Epoch 7/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1242.1130 - val_loss: 570.2490
Epoch 8/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1030.2278 - val_loss: 397.0659
Epoch 9/100


In [9]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout, Layer, MultiHeadAttention, LayerNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor # For intrinsic importance
import lime # For LIME importance
from lime import lime_tabular # For LIME importance
import gc
import sys
from io import StringIO
import pickle

# Define the single buffer size to use
BUFFER_METERS = 500

# ==================== 1. Load Data ==================== #
# We assume the data files are in the correct relative paths.
try:
    orig = pd.read_csv("../../data/RainySeason.csv")
    river_100 = pd.read_csv("../data/Samples_100.csv")
except FileNotFoundError as e:
    print(f"Error: Required data file not found. Please ensure the files are at the correct paths.")
    print(f"Details: {e}")
    sys.exit()

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI').tolist()

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
raster_paths = []
# Use os.path.join for robust path handling
base_dir = "../"
raster_paths += glob.glob(os.path.join(base_dir, "CalIndices", "*.tif"))
raster_paths += glob.glob(os.path.join(base_dir, "LULCMerged", "*.tif"))
raster_paths += glob.glob(os.path.join(base_dir, "IDW", "*.tif"))

print("Note: Raster data will now be integrated into the MLP input.")
print(f"Found {len(raster_paths)} raster files.")

# ==================== 3. Function to extract raster values using a buffer ==================== #
def sample_rasters_with_buffer(df, raster_paths, buffer_meters):
    """
    Extracts raster values by calculating the mean within a circular buffer
    around each (Lat, Long) point.

    Args:
        df (pd.DataFrame): DataFrame with 'Lat' and 'Long' columns.
        raster_paths (list): List of paths to raster files.
        buffer_meters (int): The radius of the buffer in meters.

    Returns:
        pd.DataFrame: DataFrame with a new column for each raster's aggregated value.
    """
    raster_values = pd.DataFrame(index=df.index)
    
    for path in raster_paths:
        try:
            with rasterio.open(path) as src:
                # Get the pixel size (resolution)
                pixel_size_x, pixel_size_y = src.res
                
                # Convert the buffer in meters to pixels
                buffer_pixels_x = int(np.ceil(buffer_meters / pixel_size_x))
                buffer_pixels_y = int(np.ceil(buffer_meters / pixel_size_y))
                
                aggregated_data = []
                for _, row in df.iterrows():
                    long, lat = row['Long'], row['Lat']
                    
                    # Convert coordinates to raster indices (row, col)
                    row_idx, col_idx = src.index(long, lat)
                    
                    # Define the window to read from the raster
                    window = Window(
                        col_off=col_idx - buffer_pixels_x,
                        row_off=row_idx - buffer_pixels_y,
                        width=2 * buffer_pixels_x + 1,
                        height=2 * buffer_pixels_y + 1
                    )
                    
                    # Read the data from the defined window
                    try:
                        data = src.read(1, window=window)
                        # Calculate the mean of the pixels within the window
                        # Use np.nanmean to handle NoData values
                        aggregated_value = np.nanmean(data)
                        aggregated_data.append(aggregated_value)
                    except rasterio.errors.WindowError:
                        # If the window goes outside the raster bounds, append NaN
                        aggregated_data.append(np.nan)
                
                # Get a clean name for the new column
                raster_name = os.path.splitext(os.path.basename(path))[0]
                raster_values[raster_name] = aggregated_data

        except rasterio.RasterioIOError:
            print(f"Warning: Could not open or read raster file at {path}. Skipping.")
        except Exception as e:
            print(f"An error occurred while processing {path}: {e}")
            
    return raster_values

# ==================== 4. Extract Raster Data and Merge with Main Data ==================== #
# Extract raster data for both training and testing sets
train_raster_data = sample_rasters_with_buffer(train_combined, raster_paths, BUFFER_METERS)
test_raster_data = sample_rasters_with_buffer(test_orig, raster_paths, BUFFER_METERS)

# Now, we combine the original numeric features with the new raster features
# Note: We must handle NaNs, as they can occur if a point is outside a raster's bounds.
# A simple fillna(0) is used here, but a more sophisticated imputation might be needed.
train_combined_with_rasters = pd.concat([train_combined, train_raster_data], axis=1).fillna(0)
test_orig_with_rasters = pd.concat([test_orig, test_raster_data], axis=1).fillna(0)

# Update the list of numeric columns to include the new raster features
raster_cols = train_raster_data.columns.tolist()
all_numeric_cols = numeric_cols + raster_cols

# ==================== 5. Create a Custom Data Generator ==================== #
class DataGenerator(Sequence):
    def __init__(self, mlp_data, gnn_data, y, batch_size=4, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]
        
        return (batch_mlp, batch_gnn), batch_y

# ==================== 6. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined_with_rasters[['Long','Lat']].values
coords_test = test_orig_with_rasters[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
# The scaler now fits on the combined original + raster data
mlp_train = scaler.fit_transform(train_combined_with_rasters[all_numeric_cols])
mlp_test = scaler.transform(test_orig_with_rasters[all_numeric_cols])
y_train = train_combined_with_rasters['RI'].values
y_test = test_orig_with_rasters['RI'].values

# ==================== 7. Define GNN-MLP Fusion Model ==================== #
def build_gnn_mlp_model(mlp_dim, gnn_dim):
    # Inputs for all branches
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    
    # --- MLP Branch ---
    mlp_embedding = Dense(128, activation="relu")(mlp_input)
    mlp_embedding = Dense(64, activation="relu", name="mlp_embedding")(mlp_embedding)

    # --- GNN Branch ---
    gnn_embedding = Dense(128, activation="relu")(gnn_input)
    gnn_embedding = Dense(64, activation="relu", name="gnn_embedding")(gnn_embedding)

    # --- Concatenate Embeddings ---
    combined = Concatenate()([mlp_embedding, gnn_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    # Build and compile the model
    model = Model(inputs=[mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def evaluate_model(model, mlp_test, gnn_test_matrix, y_test, return_preds=False):
    """
    Evaluates the model on given data and returns R², RMSE, MAE, SMAPE, and predictions.
    """
    y_pred = model.predict((mlp_test, gnn_test_matrix)).flatten()
    
    if return_preds:
        return y_pred
    else:
        # Calculate R-squared and RMSE
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Calculate Mean Absolute Error (MAE)
        mae = mean_absolute_error(y_test, y_pred)
        
        # Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
        # Add a small epsilon to the denominator to avoid division by zero
        denominator = np.abs(y_test) + np.abs(y_pred)
        smape = np.mean(2 * np.abs(y_pred - y_test) / (denominator + 1e-8)) * 100
        
        return r2, rmse, mae, smape

def calculate_permutation_importance(model, mlp_data, gnn_data, y_true, feature_names):
    """
    Calculates permutation feature importance for the MLP and GNN branches.
    Now also includes importance for individual features.
    """
    print("\nStarting Permutation Feature Importance Analysis...")
    # Get baseline R² on the unshuffled data
    baseline_r2, _, _, _ = evaluate_model(model, mlp_data, gnn_data, y_true)
    print(f"Baseline R² on test set: {baseline_r2:.4f}")

    importance = {}
    
    # Permute MLP input as a whole
    shuffled_mlp_data = mlp_data.copy()
    np.random.shuffle(shuffled_mlp_data)
    shuffled_r2, _, _, _ = evaluate_model(model, shuffled_mlp_data, gnn_data, y_true)
    importance['MLP'] = baseline_r2 - shuffled_r2
    
    # Permute GNN input
    shuffled_gnn_data = gnn_data.copy()
    np.random.shuffle(shuffled_gnn_data)
    shuffled_r2, _, _, _ = evaluate_model(model, mlp_data, shuffled_gnn_data, y_true)
    importance['GNN'] = baseline_r2 - shuffled_r2

    # Permute each individual feature in MLP input
    print("\nCalculating individual feature importance...")
    for i, feature in enumerate(feature_names):
        shuffled_data_mlp = mlp_data.copy()
        np.random.shuffle(shuffled_data_mlp[:, i])
        shuffled_r2, _, _, _ = evaluate_model(model, shuffled_data_mlp, gnn_data, y_true)
        importance[feature] = baseline_r2 - shuffled_r2

    return importance

def calculate_intrinsic_importance(mlp_data, y_true, feature_names):
    """
    Calculates intrinsic feature importance using a tree-based model.
    """
    print("\nTraining a Gradient Boosting Regressor on the MLP features...")
    gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gbr.fit(mlp_data, y_true)
    
    importance = {}
    for i, name in enumerate(feature_names):
        importance[name] = gbr.feature_importances_[i]
    return importance

def get_lime_explanation(model, mlp_test, gnn_test, train_df_unscaled, sample_index):
    """
    Generates a LIME explanation for a single data point.
    """
    print("\nGenerating LIME explanation for a single data point...")
    
    # Create a wrapper prediction function for LIME
    def predict_fn(x):
        # We need to reshape x to be a 2D array if it's not already
        if x.ndim == 1:
            x = x.reshape(1, -1)
        # Create the full input for the GNN-MLP model
        gnn_input_for_lime = np.tile(gnn_test[sample_index:sample_index+1, :], (x.shape[0], 1))
        # Predict using the full model
        return model.predict([x, gnn_input_for_lime])
    
    # Initialize the LIME explainer with the training data
    explainer = lime_tabular.LimeTabularExplainer(
        # Use the unscaled training data and its columns for a robust feature name link
        training_data=train_df_unscaled.values,
        feature_names=train_df_unscaled.columns.tolist(),
        mode='regression',
        verbose=False
    )
    
    # Explain the selected instance from the SCALED test data
    explanation = explainer.explain_instance(
        data_row=mlp_test[sample_index],
        predict_fn=predict_fn,
        num_features=len(train_df_unscaled.columns),
        num_samples=5000 # Increase samples for better stability
    )
    
    # Extract the feature weights
    lime_weights = explanation.as_list()
    return lime_weights


# ==================== Run the Analysis ==================== #

print("\n" + "="*80)
print(f"Analyzing GNN-MLP Fusion Model with Raster Data")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)
mlp_input_dim = mlp_train.shape[1]

model = build_gnn_mlp_model(mlp_input_dim, gnn_input_dim)

# ==================== 8. Create Data Generators ==================== #
train_generator = DataGenerator(
    mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    batch_size=batch_size, shuffle=True
)

# ==================== 9. Train Model ==================== #
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=100,
    verbose=0,
    callbacks=[early_stopping],
    validation_data=train_generator
)

# ==================== 10. Evaluate & Perform Feature Importance ==================== #
# Evaluate on the training data using the updated function
y_pred_train = model.predict(train_generator).flatten()
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))
mae_train = mean_absolute_error(y_train[:len(y_pred_train)], y_pred_train)
denominator_train = np.abs(y_train[:len(y_pred_train)]) + np.abs(y_pred_train)
smape_train = np.mean(2 * np.abs(y_pred_train - y_train[:len(y_pred_train)]) / (denominator_train + 1e-8)) * 100

# Evaluate on the test data using the updated function
r2_test, rmse_test, mae_test, smape_test = evaluate_model(model, mlp_test, gnn_test, y_test)
y_pred_test = evaluate_model(model, mlp_test, gnn_test, y_test, return_preds=True)

print(f"\n GNN-MLP Fusion Model Performance:")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f} | MAE Train: {mae_train:.4f} | SMAPE Train: {smape_train:.4f}")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f} | MAE Test: {mae_test:.4f} | SMAPE Test: {smape_test:.4f}")


# Calculate and print feature importance
print("\n--- Permutation-based Feature Importance ---")
feature_importance_perm = calculate_permutation_importance(
    model, 
    mlp_test, 
    gnn_test, 
    y_test,
    all_numeric_cols
)
sorted_importance_perm = sorted(
    feature_importance_perm.items(), 
    key=lambda item: item[1], 
    reverse=True
)
for feature, score in sorted_importance_perm:
    print(f"{feature}: {score:.4f}")



Note: Raster data will now be integrated into the MLP input.
Found 26 raster files.


  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  aggregated_value = np.nanmean(data)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  aggregated_value = np.nanmean(data)
  aggregated_val


Analyzing GNN-MLP Fusion Model with Raster Data
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 546us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step

 GNN-MLP Fusion Model Performance:
R² Train: -1.0381 | RMSE Train: 97.4803 | MAE Train: 74.9346 | SMAPE Train: 39.7962
R² Test: 0.9499 | RMSE Test: 17.7042 | MAE Test: 15.9653 | SMAPE Test: 10.1219

--- Permutation-based Feature Importance ---

Starting Permutation Feature Importance Analysis...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Baseline R² on test set: 0.9499
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step

Calculating individual feature importance...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/s

In [23]:
for feature, score in sorted_importance_perm[:10]:
    print(f"{feature}: {score:.15f}")

feature_importance_intrinsic = calculate_intrinsic_importance(mlp_train, y_train, all_numeric_cols)
sorted_importance_intrinsic = sorted(feature_importance_intrinsic.items(), key=lambda item: item[1], reverse=True)
for feature, score in sorted_importance_intrinsic[:10]:
    print(f"{feature}: {score:.4f}")
    
for feature, weight in lime_explanation:
    print(f"Feature: {feature} | Weight: {weight:.4f}")


MLP: 2.111869773783312
FeR: 0.079347831991002
num_industry: 0.045576858048925
awei: 0.044838264154239
MR: 0.018977220566508
num_brick_field: 0.014194593213590
bui: 0.007192163417536
ndwi: 0.006094875653798
ndsi: 0.005192015143415
GNN: 0.000331639310161

Training a Gradient Boosting Regressor on the MLP features...
FeR: 0.8536
MR: 0.0093
ndsi: 0.0012
hydro_dist_ind: 0.0012
ndbsi: 0.0006
awei: 0.0005
bui: 0.0004
ndwi: 0.0001
hydro_dist_brick: 0.0001
num_industry: 0.0000
Feature: FeR <= 28222.33 | Weight: -207221.8520
Feature: hydro_dist_ind <= 509.29 | Weight: -27457.3500
Feature: CuR > -4240.00 | Weight: -26989.6129
Feature: SandR > -4266.97 | Weight: -26989.6129
Feature: SiltR > -4262.00 | Weight: -26989.6129
Feature: AsR > -4276.67 | Weight: -26989.6129
Feature: ClayR > -4266.17 | Weight: -26989.6129
Feature: ClayR > -4266.17 | Weight: -26989.6129
Feature: CrR > -4255.07 | Weight: -26989.6129
Feature: CrR > -4255.07 | Weight: -26989.6129
Feature: NiR > -4265.75 | Weight: -26989.6129
F

In [25]:
pd.read_csv("../../data/WinterSeason1.csv").columns


Index(['Stations', 'River', 'Lat', 'Long', 'geometry', 'hydro_dist_brick',
       'num_brick_field', 'hydro_dist_ind', 'num_industry', 'CrW', 'NiW',
       'CuW', 'AsW', 'CdW', 'PbW', 'MW', 'SandW', 'SiltW', 'ClayW', 'FeW',
       'RI'],
      dtype='object')