In [3]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, ConvLSTM2D, Flatten, Dense, Concatenate, Dropout, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import sys
from io import StringIO
import pickle

# Define the single buffer size to use
BUFFER_METERS = 500
# Define number of time steps for mock data
TIME_STEPS = 5

# ==================== 1. Load Data ==================== #
# NOTE: The data loading logic remains the same as in the original script.
orig = pd.read_csv("../../data/WinterSeason1.csv")
river_100 = pd.read_csv("../data/Samples_100W.csv")

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# Train-test split
train_orig = orig.sample(10, random_state=42)
test_orig = orig.drop(train_orig.index)
train_combined = pd.concat([river_100, train_orig], ignore_index=True)

# ==================== 2. Collect ALL Rasters ==================== #
# NOTE: This code assumes the rasters are not time-series.
# The `generate_mock_time_series` function below will create a time-series
# for demonstration purposes. In a real-world scenario, you would load
# different raster data for each time step.
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")

print(f"Using {len(raster_paths)} raster layers for CNN input.")
for r in raster_paths:
    print("  -", os.path.basename(r))

# ==================== 3. Create a Custom Data Generator ==================== #
def extract_patch_for_generator(coords, raster_files, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height):
    """
    Extracts a batch of patches from rasters for a given set of coordinates.
    This function is optimized to be called by the data generator for each batch.
    """
    patches = []
    # Loop through each coordinate pair in the batch
    for lon, lat in coords:
        channels = []
        # Loop through each raster file to get a single patch for each raster
        for rfile in raster_files:
            with rasterio.open(rfile) as src:
                try:
                    row, col = src.index(lon, lat)
                    win = Window(col - buffer_pixels_x, row - buffer_pixels_y, patch_width, patch_height)
                    arr = src.read(1, window=win, boundless=True, fill_value=0)
                    arr = arr.astype(np.float32)

                    if np.nanmax(arr) != 0:
                        arr /= np.nanmax(arr)
                except Exception as e:
                    print(f"Error processing {rfile} for coordinates ({lon}, {lat}): {e}")
                    arr = np.zeros((patch_width, patch_height), dtype=np.float32)
            channels.append(arr)
        patches.append(np.stack(channels, axis=-1))
    
    return np.array(patches)

def generate_mock_time_series(patches, time_steps):
    """
    Generates mock time-series data by stacking the same patch for 'time_steps'
    time steps. In a real-world scenario, you would have different rasters
    for each time step, and this function would not be needed.
    
    Input shape: (batch_size, height, width, channels)
    Output shape: (batch_size, time_steps, height, width, channels)
    """
    return np.stack([patches] * time_steps, axis=1)

class DataGenerator(Sequence):
    def __init__(self, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size=4, shuffle=True, time_steps=TIME_STEPS, **kwargs):
        super().__init__(**kwargs)
        self.coords = coords
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.raster_paths = raster_paths
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.buffer_meters = buffer_meters
        self.time_steps = time_steps

        # Pre-calculate patch size from the first raster
        with rasterio.open(raster_paths[0]) as src:
            res_x, res_y = src.res
            self.buffer_pixels_x = int(self.buffer_meters / res_x)
            self.buffer_pixels_y = int(self.buffer_meters / res_y)
            self.patch_width = 2 * self.buffer_pixels_x
            self.patch_height = 2 * self.buffer_pixels_y

        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_coords = self.coords[batch_indices]
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]

        # Extract CNN patches for the current batch
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            self.raster_paths,
            self.buffer_pixels_x,
            self.buffer_pixels_y,
            self.patch_width,
            self.patch_height
        )
        
        # Generate mock time-series data
        batch_cnn_time_series = generate_mock_time_series(batch_cnn, self.time_steps)

        return (batch_cnn_time_series, batch_mlp, batch_gnn), batch_y

# ==================== 4. Prepare GNN & MLP Input (only once) ==================== #
coords_train = train_combined[['Long','Lat']].values
coords_test = test_orig[['Long','Lat']].values
dist_mat_train = distance_matrix(coords_train, coords_train)
gnn_train = np.exp(-dist_mat_train/10)
dist_mat_test_train = distance_matrix(coords_test, coords_train)
gnn_test = np.exp(-dist_mat_test_train/10)

scaler = StandardScaler()
mlp_train = scaler.fit_transform(train_combined[numeric_cols])
mlp_test = scaler.transform(test_orig[numeric_cols])
y_train = train_combined['RI'].values
y_test = test_orig['RI'].values

# ==================== 5. Define Spatio-Temporal Model ==================== #
def build_spatio_temporal_model(time_series_shape, gnn_dim, mlp_dim):
    # Inputs for all branches
    cnn_input = Input(shape=time_series_shape, name="cnn_input")
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    
    # --- ConvLSTM2D Branch for Spatio-Temporal Data ---
    # `return_sequences=False` means we get the final output of the sequence
    conv_lstm_branch = ConvLSTM2D(
        filters=64,
        kernel_size=(3, 3),
        padding='same',
        return_sequences=False,
        activation='relu'
    )(cnn_input)
    
    # Flatten and get embedding
    cnn_embedding = Flatten()(conv_lstm_branch)
    cnn_embedding = Dense(128, activation="relu", name="cnn_embedding")(cnn_embedding)

    # --- MLP Branch with Embedding ---
    mlp_embedding = Dense(64, activation="relu")(mlp_input)
    mlp_embedding = Dense(32, activation="relu", name="mlp_embedding")(mlp_embedding)

    # --- GNN Branch with Embedding ---
    gnn_embedding = Dense(64, activation="relu")(gnn_input)
    gnn_embedding = Dense(32, activation="relu", name="gnn_embedding")(gnn_embedding)

    # --- Fusion ---
    combined_embedding = Concatenate(name="combined_embedding")([cnn_embedding, mlp_embedding, gnn_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined_embedding)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    # Build and compile the model
    model = Model(inputs=[cnn_input, mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, buffer_meters, time_steps, batch_size=4):
    """
    Evaluates the model on test data and returns R², RMSE, and predictions.
    """
    num_samples = len(y_test)
    y_pred_list = []
    
    with rasterio.open(raster_paths[0]) as src:
        res_x, res_y = src.res
        buffer_pixels_x = int(buffer_meters / res_x)
        buffer_pixels_y = int(buffer_meters / res_y)
        patch_width = 2 * buffer_pixels_x
        patch_height = 2 * buffer_pixels_y

    for i in range(0, num_samples, batch_size):
        batch_coords = coords_test[i:i+batch_size]
        batch_mlp = mlp_test[i:i+batch_size]
        batch_gnn = gnn_test[i:i+batch_size, :]
        
        batch_cnn = extract_patch_for_generator(
            batch_coords, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
        )
        batch_cnn_time_series = generate_mock_time_series(batch_cnn, time_steps)
        
        y_pred_list.append(model.predict((batch_cnn_time_series, batch_mlp, batch_gnn)).flatten())
        
    y_pred = np.concatenate(y_pred_list)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    return r2, rmse, y_pred

def calculate_mlp_feature_importance(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, buffer_meters, time_steps, numeric_cols, batch_size=4):
    """
    Calculates feature importance for MLP features using a permutation-based approach.
    """
    # First, get baseline performance on the original test set
    _, baseline_rmse, _ = evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, buffer_meters, time_steps, batch_size)
    
    feature_importances = {}
    
    # Iterate through each MLP feature
    for i, feature_name in enumerate(numeric_cols):
        print(f"Calculating importance for feature: {feature_name}")
        
        # Create a copy of the MLP test data to shuffle one feature
        shuffled_mlp_test = mlp_test.copy()
        
        # Shuffle the current feature's column
        np.random.shuffle(shuffled_mlp_test[:, i])
        
        # Evaluate model with shuffled data
        _, shuffled_rmse, _ = evaluate_model(model, coords_test, shuffled_mlp_test, gnn_test, y_test, raster_paths, buffer_meters, time_steps, batch_size)
        
        # The importance is the increase in RMSE
        importance = shuffled_rmse - baseline_rmse
        feature_importances[feature_name] = importance
        
    return feature_importances

# ==================== Run the Analysis ==================== #
# Redirect output to a string for later saving
old_stdout = sys.stdout
sys.stdout = captured_output = StringIO()

print("\n" + "="*80)
print(f"Analyzing CNN + LSTM Model with {TIME_STEPS} mock time steps")
print("="*80)

batch_size = 4
gnn_input_dim = len(coords_train)

# Calculate CNN patch shape based on the current buffer size
with rasterio.open(raster_paths[0]) as src:
    res_x, res_y = src.res
    buffer_pixels_x = int(BUFFER_METERS / res_x)
    patch_width = 2 * buffer_pixels_x
    time_series_shape = (TIME_STEPS, patch_width, patch_width, len(raster_paths))

mlp_input_dim = mlp_train.shape[1]

model = build_spatio_temporal_model(time_series_shape, gnn_input_dim, mlp_input_dim)
model.summary()

# ==================== 6. Create Data Generators ==================== #
train_generator = DataGenerator(
    coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
    raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
)

# ==================== 7. Train Model ==================== #
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=1,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=train_generator
)

# ==================== 8. Evaluate ==================== #
# Evaluate on training data
y_pred_train = model.predict(train_generator).flatten()
r2_train = r2_score(y_train[:len(y_pred_train)], y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train[:len(y_pred_train)], y_pred_train))

# Evaluate on test data
r2_test, rmse_test, y_pred_test = evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, TIME_STEPS, batch_size=batch_size)

print(f"\n Spatio-Temporal Model Performance ({BUFFER_METERS}m):")
print(f"R² Train: {r2_train:.4f} | RMSE Train: {rmse_train:.4f}")
print(f"R² Test: {r2_test:.4f} | RMSE Test: {rmse_test:.4f}")

# ==================== 9. Save all info to a folder ==================== #
# Restore standard output
sys.stdout = old_stdout
printed_output = captured_output.getvalue()

output_folder = "cnn_lstm"
os.makedirs(output_folder, exist_ok=True)
print(f"\nCreating folder: '{output_folder}' and saving results...")

# Save the model
model_path = os.path.join(output_folder, "spatio_temporal_model.keras")
model.save(model_path)
print(f"Model saved to: {model_path}")

# Save the predictions and true labels
np.save(os.path.join(output_folder, "y_train.npy"), y_train)
np.save(os.path.join(output_folder, "y_test.npy"), y_test)
np.save(os.path.join(output_folder, "y_pred_train.npy"), y_pred_train)
np.save(os.path.join(output_folder, "y_pred_test.npy"), y_pred_test)
print(f"Predictions and true labels saved as .npy files.")

# Save the printed output to a text file
output_path = os.path.join(output_folder, "analysis_output.txt")
with open(output_path, "w") as f:
    f.write(printed_output)
print(f"Analysis results saved to: {output_path}")

# ==================== 10. Calculate and save feature importance ==================== #
print("\n" + "="*80)
print("Calculating MLP Feature Importance...")
print("="*80)

mlp_importance = calculate_mlp_feature_importance(
    model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, TIME_STEPS, numeric_cols, batch_size=batch_size
)

# Save feature importance to a pickle file
importance_path = os.path.join(output_folder, "mlp_feature_importance.pkl")
with open(importance_path, 'wb') as f:
    pickle.dump(mlp_importance, f)

print(f"MLP Feature importance saved to: {importance_path}")
print("Feature Importances (change in RMSE):")
for feature, importance in mlp_importance.items():
    print(f"  - {feature}: {importance:.4f}")

print("\nAll information successfully saved.")

# Garbage collect to free up memory now that everything is saved
del model, history, train_generator
gc.collect()

Using 26 raster layers for CNN input.
  - bui.tif
  - ndsi.tif
  - savi.tif
  - ndbsi.tif
  - ui.tif
  - ndwi.tif
  - ndbi.tif
  - awei.tif
  - evi.tif
  - mndwi.tif
  - ndvi.tif
  - LULC2020.tif
  - LULC2021.tif
  - LULC2022.tif
  - LULC2019.tif
  - LULC2018.tif
  - LULC2017.tif
  - Pb_R.tif
  - ClayR.tif
  - SandR.tif
  - CdR.tif
  - CrR.tif
  - AsR.tif
  - SiltR.tif
  - CuR.tif
  - NiR.tif


ValueError: Input contains NaN.

In [2]:
import pandas as pd
import numpy as np
import glob
import os
import rasterio
from rasterio.windows import Window
from scipy.spatial import distance_matrix
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, ConvLSTM2D, Flatten, Dense, Concatenate, Dropout, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import gc # Import garbage collector
import sys
from io import StringIO
import pickle

# Define the single buffer size to use
BUFFER_METERS = 500
# Define number of time steps for mock data
TIME_STEPS = 5
# Define number of folds for cross-validation
K_FOLDS = 5

# ==================== 1. Load Data ==================== #
# NOTE: The data loading logic remains the same as in the original script.
# These paths are relative to the user's environment.
orig = pd.read_csv("../../data/WinterSeason1.csv")
river_100 = pd.read_csv("../data/Samples_100W.csv")

drop_cols = ['Stations','River','Lat','Long','geometry']
numeric_cols = orig.drop(columns=drop_cols).columns.drop('RI')

# This is the full dataset that will be used for k-fold cross-validation
full_combined_data = pd.concat([river_100, orig], ignore_index=True)

# NOTE: The original script split the data first. We'll use K-fold on the full
# combined data for a more robust evaluation and reserve a separate, final test set.
# For simplicity, we'll keep the test_orig split from the original script
# to serve as a final, independent evaluation.
test_orig = full_combined_data.sample(frac=0.2, random_state=42)
train_combined = full_combined_data.drop(test_orig.index)

print(f"Total training data for K-Fold: {len(train_combined)} samples")
print(f"Total independent test data: {len(test_orig)} samples")

# ==================== 2. Collect ALL Rasters ==================== #
# NOTE: This code assumes the rasters are not time-series.
# The `generate_mock_time_series` function below will create a time-series
# for demonstration purposes. In a real-world scenario, you would load
# different raster data for each time step.
raster_paths = []
raster_paths += glob.glob("../CalIndices/*.tif")
raster_paths += glob.glob("../LULCMerged/*.tif")
raster_paths += glob.glob("../IDW/*.tif")

print(f"Using {len(raster_paths)} raster layers for CNN input.")
for r in raster_paths:
    print("  -", os.path.basename(r))

# ==================== 3. Create a Custom Data Generator ==================== #
def extract_patch_for_generator(coords, raster_files, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height):
    """
    Extracts a batch of patches from rasters for a given set of coordinates.
    This function is optimized to be called by the data generator for each batch.
    """
    patches = []
    # Loop through each coordinate pair in the batch
    for lon, lat in coords:
        channels = []
        # Loop through each raster file to get a single patch for each raster
        for rfile in raster_files:
            with rasterio.open(rfile) as src:
                try:
                    row, col = src.index(lon, lat)
                    win = Window(col - buffer_pixels_x, row - buffer_pixels_y, patch_width, patch_height)
                    arr = src.read(1, window=win, boundless=True, fill_value=0)
                    arr = arr.astype(np.float32)

                    if np.nanmax(arr) != 0:
                        arr /= np.nanmax(arr)
                except Exception as e:
                    print(f"Error processing {rfile} for coordinates ({lon}, {lat}): {e}")
                    arr = np.zeros((patch_width, patch_height), dtype=np.float32)
                channels.append(arr)
        patches.append(np.stack(channels, axis=-1))
    
    return np.array(patches)

def generate_mock_time_series(patches, time_steps):
    """
    Generates mock time-series data by stacking the same patch for 'time_steps'
    time steps.
    
    Input shape: (batch_size, height, width, channels)
    Output shape: (batch_size, time_steps, height, width, channels)
    """
    return np.stack([patches] * time_steps, axis=1)

class DataGenerator(Sequence):
    def __init__(self, coords, mlp_data, gnn_data, y, raster_paths, buffer_meters, batch_size=4, shuffle=True, time_steps=TIME_STEPS, **kwargs):
        super().__init__(**kwargs)
        self.coords = coords
        self.mlp_data = mlp_data
        self.gnn_data = gnn_data
        self.y = y
        self.raster_paths = raster_paths
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.y))
        self.buffer_meters = buffer_meters
        self.time_steps = time_steps

        # Pre-calculate patch size from the first raster
        with rasterio.open(raster_paths[0]) as src:
            res_x, res_y = src.res
            self.buffer_pixels_x = int(self.buffer_meters / res_x)
            self.buffer_pixels_y = int(self.buffer_meters / res_y)
            self.patch_width = 2 * self.buffer_pixels_x
            self.patch_height = 2 * self.buffer_pixels_y

        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        # Get batch indices
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Get batch data
        batch_coords = self.coords[batch_indices]
        batch_mlp = self.mlp_data[batch_indices]
        batch_gnn = self.gnn_data[batch_indices, :]
        batch_y = self.y[batch_indices]

        # Extract CNN patches for the current batch
        batch_cnn = extract_patch_for_generator(
            batch_coords,
            self.raster_paths,
            self.buffer_pixels_x,
            self.buffer_pixels_y,
            self.patch_width,
            self.patch_height
        )
        
        # Generate mock time-series data
        batch_cnn_time_series = generate_mock_time_series(batch_cnn, self.time_steps)

        return (batch_cnn_time_series, batch_mlp, batch_gnn), batch_y

# ==================== 4. Define Spatio-Temporal Model ==================== #
def build_spatio_temporal_model(time_series_shape, gnn_dim, mlp_dim):
    # Inputs for all branches
    cnn_input = Input(shape=time_series_shape, name="cnn_input")
    mlp_input = Input(shape=(mlp_dim,), name="mlp_input")
    gnn_input = Input(shape=(gnn_dim,), name="gnn_input")
    
    # --- ConvLSTM2D Branch for Spatio-Temporal Data ---
    conv_lstm_branch = ConvLSTM2D(
        filters=64,
        kernel_size=(3, 3),
        padding='same',
        return_sequences=False,
        activation='relu'
    )(cnn_input)
    
    # Flatten and get embedding
    cnn_embedding = Flatten()(conv_lstm_branch)
    cnn_embedding = Dense(128, activation="relu", name="cnn_embedding")(cnn_embedding)

    # --- MLP Branch with Embedding ---
    mlp_embedding = Dense(64, activation="relu")(mlp_input)
    mlp_embedding = Dense(32, activation="relu", name="mlp_embedding")(mlp_embedding)

    # --- GNN Branch with Embedding ---
    gnn_embedding = Dense(64, activation="relu")(gnn_input)
    gnn_embedding = Dense(32, activation="relu", name="gnn_embedding")(gnn_embedding)

    # --- Fusion ---
    combined_embedding = Concatenate(name="combined_embedding")([cnn_embedding, mlp_embedding, gnn_embedding])
    
    # Final dense layers for prediction
    f = Dense(128, activation="relu")(combined_embedding)
    f = Dropout(0.4)(f)
    f = Dense(64, activation="relu")(f)
    output = Dense(1, activation="linear", name="final_output")(f)

    # Build and compile the model
    model = Model(inputs=[cnn_input, mlp_input, gnn_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0005), loss="mse")
    return model

def evaluate_model(model, coords_data, mlp_data, gnn_data, y_data, raster_paths, buffer_meters, time_steps, batch_size=4):
    """
    Evaluates the model on provided data and returns R², RMSE, MAE, SMAPE, and predictions.
    """
    num_samples = len(y_data)
    y_pred_list = []
    
    # Calculate patch size
    with rasterio.open(raster_paths[0]) as src:
        res_x, res_y = src.res
        buffer_pixels_x = int(buffer_meters / res_x)
        buffer_pixels_y = int(buffer_meters / res_y)
        patch_width = 2 * buffer_pixels_x
        patch_height = 2 * buffer_pixels_y

    for i in range(0, num_samples, batch_size):
        batch_coords = coords_data[i:i+batch_size]
        batch_mlp = mlp_data[i:i+batch_size]
        batch_gnn = gnn_data[i:i+batch_size, :]
        
        batch_cnn = extract_patch_for_generator(
            batch_coords, raster_paths, buffer_pixels_x, buffer_pixels_y, patch_width, patch_height
        )
        batch_cnn_time_series = generate_mock_time_series(batch_cnn, time_steps)
        
        y_pred_list.append(model.predict((batch_cnn_time_series, batch_mlp, batch_gnn)).flatten())
        
    y_pred = np.concatenate(y_pred_list)
    
    r2 = r2_score(y_data, y_pred)
    rmse = np.sqrt(mean_squared_error(y_data, y_pred))
    mae = mean_absolute_error(y_data, y_pred)
    
    # Calculate SMAPE
    smape_val = np.mean(2 * np.abs(y_pred - y_data) / (np.abs(y_data) + np.abs(y_pred) + 1e-8)) * 100
    
    return r2, rmse, mae, smape_val, y_pred

def calculate_mlp_feature_importance(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, buffer_meters, time_steps, numeric_cols, batch_size=4):
    """
    Calculates feature importance for MLP features using a permutation-based approach.
    """
    # First, get baseline performance on the original test set
    _, baseline_rmse, _, _, _ = evaluate_model(model, coords_test, mlp_test, gnn_test, y_test, raster_paths, buffer_meters, time_steps, batch_size)
    
    feature_importances = {}
    
    # Iterate through each MLP feature
    for i, feature_name in enumerate(numeric_cols):
        print(f"Calculating importance for feature: {feature_name}")
        
        # Create a copy of the MLP test data to shuffle one feature
        shuffled_mlp_test = mlp_test.copy()
        
        # Shuffle the current feature's column
        np.random.shuffle(shuffled_mlp_test[:, i])
        
        # Evaluate model with shuffled data
        _, shuffled_rmse, _, _, _ = evaluate_model(model, coords_test, shuffled_mlp_test, gnn_test, y_test, raster_paths, buffer_meters, time_steps, batch_size)
        
        # The importance is the increase in RMSE
        importance = shuffled_rmse - baseline_rmse
        feature_importances[feature_name] = importance
        
    return feature_importances

# ==================== Run the Analysis with K-Fold Cross-Validation ==================== #

print("\n" + "="*80)
print(f"Analyzing CNN + LSTM Model with {K_FOLDS}-Fold Cross-Validation")
print("="*80)

# Prepare all data once for the K-Fold splits
coords_all = train_combined[['Long', 'Lat']].values
gnn_all = np.exp(-distance_matrix(coords_all, coords_all) / 10)
scaler = StandardScaler()
mlp_all = scaler.fit_transform(train_combined[numeric_cols])
y_all = train_combined['RI'].values
batch_size = 4

# Initialize K-Fold splitter
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

# Lists to store metrics for each fold
r2_scores = []
mae_scores = []
rmse_scores = []
smape_scores = []
fold_models = []

# Calculate CNN patch shape based on the current buffer size
with rasterio.open(raster_paths[0]) as src:
    res_x, res_y = src.res
    buffer_pixels_x = int(BUFFER_METERS / res_x)
    patch_width = 2 * buffer_pixels_x
    time_series_shape = (TIME_STEPS, patch_width, patch_width, len(raster_paths))

mlp_input_dim = mlp_all.shape[1]
gnn_input_dim = gnn_all.shape[1]

# Create output directories
model_output_folder = os.path.join("models", "cnn_lstm")
os.makedirs(model_output_folder, exist_ok=True)

# Loop through each fold
for fold_num, (train_index, val_index) in enumerate(kf.split(coords_all)):
    print(f"\n--- Starting Fold {fold_num + 1}/{K_FOLDS} ---")
    
    # Split data for this fold
    coords_train, coords_val = coords_all[train_index], coords_all[val_index]
    mlp_train, mlp_val = mlp_all[train_index], mlp_all[val_index]
    gnn_train, gnn_val = gnn_all[train_index], gnn_all[val_index]
    y_train, y_val = y_all[train_index], y_all[val_index]
    
    # Build a fresh model for each fold
    model = build_spatio_temporal_model(time_series_shape, gnn_input_dim, mlp_input_dim)
    
    # Create data generators for this fold's split
    train_generator = DataGenerator(
        coords=coords_train, mlp_data=mlp_train, gnn_data=gnn_train, y=y_train,
        raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=True
    )
    val_generator = DataGenerator(
        coords=coords_val, mlp_data=mlp_val, gnn_data=gnn_val, y=y_val,
        raster_paths=raster_paths, buffer_meters=BUFFER_METERS, batch_size=batch_size, shuffle=False
    )
    
    # Train the model
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )
    
    history = model.fit(
        train_generator,
        epochs=1,
        verbose=1,
        callbacks=[early_stopping],
        validation_data=val_generator
    )
    
    # Evaluate the model on the validation set for this fold
    r2_val, rmse_val, mae_val, smape_val, _ = evaluate_model(
        model, coords_val, mlp_val, gnn_val, y_val, raster_paths, BUFFER_METERS, TIME_STEPS, batch_size=batch_size
    )
    
    print(f"Fold {fold_num + 1} Metrics:")
    print(f"  R²: {r2_val:.4f}")
    print(f"  RMSE: {rmse_val:.4f}")
    print(f"  MAE: {mae_val:.4f}")
    print(f"  SMAPE: {smape_val:.4f}%")
    
    r2_scores.append(r2_val)
    rmse_scores.append(rmse_val)
    mae_scores.append(mae_val)
    smape_scores.append(smape_val)
    
    # Save the model for the current fold
    model_path = os.path.join(model_output_folder, f"model{fold_num + 1}.keras")
#    model.save(model_path)
#    print(f"Model for Fold {fold_num + 1} saved to: {model_path}")
    fold_models.append(model)
    
    # Clear session and collect garbage to free up memory
    tf.keras.backend.clear_session()
    del model, history, train_generator, val_generator
    gc.collect()

# Calculate and print average metrics
print("\n" + "="*80)
print("K-Fold Cross-Validation Summary")
print("="*80)
print(f"Average R²: {np.mean(r2_scores):.4f} (+/- {np.std(r2_scores):.4f})")
print(f"Average RMSE: {np.mean(rmse_scores):.4f} (+/- {np.std(rmse_scores):.4f})")
print(f"Average MAE: {np.mean(mae_scores):.4f} (+/- {np.std(mae_scores):.4f})")
print(f"Average SMAPE: {np.mean(smape_scores):.4f}% (+/- {np.std(smape_scores):.4f})")

# Evaluate the best model (e.g., the last one trained) on the independent test set
print("\n" + "="*80)
print("Final Evaluation on Independent Test Set")
print("="*80)

# Prepare the final test data
coords_test = test_orig[['Long', 'Lat']].values
dist_mat_test_train = distance_matrix(coords_test, coords_all)
gnn_test = np.exp(-dist_mat_test_train/10)
mlp_test = scaler.transform(test_orig[numeric_cols])
y_test = test_orig['RI'].values

# Use the last trained model for final evaluation. A more robust approach would be to
# use an ensemble of all trained models, but this is simpler.
final_model = fold_models[-1]

r2_test, rmse_test, mae_test, smape_test, y_pred_test = evaluate_model(
    final_model, coords_test, mlp_test, gnn_test, y_test, raster_paths, BUFFER_METERS, TIME_STEPS, batch_size=batch_size
)

print(f"R² Test: {r2_test:.4f}")
print(f"RMSE Test: {rmse_test:.4f}")
print(f"MAE Test: {mae_test:.4f}")
print(f"SMAPE Test: {smape_test:.4f}%")

KeyError: "['Source'] not found in axis"