In [2]:
import rasterio
import rasterio.windows
import rasterio.features 
import numpy as np
import os
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points 
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from scipy.spatial.distance import cdist # For calculating distances between points
import networkx as nx 

# --- Configuration ---
# Base directory for all your GIS data files
GIS_BASE_DIR = "/Users/rakibhhridoy/Five_Rivers/gis" # <--- IMPORTANT: Update this path to your GIS data directory

# Subdirectories for LULC and CalIndices TIFF files
LULC_DIR = os.path.join(GIS_BASE_DIR, "LULCMerged")
CAL_INDICES_DIR = os.path.join(GIS_BASE_DIR, "CalIndices")

# Paths to your point shapefiles
SAMPLING_POINTS_PATH = os.path.join(GIS_BASE_DIR, "sampling_point.shp")
BRICK_FIELD_POINTS_PATH = os.path.join(GIS_BASE_DIR, "brick_field_point.shp")
INDUSTRY_POINTS_PATH = os.path.join(GIS_BASE_DIR, "industry_point.shp")

# Output CSV file paths for intermediate and final processed data
OUTPUT_LULC_VARIATIONS_CSV = os.path.join(GIS_BASE_DIR, "LULC_5km_Variations.csv")
OUTPUT_HYDRO_PROPERTIES_CSV = os.path.join(GIS_BASE_DIR, "Hydrological_Properties.csv")
OUTPUT_RASTER_FEATURES_CSV = os.path.join(GIS_BASE_DIR, "Raster_Derived_Features.csv")
OUTPUT_COMBINED_FEATURES_CSV = os.path.join(GIS_BASE_DIR, "Combined_Features_Scaled.csv")

In [3]:
LULC_YEARS = list(range(2017, 2023)) # Covers 2017, 2018, 2019, 2020, 2021, 2022
UNIFORM_BUFFER_RADIUS_METERS = 5000 # 5 km

# CNN Patch Size (pixels) - for extracting image patches around points for CNN branch
# This is a pixel dimension, not a real-world radius.
CNN_PATCH_SIZE = 32 # e.g., 32x32 pixels

# GNN Edge Definition: Distance threshold for connecting nodes (sampling stations)
# This creates a proximity graph as an approximation of hydrological connectivity.
GNN_EDGE_DISTANCE_THRESHOLD_METERS = 5000 # 5 km for connecting stations

# List of raster indices to extract neighborhood statistics from
RASTER_INDICES_TO_EXTRACT = [
    "awei.tif", "bui.tif", "evi.tif", "mndwi.tif", "ndbi.tif",
    "ndbsi.tif", "ndsi.tif", "ndvi.tif", "ndwi.tif", "savi.tif", "ui.tif"
]
# Path to DEM file
DEM_PATH = os.path.join(GIS_BASE_DIR, "DEMF.tif")

In [4]:
# --- Helper Functions ---
# These functions are designed to encapsulate common geospatial operations,
# making the main pipeline cleaner and more modular.

def load_raster(filepath, reference_crs):
    """
    Loads a single raster file. If its CRS does not match the reference_crs,
    it reprojects the raster to the reference_crs.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Raster file not found: {filepath}")
    
    with rasterio.open(filepath) as src:
        if src.crs != reference_crs:
            print(f"Reprojecting {os.path.basename(filepath)} from {src.crs} to {reference_crs}...")
            from rasterio.warp import reproject, Resampling
            
            # Calculate new transform and dimensions for reprojection
            transform, width, height = rasterio.windows.calculate_default_transform(
                src.crs, reference_crs, src.width, src.height, *src.bounds
            )
            reprojected_array = np.empty((height, width), dtype=src.dtypes)
            reproject(
                source=rasterio.band(src, 1),
                destination=reprojected_array,
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=reference_crs,
                resampling=Resampling.nearest # Use nearest for categorical LULC, bilinear for continuous DEM/indices
            )
            profile = src.profile.copy()
            profile.update({
                'crs': reference_crs,
                'transform': transform,
                'width': width,
                'height': height
            })
            return reprojected_array, profile
        else:
            return src.read(1), src.profile

def extract_neighborhood_stats(raster_array, raster_profile, point_geom, buffer_meters, stat_type='mean'):
    """
    Extracts neighborhood statistics (mean, stddev) from a raster for a given point buffer.
    This function crops the raster to the buffer's bounding box and then masks it with the circular buffer.
    """
    buffer_geom = point_geom.buffer(buffer_meters)
    
    # Get the window (bounding box in pixel coordinates) for the buffer
    window = rasterio.windows.from_bounds(*buffer_geom.bounds, transform=raster_profile['transform'])
    
    # Clamp window to raster dimensions to prevent out-of-bounds access
    row_start, row_stop = int(window.row_off), int(window.row_off + window.height)
    col_start, col_stop = int(window.col_off), int(window.col_off + window.width)
    
    row_start = max(0, row_start)
    row_stop = min(raster_array.shape[0], row_stop)
    col_start = max(0, col_start)
    col_stop = min(raster_array.shape[1], col_stop)

    if row_stop <= row_start or col_stop <= col_start:
        return np.nan # No valid overlap
    
    # Extract the subset of the raster array (rectangular crop)
    cropped_array = raster_array[row_start:row_stop, col_start:col_stop]
    # Create a new transform for this cropped array's extent
    cropped_transform = rasterio.windows.transform(window, raster_profile['transform'])

    # Rasterize the buffer geometry onto a new array of the same shape as cropped_array
    pixel_mask_for_buffer = rasterio.features.rasterize(
        [buffer_geom],
        out_shape=cropped_array.shape,
        transform=cropped_transform,
        fill=0,
        all_touched=False, # Only pixels strictly inside
        dtype=np.uint8
    ).astype(bool) # Convert to boolean mask (True for inside, False for outside)

    # Apply the pixel mask to the cropped array to get pixels within the circular buffer
    pixels_within_buffer = cropped_array[pixel_mask_for_buffer]
    
    if pixels_within_buffer.size > 0:
        if stat_type == 'mean':
            return np.mean(pixels_within_buffer)
        elif stat_type == 'std':
            return np.std(pixels_within_buffer)
        else:
            raise ValueError("Unsupported stat_type. Use 'mean' or 'std'.")
    else:
        return np.nan # No valid pixels in buffer

def extract_raster_patch(raster_array, raster_profile, point_geom, patch_size):
    """
    Extracts a square image patch around a point for CNN input.
    Handles boundary conditions by padding the raster.
    """
    # Convert point_geom to pixel coordinates
    col, row = raster_profile['transform'](point_geom.x, point_geom.y, op=~raster_profile['transform'])
    col, row = int(col), int(row)

    half_patch = patch_size // 2

    # Create a padded array to handle points near edges gracefully
    padded_array = np.pad(raster_array, half_patch, mode='reflect') 
    
    # Adjust coordinates for the padded array
    padded_row_start = row + half_patch - half_patch
    padded_row_end = row + half_patch + half_patch
    padded_col_start = col + half_patch - half_patch
    padded_col_end = col + half_patch + half_patch

    # Extract patch from padded array
    patch = padded_array[padded_row_start:padded_row_end, padded_col_start:padded_col_end]
    
    # Ensure the patch is exactly patch_size x patch_size
    if patch.shape != (patch_size, patch_size):
        print(f"Warning: Patch for point at ({point_geom.x}, {point_geom.y}) is not {patch_size}x{patch_size}. Returning NaN array.")
        return np.full((patch_size, patch_size), np.nan) # Return NaN array if extraction fails
    
    return patch

In [6]:
rainy_df = pd.read_csv("data/Hydro_LULC_Winter.csv")
rainy_df.head()

Unnamed: 0,Stations,River,Lat,Long,geometry,hydrological_dist_to_nearest_BF,num_upstream_BF,hydrological_dist_to_nearest_IND,num_upstream_IND,CrW,...,MW,SandW,SiltW,ClayW,FeW,variation_17_18,variation_18_19,variation_19_20,variation_20_21,variation_21_22
0,S1,Dhaleshwari,23.91026,90.229845,POINT (10044340.399756001 2742476.70627368),0.0002,6,0.00679,2,106.58,...,30.69,16,51,28,29400,14.396884,8.830319,11.964628,10.162627,10.226519
1,S2,Dhaleshwari,23.858227,90.240038,POINT (10045475.079325657 2736141.9436627394),2e-05,1,0.0,0,104.28,...,32.41,16,62,27,32100,12.920722,9.676226,13.489252,10.422661,11.249439
2,S3,Dhaleshwari,23.802571,90.24539,POINT (10046070.861240383 2729368.914036628),0.0002,6,0.0021,4,89.77,...,30.14,57,30,17,27970,11.966286,10.48311,15.459976,10.687158,11.446218
3,S4,Dhaleshwari,23.754298,90.246581,POINT (10046203.442753918 2723496.704977303),0.073026,2,0.027234,6,71.55,...,29.58,71,18,13,25780,21.385412,15.447605,20.766508,13.685776,13.207538
4,S5,Dhaleshwari,23.702157,90.277077,POINT (10049598.24194515 2717156.4153029486),0.074113,15,0.122163,5,100.15,...,33.78,75,19,9,36480,18.108141,14.249969,17.320514,12.567463,11.587011


In [7]:
rainy_df.columns

Index(['Stations', 'River', 'Lat', 'Long', 'geometry',
       'hydrological_dist_to_nearest_BF', 'num_upstream_BF',
       'hydrological_dist_to_nearest_IND', 'num_upstream_IND', 'CrW', 'NiW',
       'CuW', 'AsW', 'CdW', 'PbW', 'MW', 'SandW', 'SiltW', 'ClayW', 'FeW',
       'variation_17_18', 'variation_18_19', 'variation_19_20',
       'variation_20_21', 'variation_21_22'],
      dtype='object')

In [10]:
hm_col = ['Stations', 'River', 'Lat', 'Long', 'CrW', 'NiW',
       'CuW', 'AsW', 'CdW', 'PbW', 'FeW']

In [11]:
heavy_metal_df = rainy_df[hm_col]
heavy_metal_df.set_index('Stations', inplace=True)
heavy_metal_df.head()

Unnamed: 0_level_0,River,Lat,Long,CrW,NiW,CuW,AsW,CdW,PbW,FeW
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S1,Dhaleshwari,23.91026,90.229845,106.58,34.35,69.95,9.12,4.43,105.9,29400
S2,Dhaleshwari,23.858227,90.240038,104.28,27.1,75.12,13.79,3.82,96.14,32100
S3,Dhaleshwari,23.802571,90.24539,89.77,59.33,71.13,26.17,2.95,77.36,27970
S4,Dhaleshwari,23.754298,90.246581,71.55,49.17,92.34,25.35,2.36,90.48,25780
S5,Dhaleshwari,23.702157,90.277077,100.15,50.68,100.22,28.37,2.1,79.1,36480


In [12]:
print("\n--- Step 1.2: Loading LULC Rasters and establishing Reference CRS/Transform ---")
lulc_rasters = {}
lulc_profiles = {}
reference_crs = None
reference_transform = None
full_raster_height = None
full_raster_width = None

try:
    # Use the first LULC file to get the reference CRS and transform
    first_lulc_file = os.path.join(LULC_DIR, "LULC2017.tif")
    if not os.path.exists(first_lulc_file):
        raise FileNotFoundError(f"Reference LULC file not found: {first_lulc_file}")

    with rasterio.open(first_lulc_file) as src:
        reference_crs = src.crs
        reference_transform = src.transform
        full_raster_height, full_raster_width = src.shape

    for year in LULC_YEARS:
        filepath = os.path.join(LULC_DIR, f"LULC{year}.tif")
        # Load and reproject if necessary using the helper function
        lulc_rasters[year], lulc_profiles[year] = load_raster(filepath, reference_crs)
        
    if not lulc_rasters:
        raise ValueError("No LULC rasters were loaded. Check LULC_DIR and file names.")
        
    print(f"Loaded LULC rasters for years: {list(lulc_rasters.keys())}")
    print(f"Reference CRS (from LULC): {reference_crs}")

except Exception as e:
    print(f"Error loading LULC rasters or establishing reference: {e}")
    exit()


--- Step 1.2: Loading LULC Rasters and establishing Reference CRS/Transform ---
Loaded LULC rasters for years: [2017, 2018, 2019, 2020, 2021, 2022]
Reference CRS (from LULC): EPSG:32646


In [13]:
# Step 1.3: Load Sampling Points (reprojected to reference_crs)
try:
    sampling_points_gdf = gpd.read_file(SAMPLING_POINTS_PATH)
    if 'Stations' not in sampling_points_gdf.columns:
        sampling_points_gdf['Stations'] = [f"S{i+1}" for i in range(len(sampling_points_gdf))]
    sampling_points_gdf.set_index('Stations', inplace=True)

    if sampling_points_gdf.crs != reference_crs:
        sampling_points_gdf = sampling_points_gdf.to_crs(reference_crs)
        print(f"Reprojected sampling points from {sampling_points_gdf.crs} to {reference_crs}.")
    print(f"Loaded {len(sampling_points_gdf)} sampling points.")

except Exception as e:
    print(f"Error loading or reprojecting sampling points: {e}")
    exit()

Reprojected sampling points from PROJCS["WGS 84 / UTM zone 46N",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",93],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","32646"]] to EPSG:32646.
Loaded 17 sampling points.


## HYDRO & LULC

In [14]:
# --- Step 2: Calculate Hydrological and LULC Features ---
print("\n--- Step 2: Calculating Hydrological Properties and LULC Variations ---")

# Load Brick Field and Industry Points (reprojected to reference_crs)
brick_field_points_gdf = None
industry_points_gdf = None
try:
    brick_field_points_gdf = gpd.read_file(BRICK_FIELD_POINTS_PATH)
    industry_points_gdf = gpd.read_file(INDUSTRY_POINTS_PATH)

    if brick_field_points_gdf.crs != reference_crs:
        brick_field_points_gdf = brick_field_points_gdf.to_crs(reference_crs)
    if industry_points_gdf.crs != reference_crs:
        industry_points_gdf = industry_points_gdf.to_crs(reference_crs)
    print(f"Loaded {len(brick_field_points_gdf)} brick field points and {len(industry_points_gdf)} industry points.")
except Exception as e:
    print(f"Error loading or reprojecting brick field/industry points: {e}")
    exit()


--- Step 2: Calculating Hydrological Properties and LULC Variations ---
Loaded 270 brick field points and 195 industry points.


In [15]:
# Load DEM Raster
dem_array = None
dem_profile = None
try:
    dem_array, dem_profile = load_raster(DEM_PATH, reference_crs)
    print(f"Loaded DEM raster with shape: {dem_array.shape}")
except Exception as e:
    print(f"Error loading DEM raster: {e}")
    exit()

# Generate Binary Change Maps (Year-to-Year)
change_maps = {}
for i in range(len(LULC_YEARS) - 1):
    year1 = LULC_YEARS[i]
    year2 = LULC_YEARS[i+1]
    
    lulc_t1 = lulc_rasters[year1]
    lulc_t2 = lulc_rasters[year2]
    
    if lulc_t1.shape != lulc_t2.shape:
        print(f"Warning: LULC raster shapes mismatch for {year1} and {year2}. Skipping change map for this interval.")
        continue
    change_map_array = (lulc_t1 != lulc_t2).astype(np.uint8)
    change_maps[f'{year1}-{year2}'] = change_map_array

Loaded DEM raster with shape: (6266, 5764)


In [19]:
lulc_variation_results = []
hydro_properties_results = []
UNIFORM_BUFFER_RADIUS_METERS = 4200
OUTPUT_LULC_VARIATIONS_CSV = 'LULC_5km_Variations.csv'
OUTPUT_HYDRO_PROPERTIES_CSV = 'Hydrological_Properties.csv'


for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    
    point_lulc_results = {'Stations': station_id}
    point_hydro_results = {'Stations': station_id}

    # Calculate LULC Variations
    for interval, change_map_array in change_maps.items():
        try:
            proportion_changed = extract_neighborhood_stats(
                change_map_array, lulc_profiles[LULC_YEARS[0]], 
                point_geom, UNIFORM_BUFFER_RADIUS_METERS, stat_type='mean'
            )
            point_lulc_results[f'variation_{interval}'] = proportion_changed * 100
        except Exception as e:
            print(f"Error processing LULC for point {station_id} for interval {interval}: {e}")
            point_lulc_results[f'variation_{interval}'] = np.nan
    lulc_variation_results.append(point_lulc_results)

    # Calculate Hydrological Properties
    try:
        # Correctly get DEM Point Value using the inverse transform
        dem_transform_inv = ~dem_profile['transform']
        col, row = dem_transform_inv * (point_geom.x, point_geom.y)

        if 0 <= row < dem_array.shape[0] and 0 <= col < dem_array.shape[1]:
            point_hydro_results['dem_point_value'] = dem_array[int(row), int(col)]
        else:
            point_hydro_results['dem_point_value'] = np.nan

        # DEM Mean and StdDev in 5km radius
        mean_dem = extract_neighborhood_stats(dem_array, dem_profile, point_geom, UNIFORM_BUFFER_RADIUS_METERS, 'mean')
        std_dem = extract_neighborhood_stats(dem_array, dem_profile, point_geom, UNIFORM_BUFFER_RADIUS_METERS, 'std')
        point_hydro_results['dem_mean_5km'] = mean_dem
        point_hydro_results['dem_std_5km'] = std_dem

        # Euclidean Distance to Nearest Brick Field
        if not brick_field_points_gdf.empty:
            nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
            distance_bf = point_geom.distance(nearest_bf_geom)
            point_hydro_results['dist_to_nearest_BF'] = distance_bf
        else:
            point_hydro_results['dist_to_nearest_BF'] = np.nan

        # Euclidean Distance to Nearest Industry
        if not industry_points_gdf.empty:
            nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
            distance_ind = point_geom.distance(nearest_ind_geom)
            point_hydro_results['dist_to_nearest_IND'] = distance_ind
        else:
            point_hydro_results['dist_to_nearest_IND'] = np.nan

        # Count sources within 5km circular influence radius
        # This serves as a proxy for "num_upstream" without a true hydrological network.
        bf_in_radius = brick_field_points_gdf[brick_field_points_gdf.within(point_geom.buffer(UNIFORM_BUFFER_RADIUS_METERS))]
        point_hydro_results['num_within_5km_BF'] = len(bf_in_radius)
        
        ind_in_radius = industry_points_gdf[industry_points_gdf.within(point_geom.buffer(UNIFORM_BUFFER_RADIUS_METERS))]
        point_hydro_results['num_within_5km_IND'] = len(ind_in_radius)

    except Exception as e:
        print(f"Error processing hydrological features for point {station_id}: {e}")
        for key in ['dem_point_value', 'dem_mean_5km', 'dem_std_5km', 'dist_to_nearest_BF', 'dist_to_nearest_IND', 'num_within_5km_BF', 'num_within_5km_IND']:
            point_hydro_results[key] = np.nan
    
    hydro_properties_results.append(point_hydro_results)


lulc_variations_df_calculated = pd.DataFrame(lulc_variation_results).set_index('Stations')
lulc_variations_df_calculated.to_csv(OUTPUT_LULC_VARIATIONS_CSV)
print(f"LULC Variations calculated and saved to {OUTPUT_LULC_VARIATIONS_CSV}")

hydro_properties_df_calculated = pd.DataFrame(hydro_properties_results).set_index('Stations')
hydro_properties_df_calculated.to_csv(OUTPUT_HYDRO_PROPERTIES_CSV)
print(f"Hydrological Properties calculated and saved to {OUTPUT_HYDRO_PROPERTIES_CSV}")
print("\nFirst 5 rows of Hydrological Properties:")
print(hydro_properties_df_calculated.head())
hydro_properties_df_calculated

  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_

LULC Variations calculated and saved to LULC_5km_Variations.csv
Hydrological Properties calculated and saved to Hydrological_Properties.csv

First 5 rows of Hydrological Properties:
          dem_point_value  dem_mean_5km  dem_std_5km  dist_to_nearest_BF  \
Stations                                                                   
S1                  -45.0    -44.545135     3.966224         2647.402248   
S2                  -51.0    -45.338905     3.547604         3113.557901   
S3                  -52.0    -46.650517     2.648768         1841.694727   
S4                  -53.0    -47.438057     2.525883         2737.503779   
S5                  -47.0    -48.027908     2.480211         1637.673006   

          dist_to_nearest_IND  num_within_5km_BF  num_within_5km_IND  
Stations                                                              
S1                 522.326002                  3                  10  
S2                3000.625941                  2                   2  
S

  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_

Unnamed: 0_level_0,dem_point_value,dem_mean_5km,dem_std_5km,dist_to_nearest_BF,dist_to_nearest_IND,num_within_5km_BF,num_within_5km_IND
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1,-45.0,-44.545135,3.966224,2647.402248,522.326002,3,10
S2,-51.0,-45.338905,3.547604,3113.557901,3000.625941,2,2
S3,-52.0,-46.650517,2.648768,1841.694727,1298.585986,9,7
S4,-53.0,-47.438057,2.525883,2737.503779,317.052883,5,4
S5,-47.0,-48.027908,2.480211,1637.673006,3307.441888,1,2
S6,-51.0,-48.676369,2.290917,3880.16797,1519.752716,2,4
S7,-50.0,-48.972412,2.109034,311.431998,4735.791556,28,0
S8,-50.0,-46.129311,4.607535,719.967325,936.147517,4,4
S9,-51.0,-46.096786,5.321898,1772.54035,3219.036625,6,1
S10,-45.0,-46.316338,5.068447,1448.820004,803.257359,27,3


In [20]:
# --- Step 4: Extract Raster-Derived Environmental Indices (5km radius) ---
print(f"\n--- Step 4: Extracting Raster-Derived Environmental Indices within {UNIFORM_BUFFER_RADIUS_METERS/1000}km radius ---")

raster_features_results = []

# Load all specified raster indices
loaded_indices = {}
for index_file in RASTER_INDICES_TO_EXTRACT:
    try:
        index_path = os.path.join(CAL_INDICES_DIR, index_file)
        
        # Use rasterio's built-in reprojection
        with rasterio.open(index_path) as src:
            # Reproject to target CRS if needed
            if src.crs != reference_crs:
                transform, width, height = rasterio.warp.calculate_default_transform(
                    src.crs, reference_crs, src.width, src.height, *src.bounds
                )
                array = np.empty((height, width))
                rasterio.warp.reproject(
                    src.read(1),
                    array,
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=reference_crs,
                    resampling=rasterio.warp.Resampling.bilinear
                )
                profile = src.profile.copy()
                profile.update({
                    'crs': reference_crs,
                    'transform': transform,
                    'width': width,
                    'height': height
                })
            else:
                array = src.read(1)
                profile = src.profile
            
        loaded_indices[index_file] = {'array': array, 'profile': profile}
        print(f"Successfully loaded {index_file}")
    except Exception as e:
        print(f"Error loading {index_file}: {str(e)}. Skipping.")
        continue

# Make sure sampling_points_gdf has a 'Stations' column
if 'Stations' not in sampling_points_gdf.columns:
    sampling_points_gdf['Stations'] = [f'S{i+1}' for i in range(len(sampling_points_gdf))]

for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    station_name = point_row['Stations']  # Get station name from the row
    
    point_results = {'Stations': station_name}  # Use the station name from the DataFrame

    for index_file, data in loaded_indices.items():
        index_name = os.path.splitext(index_file)[0]  # Get base name without extension
        
        try:
            # Extract mean and std values
            mean_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'mean'
            )
            std_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'std'
            )
            
            point_results[f'{index_name}_Mean_5km'] = mean_val
            point_results[f'{index_name}_Std_5km'] = std_val
            
        except Exception as e:
            print(f"Error extracting {index_name} features for station {station_name}: {str(e)}")
            point_results[f'{index_name}_Mean_5km'] = np.nan
            point_results[f'{index_name}_Std_5km'] = np.nan
            
    raster_features_results.append(point_results)

# Create DataFrame and set 'Stations' as index
raster_features_df = pd.DataFrame(raster_features_results)
if not raster_features_df.empty:
    raster_features_df.set_index('Stations', inplace=True)
    raster_features_df.to_csv(OUTPUT_RASTER_FEATURES_CSV)
    print(f"Raster-Derived Features calculated and saved to {OUTPUT_RASTER_FEATURES_CSV}")
    print("\nFirst 5 rows of Raster-Derived Features:")
    print(raster_features_df.head())
else:
    print("Warning: No raster features were extracted. Output file not created.")


--- Step 4: Extracting Raster-Derived Environmental Indices within 4.2km radius ---
Successfully loaded awei.tif
Successfully loaded bui.tif
Successfully loaded evi.tif
Successfully loaded mndwi.tif
Successfully loaded ndbi.tif
Successfully loaded ndbsi.tif
Successfully loaded ndsi.tif
Successfully loaded ndvi.tif
Successfully loaded ndwi.tif
Successfully loaded savi.tif
Successfully loaded ui.tif
Raster-Derived Features calculated and saved to /Users/rakibhhridoy/Five_Rivers/gis/Raster_Derived_Features.csv

First 5 rows of Raster-Derived Features:
          awei_Mean_5km  awei_Std_5km  bui_Mean_5km  bui_Std_5km  \
Stations                                                           
S1            -0.969579      0.125666     -0.391875     0.268516   
S2            -0.975519      0.121869     -0.403841     0.271429   
S3            -0.968839      0.134986     -0.340932     0.295861   
S4            -0.966311      0.133928     -0.414045     0.287297   
S5            -0.975424      0.11493

In [21]:
raster_features_df

Unnamed: 0_level_0,awei_Mean_5km,awei_Std_5km,bui_Mean_5km,bui_Std_5km,evi_Mean_5km,evi_Std_5km,mndwi_Mean_5km,mndwi_Std_5km,ndbi_Mean_5km,ndbi_Std_5km,...,ndsi_Mean_5km,ndsi_Std_5km,ndvi_Mean_5km,ndvi_Std_5km,ndwi_Mean_5km,ndwi_Std_5km,savi_Mean_5km,savi_Std_5km,ui_Mean_5km,ui_Std_5km
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S1,-0.969579,0.125666,-0.391875,0.268516,0.36966,0.196753,-0.258457,0.084574,-0.06543,0.110392,...,-0.258457,0.084574,0.326525,0.166733,-0.31466,0.140318,0.268071,0.138524,-0.196237,0.144225
S2,-0.975519,0.121869,-0.403841,0.271429,0.382916,0.195662,-0.267288,0.083496,-0.06796,0.111399,...,-0.267288,0.083496,0.335921,0.168417,-0.325435,0.140863,0.278085,0.13828,-0.197253,0.149461
S3,-0.968839,0.134986,-0.340932,0.295861,0.350675,0.213069,-0.267594,0.087344,-0.038436,0.116189,...,-0.267594,0.087344,0.302695,0.186979,-0.297655,0.161455,0.254292,0.152978,-0.148642,0.159921
S4,-0.966311,0.133928,-0.414045,0.287297,0.40503,0.208361,-0.277331,0.084295,-0.066,0.109603,...,-0.277331,0.084295,0.348139,0.183284,-0.331705,0.160568,0.293939,0.149119,-0.196545,0.146529
S5,-0.975424,0.114931,-0.385784,0.276382,0.386913,0.206132,-0.277028,0.07494,-0.052412,0.102698,...,-0.277028,0.07494,0.333334,0.177635,-0.319583,0.149776,0.279229,0.146328,-0.181397,0.13916
S6,-0.970387,0.117693,-0.419937,0.27577,0.402775,0.207737,-0.266767,0.074856,-0.071562,0.109601,...,-0.266767,0.074856,0.348554,0.172189,-0.327271,0.145244,0.289568,0.145474,-0.206157,0.145823
S7,-0.962058,0.136722,-0.368875,0.276895,0.37261,0.204317,-0.267226,0.083069,-0.047519,0.109168,...,-0.267226,0.083069,0.321361,0.174628,-0.306425,0.148536,0.267837,0.144281,-0.166605,0.147925
S8,-0.911295,0.215607,-0.273451,0.281583,0.288726,0.219451,-0.22536,0.127602,-0.024565,0.117395,...,-0.22536,0.127602,0.249471,0.184091,-0.245368,0.164379,0.205817,0.15241,-0.123441,0.158337
S9,-0.908223,0.213117,-0.268975,0.298324,0.278238,0.225503,-0.212291,0.120741,-0.02925,0.12594,...,-0.212291,0.120741,0.240201,0.188328,-0.236469,0.16602,0.198267,0.157254,-0.128814,0.170339
S10,-0.925424,0.193496,-0.185043,0.296732,0.222989,0.217097,-0.211518,0.107006,0.006872,0.124079,...,-0.211518,0.107006,0.192188,0.185083,-0.201415,0.161885,0.158959,0.15216,-0.070563,0.169799


In [26]:
# --- Step 4: Extract Raster-Derived Environmental Indices (5km radius) ---
UNIFORM_BUFFER_RADIUS_METERS = 5000

raster_features_results = []

# Load all specified raster indices
loaded_indices = {}
for index_file in RASTER_INDICES_TO_EXTRACT:
    try:
        index_path = os.path.join(CAL_INDICES_DIR, index_file)
        
        # Use rasterio's built-in reprojection
        with rasterio.open(index_path) as src:
            # Reproject to target CRS if needed
            if src.crs != reference_crs:
                transform, width, height = rasterio.warp.calculate_default_transform(
                    src.crs, reference_crs, src.width, src.height, *src.bounds
                )
                array = np.empty((height, width))
                rasterio.warp.reproject(
                    src.read(1),
                    array,
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=reference_crs,
                    resampling=rasterio.warp.Resampling.bilinear
                )
                profile = src.profile.copy()
                profile.update({
                    'crs': reference_crs,
                    'transform': transform,
                    'width': width,
                    'height': height
                })
            else:
                array = src.read(1)
                profile = src.profile
            
        loaded_indices[index_file] = {'array': array, 'profile': profile}
        print(f"Successfully loaded {index_file}")
    except Exception as e:
        print(f"Error loading {index_file}: {str(e)}. Skipping.")
        continue

# Make sure sampling_points_gdf has a 'Stations' column
if 'Stations' not in sampling_points_gdf.columns:
    sampling_points_gdf['Stations'] = [f'S{i+1}' for i in range(len(sampling_points_gdf))]

for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    station_name = point_row['Stations']  # Get station name from the row
    
    point_results = {'Stations': station_name}  # Use the station name from the DataFrame

    for index_file, data in loaded_indices.items():
        index_name = os.path.splitext(index_file)[0]  # Get base name without extension
        
        try:
            # Extract mean and std values
            mean_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'mean'
            )
            std_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'std'
            )
            
            point_results[f'{index_name}_Mean_5km'] = mean_val
            point_results[f'{index_name}_Std_5km'] = std_val
            
        except Exception as e:
            print(f"Error extracting {index_name} features for station {station_name}: {str(e)}")
            point_results[f'{index_name}_Mean_5km'] = np.nan
            point_results[f'{index_name}_Std_5km'] = np.nan
            
    raster_features_results.append(point_results)

# Create DataFrame and set 'Stations' as index
raster_features_df = pd.DataFrame(raster_features_results)
if not raster_features_df.empty:
    raster_features_df.set_index('Stations', inplace=True)
    raster_features_df.to_csv(OUTPUT_RASTER_FEATURES_CSV)
    print(f"Raster-Derived Features calculated and saved to {OUTPUT_RASTER_FEATURES_CSV}")
    print("\nFirst 5 rows of Raster-Derived Features:")
    print(raster_features_df.head())
else:
    print("Warning: No raster features were extracted. Output file not created.")
raster_features_df

Successfully loaded awei.tif
Successfully loaded bui.tif
Successfully loaded evi.tif
Successfully loaded mndwi.tif
Successfully loaded ndbi.tif
Successfully loaded ndbsi.tif
Successfully loaded ndsi.tif
Successfully loaded ndvi.tif
Successfully loaded ndwi.tif
Successfully loaded savi.tif
Successfully loaded ui.tif
Raster-Derived Features calculated and saved to /Users/rakibhhridoy/Five_Rivers/gis/Raster_Derived_Features.csv

First 5 rows of Raster-Derived Features:
          awei_Mean_5km  awei_Std_5km  bui_Mean_5km  bui_Std_5km  \
Stations                                                           
S1            -0.969734      0.125220     -0.392400     0.268581   
S2            -0.975545      0.121653     -0.404656     0.270563   
S3            -0.969130      0.133851     -0.344786     0.295459   
S4            -0.966623      0.133369     -0.413579     0.287817   
S5            -0.975206      0.115340     -0.385491     0.277192   

          evi_Mean_5km  evi_Std_5km  mndwi_Mean_5km 

Unnamed: 0_level_0,awei_Mean_5km,awei_Std_5km,bui_Mean_5km,bui_Std_5km,evi_Mean_5km,evi_Std_5km,mndwi_Mean_5km,mndwi_Std_5km,ndbi_Mean_5km,ndbi_Std_5km,...,ndsi_Mean_5km,ndsi_Std_5km,ndvi_Mean_5km,ndvi_Std_5km,ndwi_Mean_5km,ndwi_Std_5km,savi_Mean_5km,savi_Std_5km,ui_Mean_5km,ui_Std_5km
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S1,-0.969734,0.12522,-0.3924,0.268581,0.370143,0.196729,-0.258778,0.084377,-0.065579,0.110679,...,-0.258778,0.084377,0.326918,0.16653,-0.315127,0.139971,0.268463,0.138414,-0.196683,0.144285
S2,-0.975545,0.121653,-0.404656,0.270563,0.38359,0.195119,-0.267662,0.083385,-0.068138,0.11101,...,-0.267662,0.083385,0.336557,0.167914,-0.325971,0.140518,0.278609,0.137878,-0.197708,0.148793
S3,-0.96913,0.133851,-0.344786,0.295459,0.353004,0.212681,-0.268019,0.086924,-0.040037,0.116289,...,-0.268019,0.086924,0.304957,0.186484,-0.299539,0.160875,0.256057,0.152663,-0.15111,0.159738
S4,-0.966623,0.133369,-0.413579,0.287817,0.404575,0.208671,-0.277133,0.084082,-0.065932,0.110034,...,-0.277133,0.084082,0.347748,0.18341,-0.331451,0.160435,0.293582,0.149312,-0.19633,0.147066
S5,-0.975206,0.11534,-0.385491,0.277192,0.386739,0.206494,-0.27688,0.075112,-0.052333,0.103177,...,-0.27688,0.075112,0.333129,0.178004,-0.319351,0.150169,0.279081,0.14661,-0.18114,0.139715
S6,-0.970859,0.116927,-0.420915,0.275077,0.403468,0.207132,-0.26724,0.074591,-0.0718,0.109332,...,-0.26724,0.074591,0.349287,0.171751,-0.327956,0.144789,0.290144,0.145079,-0.206604,0.145446
S7,-0.962687,0.135294,-0.3697,0.276092,0.373285,0.203899,-0.267627,0.08263,-0.047726,0.108989,...,-0.267627,0.08263,0.321981,0.174021,-0.307016,0.147871,0.268282,0.143934,-0.167008,0.147687
S8,-0.912257,0.214371,-0.273639,0.281524,0.289075,0.219483,-0.225617,0.12709,-0.024594,0.1174,...,-0.225617,0.12709,0.249634,0.183916,-0.245642,0.164066,0.205987,0.152365,-0.123514,0.158304
S9,-0.911569,0.209866,-0.268629,0.297243,0.27909,0.224655,-0.214845,0.119808,-0.02806,0.125581,...,-0.214845,0.119808,0.241047,0.187538,-0.237883,0.165049,0.19898,0.156682,-0.127595,0.169854
S10,-0.926832,0.191991,-0.186091,0.295221,0.22411,0.216209,-0.21266,0.106769,0.006925,0.123143,...,-0.21266,0.106769,0.193279,0.18446,-0.202531,0.161284,0.15981,0.151603,-0.070946,0.168767


In [22]:
# --- Step 4: Consolidate All Features, Scale Data, and Create Multi-Modal Inputs ---
print("\n--- Step 4: Consolidating All Features and Creating Multi-Modal Inputs ---")

# Step 4.1: Consolidate all dataframes
combined_features_df = heavy_metal_df.copy()
combined_features_df = combined_features_df.merge(lulc_variations_df_calculated, left_index=True, right_index=True, how='left')
combined_features_df = combined_features_df.merge(hydro_properties_df_calculated, left_index=True, right_index=True, how='left')
combined_features_df = combined_features_df.merge(raster_features_df, left_index=True, right_index=True, how='left')
combined_features_df.to_csv(OUTPUT_COMBINED_FEATURES_CSV)


--- Step 4: Consolidating All Features and Creating Multi-Modal Inputs ---


In [None]:
print(f"Combined features shape: {combined_features_df.shape}")
print("\nFirst 5 rows of Combined Features:")
print(combined_features_df)

# Step 4.2: Prepare Inputs for each model type
# Split data into features (X) and target (y)
X_df = combined_features_df.drop(columns=['AsR']) # Example: Dropping heavy metal columns
y_df = combined_features_df['AsR']
print(f"Features (X) shape: {X_df.shape}, Targets (y) shape: {y_df.shape}")

In [30]:
# MLP Input: Scaled Tabular Data
# Drop non-numerical columns for scaling
X_tabular_numeric = X_df.select_dtypes(include=np.number)
scaler = StandardScaler()
X_mlp_input = scaler.fit_transform(X_tabular_numeric)
print(f"MLP Input (Scaled Tabular Features) shape: {X_mlp_input.shape}")

MLP Input (Scaled Tabular Features) shape: (17, 36)


In [34]:
def extract_raster_patch(raster_array, raster_profile, point_geom, patch_size):
    """
    Extracts a square image patch around a point for CNN input.
    Handles boundary conditions by padding the raster.
    """
    # Convert point_geom to pixel coordinates
    transform = raster_profile['transform']
    col, row = ~transform * (point_geom.x, point_geom.y)  # Correct coordinate transformation
    col, row = int(round(col)), int(round(row))
    
    half_patch = patch_size // 2
    
    # Get array dimensions
    height, width = raster_array.shape
    
    # Calculate bounds with padding if needed
    row_start = max(0, row - half_patch)
    row_end = min(height, row + half_patch + 1)
    col_start = max(0, col - half_patch)
    col_end = min(width, col + half_patch + 1)
    
    # Extract the patch
    patch = raster_array[row_start:row_end, col_start:col_end]
    
    # Pad if necessary (when near edges)
    pad_width = (
        (max(0, half_patch - row), max(0, row + half_patch + 1 - height)),
        (max(0, half_patch - col), max(0, col + half_patch + 1 - width))
    )
    
    if any(p > 0 for p in sum(pad_width, ())):
        patch = np.pad(patch, pad_width, mode='constant', constant_values=0)
    
    return patch

# CNN Input: Stacked Raster Patches
X_cnn_input = {}
for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    patches = []
    
    # Create a patch for each raster (DEM, LULC2022, all indices)
    for year in LULC_YEARS:
        try:
            lulc_array = lulc_rasters[year]
            lulc_profile = lulc_profiles[year]
            patch = extract_raster_patch(lulc_array, lulc_profile, point_geom, CNN_PATCH_SIZE)
            patches.append(patch)
        except Exception as e:
            print(f"Error extracting LULC {year} patch for station {station_id}: {str(e)}")
            patches.append(np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE)))
    
    try:
        dem_patch = extract_raster_patch(dem_array, dem_profile, point_geom, CNN_PATCH_SIZE)
        patches.append(dem_patch)
    except Exception as e:
        print(f"Error extracting DEM patch for station {station_id}: {str(e)}")
        patches.append(np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE)))
    
    for index_file, data in loaded_indices.items():
        try:
            index_patch = extract_raster_patch(data['array'], data['profile'], point_geom, CNN_PATCH_SIZE)
            patches.append(index_patch)
        except Exception as e:
            print(f"Error extracting {index_file} patch for station {station_id}: {str(e)}")
            patches.append(np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE)))
    
    # Stack the patches to create a multi-channel image input
    try:
        stacked_patches = np.stack(patches, axis=-1)
        # Handle NaN values if any patches failed to extract correctly
        stacked_patches = np.nan_to_num(stacked_patches, nan=0.0)
        X_cnn_input[station_id] = stacked_patches
    except Exception as e:
        print(f"Error stacking patches for station {station_id}: {str(e)}")
        X_cnn_input[station_id] = np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE, len(patches)))

# Convert to numpy array
X_cnn_input_list = np.array(list(X_cnn_input.values()))
print(f"CNN Input (Stacked Raster Patches) shape: {X_cnn_input_list.shape}")

CNN Input (Stacked Raster Patches) shape: (17, 33, 33, 18)


In [35]:
# GNN Input: Graph Structure and Node Features
# Create a proximity graph based on GNN_EDGE_DISTANCE_THRESHOLD_METERS
G = nx.Graph()
station_points = {station: point.coords[0] for station, point in sampling_points_gdf['geometry'].items()}

# Add nodes with their features
for station_id, features in X_tabular_numeric.iterrows():
    G.add_node(station_id, features=features.to_dict())

# Add edges based on proximity
station_ids = list(station_points.keys())
distances = cdist(list(station_points.values()), list(station_points.values()))
for i in range(len(station_ids)):
    for j in range(i + 1, len(station_ids)):
        if distances[i, j] <= GNN_EDGE_DISTANCE_THRESHOLD_METERS:
            G.add_edge(station_ids[i], station_ids[j], weight=distances[i, j])

print(f"GNN Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


GNN Graph created with 17 nodes and 2 edges.


In [49]:
y_df, X_df

(Stations
 S1      7.96
 S2      9.88
 S3     15.48
 S4     18.77
 S5     20.96
 S6     10.42
 S7     19.88
 S8     11.36
 S9     10.78
 S10    18.90
 S11     9.88
 S12     7.98
 S13     6.67
 S14     6.90
 S15    15.71
 S16    13.92
 S17    15.84
 Name: AsR, dtype: float64,
                  River        Lat       Long    CrR    NiR     CuR    CdR  \
 Stations                                                                    
 S1         Dhaleshwari  23.910260  90.229845  92.69  19.18   40.34   2.66   
 S2         Dhaleshwari  23.858227  90.240038  88.40  17.21   41.56   2.97   
 S3         Dhaleshwari  23.802571  90.245390  66.92  37.52   49.47   2.10   
 S4         Dhaleshwari  23.754298  90.246581  55.56  26.08   69.77   1.79   
 S5         Dhaleshwari  23.702157  90.277077  64.50  30.62   73.19   1.45   
 S6         Dhaleshwari  23.657826  90.317763  60.35  11.72   98.28   0.82   
 S7         Dhaleshwari  23.628000  90.388647  69.37  14.95   96.72   0.97   
 S8               Tura

In [50]:
X_df = X_df.drop(columns=["River", "Lat", "Long"])
X_df.head()

Unnamed: 0_level_0,CrR,NiR,CuR,CdR,PbR,FeR,variation_2017-2018,variation_2018-2019,variation_2019-2020,variation_2020-2021,...,ndsi_Mean_5km,ndsi_Std_5km,ndvi_Mean_5km,ndvi_Std_5km,ndwi_Mean_5km,ndwi_Std_5km,savi_Mean_5km,savi_Std_5km,ui_Mean_5km,ui_Std_5km
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S1,92.69,19.18,40.34,2.66,50.73,26700,14.396884,8.830319,11.964628,10.162627,...,-0.258778,0.084377,0.326918,0.16653,-0.315127,0.139971,0.268463,0.138414,-0.196683,0.144285
S2,88.4,17.21,41.56,2.97,38.9,34970,12.920722,9.676226,13.489252,10.422661,...,-0.267662,0.083385,0.336557,0.167914,-0.325971,0.140518,0.278609,0.137878,-0.197708,0.148793
S3,66.92,37.52,49.47,2.1,32.79,23970,11.966286,10.48311,15.459976,10.687158,...,-0.268019,0.086924,0.304957,0.186484,-0.299539,0.160875,0.256057,0.152663,-0.15111,0.159738
S4,55.56,26.08,69.77,1.79,43.4,23990,21.385412,15.447605,20.766508,13.685776,...,-0.277133,0.084082,0.347748,0.18341,-0.331451,0.160435,0.293582,0.149312,-0.19633,0.147066
S5,64.5,30.62,73.19,1.45,53.55,35130,18.108141,14.249969,17.320514,12.567463,...,-0.27688,0.075112,0.333129,0.178004,-0.319351,0.150169,0.279081,0.14661,-0.18114,0.139715


In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# --- Placeholder Tabular Data (REPLACE THIS WITH YOUR ACTUAL DATA) ---
# This data is for demonstration and should be replaced with your loaded X_df and y_df.
# Example: 100 samples and 20 features
X_df = pd.DataFrame(np.random.rand(100, 20), columns=[f'feature_{i}' for i in range(20)])
y_df = pd.DataFrame(np.random.rand(100, 1) * 100, columns=['target'])

# --- Data Preparation ---
# CRITICAL FIX: Ensure X_df and y_df have the same number of samples
if len(X_df) != len(y_df):
    raise ValueError("X_df and y_df must have the same number of rows.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Scale the training and test features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Modeling with Tabular Data ---

# 1. Classical Machine Learning Model (Random Forest)
print("--- Training and Evaluating: Random Forest ---")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# The .ravel() call is correct for scikit-learn's target variable format
rf_model.fit(X_train_scaled, y_train.values.ravel())
rf_predictions = rf_model.predict(X_test_scaled)

print(f"R-squared: {r2_score(y_test, rf_predictions):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, rf_predictions):.4f}")

# 2. Hybrid ML + PMF Model (Conceptual)
print("\n--- Training and Evaluating: Hybrid RF + PMF ---")
# Replace this placeholder `pmf_factors` with your actual PMF results.
pmf_factors = np.random.rand(len(X_df), 3) # Assume 3 PMF factors as new features

# CRITICAL FIX: Ensure the number of rows in pmf_factors matches X_df
if len(X_df) != len(pmf_factors):
    raise ValueError("X_df and pmf_factors must have the same number of rows.")

X_hybrid = np.hstack([X_df.values, pmf_factors])

X_hybrid_train, X_hybrid_test, _, _ = train_test_split(X_hybrid, y_df, test_size=0.2, random_state=42)

rf_hybrid = RandomForestRegressor(n_estimators=100, random_state=42)
rf_hybrid.fit(X_hybrid_train, y_train.values.ravel())
rf_hybrid_predictions = rf_hybrid.predict(X_hybrid_test)

print(f"R-squared: {r2_score(y_test, rf_hybrid_predictions):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, rf_hybrid_predictions):.4f}")

--- Training and Evaluating: Random Forest ---
R-squared: -0.0642
Mean Squared Error: 740.9398

--- Training and Evaluating: Hybrid RF + PMF ---
R-squared: -0.1579
Mean Squared Error: 806.1476


## CNN-GNN-MLP

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import os
import rasterio
from rasterio.transform import Affine
from shapely.geometry import Point
from glob import glob

# --- 1. Data Loading and Preparation Functions ---

def load_combined_features(file_path):
    """
    Loads the main tabular data from the Combined_Features_Rainy.csv file.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Successfully loaded {file_path}")
        return df
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure the path is correct.")
        return None

def extract_raster_patch(raster_array, raster_profile, point_geom, patch_size):
    """
    Extracts a square image patch around a point for CNN input.
    Handles boundary conditions by padding the raster.
    """
    try:
        # Get pixel coordinates from georeferenced coordinates
        col, row = ~raster_profile['transform'] * (point_geom.x, point_geom.y)
        col, row = int(col), int(row)
        
        half_patch = patch_size // 2
        start_row, end_row = row - half_patch, row + half_patch
        start_col, end_col = col - half_patch, col + half_patch

        # Pad the array to handle points near the edge
        padded_array = np.pad(
            raster_array, 
            half_patch, 
            mode='constant', 
            constant_values=raster_profile.get('nodata', 0.0)
        )

        padded_start_row = start_row + half_patch
        padded_end_row = end_row + half_patch
        padded_start_col = start_col + half_patch
        padded_end_col = end_col + half_patch

        patch = padded_array[padded_start_row:padded_end_row, padded_start_col:padded_end_col]
        
        return patch

    except Exception as e:
        print(f"Error extracting patch: {e}")
        return np.zeros((patch_size, patch_size))


def prepare_cnn_input(df, raster_base_path, patch_size=32):
    """
    Loads all rasters and creates a stacked, multi-channel input for the CNN.
    """
    print("\nPreparing CNN input...")
    
    # Get all .tif files from the specified directories
    indices_files = sorted(glob(os.path.join(raster_base_path, 'CalIndices', '*.tif')))
    lulc_files = sorted(glob(os.path.join(raster_base_path, 'LULCMerged', '*.tif')))
    
    all_raster_files = indices_files + lulc_files
    
    if not all_raster_files:
        raise FileNotFoundError("No .tif files found in Raster/CalIndices or Raster/LULCMerged.")

    # Load all raster data and profiles into memory
    loaded_rasters = {}
    for file_path in all_raster_files:
        with rasterio.open(file_path) as src:
            loaded_rasters[file_path] = {
                'array': src.read(1),
                'profile': src.profile
            }
    
    # Check for consistency
    if not loaded_rasters:
        raise ValueError("No rasters could be loaded.")
    
    # Extract patches for each station and stack them
    cnn_input_dict = {}
    for index, row in df.iterrows():
        point_geom = Point(row['Long'], row['Lat'])
        patches = []
        for file_path, data in loaded_rasters.items():
            patches.append(extract_raster_patch(data['array'], data['profile'], point_geom, patch_size))

        stacked_patches = np.stack(patches, axis=-1)
        stacked_patches[np.isnan(stacked_patches)] = 0.0 # Handle NaNs
        cnn_input_dict[index] = stacked_patches
    
    cnn_input_array = np.array(list(cnn_input_dict.values()))
    print(f"CNN Input shape: {cnn_input_array.shape}")
    return cnn_input_array

def prepare_gnn_input(df, node_features, distance_threshold=5000):
    """
    Prepares the graph data (adjacency matrix and node features) for the GNN.
    The graph is created based on Euclidean distance between stations.
    
    Args:
        df (pd.DataFrame): DataFrame containing 'Lat' and 'Long' for spatial graph construction.
        node_features (np.ndarray): The tabular data to be used as node features.
        distance_threshold (int): The maximum distance (in meters) to form an edge.
    """
    print("\nPreparing GNN input...")
    num_samples = len(df)
    
    # Use the provided features as node features
    # These are the scaled hydrological and indices columns as requested
    
    # Create a distance-based adjacency matrix using the sampling locations
    coords = df[['Long', 'Lat']].values
    dist_matrix = np.sqrt(np.sum((coords[:, np.newaxis] - coords[np.newaxis, :])**2, axis=2))
    
    # Create adjacency matrix based on a distance threshold (e.g., 5km)
    # This forms a "strong" graph by connecting nearby sampling sites.
    adjacency_matrix = (dist_matrix < distance_threshold).astype(int)
    np.fill_diagonal(adjacency_matrix, 0)
    
    # Convert to sparse tensor for efficient GNN processing
    adjacency_matrix_sparse = tf.sparse.from_dense(adjacency_matrix)
    print("GNN graph data prepared.")
    return node_features, adjacency_matrix_sparse


In [27]:
# --- 2. Custom GNN Layer ---
# A simplified graph convolutional layer for neighborhood aggregation.
class CustomGraphConv(layers.Layer):
    def __init__(self, units, activation='relu', **kwargs):
        super(CustomGraphConv, self).__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        self.kernel = self.add_weight(
            shape=(input_shape[1], self.units),
            initializer="glorot_uniform",
            trainable=True,
            name="kernel"
        )
        self.bias = self.add_weight(
            shape=(self.units,),
            initializer="zeros",
            trainable=True,
            name="bias"
        )
        super(CustomGraphConv, self).build(input_shape)

    def call(self, inputs):
        features, adj_matrix = inputs
        aggregated_features = tf.sparse.sparse_dense_matmul(adj_matrix, features)
        output = tf.matmul(aggregated_features, self.kernel) + self.bias
        return self.activation(output)

    def get_config(self):
        config = super(CustomGraphConv, self).get_config()
        config.update({
            "units": self.units,
            "activation": self.activation,
        })
        return config



In [28]:
# --- 3. Main Script Execution ---

# --- Data Paths and Parameters ---
csv_path = 'Combined_Features_Rainy.csv'
raster_base_path = 'Raster'
target_column = 'AsR'
patch_size = 32
gnn_distance_threshold = 4200  # meters

# Load and prepare data
df = load_combined_features(csv_path)
if df is None:
    exit()

# Define features and target
y_target = df[target_column].values.reshape(-1, 1)

# MLP features: all columns except metadata and targets
mlp_feature_cols = [col for col in df.columns if col not in ['Stations', 'River', 'Lat', 'Long', 'CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'FeR']]
X_mlp_df = df[mlp_feature_cols]

# GNN features: hydrological and indices columns, as requested
# These are a subset of the MLP features
gnn_feature_cols = [
    'variation_2017-2018', 'variation_2018-2019', 'variation_2019-2020', 'variation_2020-2021', 'variation_2021-2022',
    'dem_point_value', 'dem_mean_5km', 'dem_std_5km', 'dist_to_nearest_BF', 'dist_to_nearest_IND',
    'num_within_5km_BF', 'num_within_5km_IND',
    'awei_Mean_5km', 'awei_Std_5km', 'bui_Mean_5km', 'bui_Std_5km', 'evi_Mean_5km', 'evi_Std_5km',
    'mndwi_Mean_5km', 'mndwi_Std_5km', 'ndbi_Mean_5km', 'ndbi_Std_5km', 'ndbsi_Mean_5km', 'ndbsi_Std_5km',
    'ndsi_Mean_5km', 'ndsi_Std_5km', 'ndvi_Mean_5km', 'ndvi_Std_5km', 'ndwi_Mean_5km', 'ndwi_Std_5km',
    'savi_Mean_5km', 'savi_Std_5km', 'ui_Mean_5km', 'ui_Std_5km'
]
X_gnn_df = df[gnn_feature_cols]

# Scale MLP and GNN features
mlp_scaler = StandardScaler()
X_mlp_input = mlp_scaler.fit_transform(X_mlp_df)

gnn_scaler = StandardScaler()
X_gnn_node_features = gnn_scaler.fit_transform(X_gnn_df)

X_cnn_input = prepare_cnn_input(df, raster_base_path, patch_size)

Successfully loaded Combined_Features_Rainy.csv

Preparing CNN input...
CNN Input shape: (17, 0, 0, 17)


In [29]:
# Prepare GNN input using the specific features and location data
X_gnn_node_features, adjacency_matrix_sparse = prepare_gnn_input(
    df,
    node_features=X_gnn_node_features,
    distance_threshold=gnn_distance_threshold
)

# Split all data into training and testing sets
X_mlp_train, X_mlp_test, y_train, y_test = train_test_split(
    X_mlp_input, y_target, test_size=0.2, random_state=42
)
X_cnn_train, X_cnn_test, _, _ = train_test_split(
    X_cnn_input, y_target, test_size=0.2, random_state=42
)
# GNN node features for training and testing
X_gnn_train_nodes, X_gnn_test_nodes, _, _ = train_test_split(
    X_gnn_node_features, y_target, test_size=0.2, random_state=42
)
# The adjacency matrix is the same for both train and test sets, as it's static
X_gnn_train_adj = adjacency_matrix_sparse
X_gnn_test_adj = adjacency_matrix_sparse



Preparing GNN input...
GNN graph data prepared.


In [30]:
# --- 4. Building the Multi-Fused Model ---

# Define input layers
mlp_input = layers.Input(shape=(X_mlp_train.shape[1],), name='mlp_input')
cnn_input = layers.Input(shape=(X_cnn_train.shape[1], X_cnn_train.shape[2], X_cnn_train.shape[3]), name='cnn_input')
gnn_input_nodes = layers.Input(shape=(X_gnn_train_nodes.shape[1],), name='gnn_input_nodes')
gnn_input_adj = layers.Input(shape=(X_gnn_train_nodes.shape[0],), name='gnn_input_adj', sparse=True)

# MLP Branch
mlp_branch = layers.Dense(64, activation='relu')(mlp_input)
mlp_branch = layers.Dropout(0.3)(mlp_branch)
mlp_branch_output = layers.Dense(32, activation='relu', name='mlp_encoder_output')(mlp_branch)

# CNN Branch
cnn_branch = layers.Conv2D(32, (3, 3), activation='relu')(cnn_input)
cnn_branch = layers.MaxPooling2D((2, 2))(cnn_branch)
cnn_branch = layers.Conv2D(64, (3, 3), activation='relu')(cnn_branch)
cnn_branch = layers.MaxPooling2D((2, 2))(cnn_branch)
cnn_branch = layers.Flatten()(cnn_branch)
cnn_branch_output = layers.Dense(32, activation='relu', name='cnn_encoder_output')(cnn_branch)

# GNN Branch
gnn_branch = CustomGraphConv(32, name='gnn_encoder_output')([gnn_input_nodes, gnn_input_adj])
train_node_indices = np.array(range(len(X_mlp_train)))
gnn_branch_output = layers.Lambda(lambda x: tf.gather(x, train_node_indices), name='gnn_output_train')(gnn_branch)

# Fusion Layer
combined_features = layers.concatenate([mlp_branch_output, cnn_branch_output, gnn_branch_output])
fusion_layer = layers.Dense(64, activation='relu')(combined_features)
fusion_layer = layers.Dropout(0.3)(fusion_layer)
output_layer = layers.Dense(1, activation='linear', name='final_prediction')(fusion_layer)

# Define the full model
model = Model(
    inputs=[mlp_input, cnn_input, gnn_input_nodes, gnn_input_adj],
    outputs=output_layer,
    name='CNN_GNN_MLP_Model'
)

ValueError: Computed output size would be negative. Received `inputs shape=(None, 0, 0, 17)`, `kernel shape=(3, 3, 17, 32)`, `dilation_rate=[1 1]`.

In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import os
import rasterio
from rasterio.transform import Affine
from shapely.geometry import Point
from glob import glob
from rasterio.windows import Window
import logging

# Set up logging to avoid spamming the console with warnings
logging.basicConfig(level=logging.INFO)
# Suppress specific rasterio warnings
logging.getLogger('rasterio._base').setLevel(logging.ERROR)

# --- 1. Data Loading and Preparation Functions ---

def load_combined_features(file_path):
    """
    Loads the main tabular data from the Combined_Features_Rainy.csv file.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Successfully loaded {file_path}")
        return df
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure the path is correct.")
        return None

def extract_raster_patch(raster_src, point_geom, patch_size):
    """
    Extracts a square image patch around a point for CNN input using rasterio.windows.
    Handles boundary conditions by filling with a constant value.
    """
    # Initialize an empty patch with a constant value (e.g., 0)
    patch = np.full((patch_size, patch_size), raster_src.nodata or 0, dtype=raster_src.dtypes[0])

    # Get pixel coordinates from georeferenced coordinates
    try:
        col, row = raster_src.index(point_geom.x, point_geom.y)
    except Exception:
        # If the point is outside the raster extent, return the zero-filled patch
        return patch

    half_patch = patch_size // 2
    
    # Define the window to read
    window_read = Window(col - half_patch, row - half_patch, patch_size, patch_size)
    
    try:
        # Read the window, filling any parts outside the raster with nodata value
        data = raster_src.read(1, window=window_read, boundless=True, fill_value=raster_src.nodata)
        
        # Check if the read data has the expected shape and resize if necessary
        if data.shape == (patch_size, patch_size):
            patch = data
        else:
            # If the shape is incorrect (e.g., at the very edge), we resize it.
            new_patch = np.full((patch_size, patch_size), raster_src.nodata or 0, dtype=raster_src.dtypes[0])
            rows, cols = data.shape
            new_patch[:rows, :cols] = data
            patch = new_patch
            
    except Exception as e:
        print(f"Error reading patch for point {point_geom}. Returning zero-filled patch. Error: {e}")
        return patch
    
    return patch

def prepare_cnn_input(df, raster_base_path, patch_size=32):
    """
    Loads all rasters and creates a stacked, multi-channel input for the CNN.
    """
    print("\nPreparing CNN input...")
    
    # Get all .tif files from the specified directories
    indices_files = sorted(glob(os.path.join(raster_base_path, 'CalIndices', '*.tif')))
    lulc_files = sorted(glob(os.path.join(raster_base_path, 'LULCMerged', '*.tif')))
    
    all_raster_files = indices_files + lulc_files
    
    if not all_raster_files:
        raise FileNotFoundError("No .tif files found in Raster/CalIndices or Raster/LULCMerged.")

    # Extract patches for each station and stack them
    cnn_input_list = []
    for index, row in df.iterrows():
        point_geom = Point(row['Long'], row['Lat'])
        patches = []
        for file_path in all_raster_files:
            try:
                with rasterio.open(file_path) as src:
                    patch = extract_raster_patch(src, point_geom, patch_size)
                    patches.append(patch)
            except Exception as e:
                print(f"Failed to process {file_path}. Skipping. Error: {e}")
                patches.append(np.zeros((patch_size, patch_size)))

        if patches:
            stacked_patches = np.stack(patches, axis=-1)
            stacked_patches[np.isnan(stacked_patches)] = 0.0 # Handle NaNs
            cnn_input_list.append(stacked_patches)
    
    if not cnn_input_list:
        raise ValueError("No valid CNN input patches could be created.")
    
    cnn_input_array = np.array(cnn_input_list)
    print(f"CNN Input shape: {cnn_input_array.shape}")
    return cnn_input_array

def prepare_gnn_input(df, node_features, distance_threshold=5000):
    """
    Prepares the graph data (adjacency matrix and node features) for the GNN.
    The graph is created based on Euclidean distance between stations.
    """
    print("\nPreparing GNN input...")
    num_samples = len(df)
    
    # Create a distance-based adjacency matrix using the sampling locations
    coords = df[['Long', 'Lat']].values
    # We use a simple Euclidean distance as an approximation for this example
    dist_matrix = np.sqrt(np.sum((coords[:, np.newaxis] - coords[np.newaxis, :])**2, axis=2))
    
    # Create adjacency matrix based on a distance threshold (e.g., 5km)
    adjacency_matrix = (dist_matrix < distance_threshold).astype(np.float32) # Cast to float32
    np.fill_diagonal(adjacency_matrix, 0)
    
    # Convert to sparse tensor for efficient GNN processing
    adjacency_matrix_sparse = tf.sparse.from_dense(adjacency_matrix)
    print("GNN graph data prepared.")
    return node_features, adjacency_matrix_sparse

# --- 2. Custom GNN Layer ---
# A simplified graph convolutional layer for neighborhood aggregation.
class CustomGraphConv(layers.Layer):
    def __init__(self, units, activation='relu', **kwargs):
        super(CustomGraphConv, self).__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        # input_shape is a list of shapes for the inputs
        # input_shape[0] is the shape of the node features (None, num_features)
        num_features = input_shape[0][-1] 
        self.kernel = self.add_weight(
            shape=(num_features, self.units),
            initializer="glorot_uniform",
            trainable=True,
            name="kernel"
        )
        self.bias = self.add_weight(
            shape=(self.units,),
            initializer="zeros",
            trainable=True,
            name="bias"
        )
        super(CustomGraphConv, self).build(input_shape)

    def call(self, inputs):
        # inputs are now expected to be concrete tensors (not KerasTensors) with no batch dimension
        features, adj_matrix = inputs
        # The tf.sparse.sparse_dense_matmul expects rank 2 tensors
        aggregated_features = tf.sparse.sparse_dense_matmul(adj_matrix, features)
        output = tf.matmul(aggregated_features, self.kernel) + self.bias
        return self.activation(output)

    def get_config(self):
        config = super(CustomGraphConv, self).get_config()
        config.update({
            "units": self.units,
            "activation": tf.keras.activations.serialize(self.activation),
        })
        return config

# --- 3. Main Script Execution ---

if __name__ == "__main__":
    # --- Data Paths and Parameters ---
    csv_path = 'Combined_Features_Rainy.csv'
    raster_base_path = 'Raster'
    target_column = 'AsR'
    patch_size = 32
    gnn_distance_threshold = 5000  # meters

    # Load and prepare data
    df = load_combined_features(csv_path)
    if df is None:
        exit()

    num_nodes = len(df) # Total number of nodes in the graph
    
    # Define features and target
    y_target = df[target_column].values.reshape(-1, 1)
    
    # MLP features: all columns except metadata and targets
    mlp_feature_cols = [col for col in df.columns if col not in ['Stations', 'River', 'Lat', 'Long', 'CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', 'FeR']]
    X_mlp_df = df[mlp_feature_cols]
    
    # GNN features: hydrological and indices columns
    gnn_feature_cols = [
        'variation_2017-2018', 'variation_2018-2019', 'variation_2019-2020', 'variation_2020-2021', 'variation_2021-2022',
        'dem_point_value', 'dem_mean_5km', 'dem_std_5km', 'dist_to_nearest_BF', 'dist_to_nearest_IND',
        'num_within_5km_BF', 'num_within_5km_IND',
        'awei_Mean_5km', 'awei_Std_5km', 'bui_Mean_5km', 'bui_Std_5km', 'evi_Mean_5km', 'evi_Std_5km',
        'mndwi_Mean_5km', 'mndwi_Std_5km', 'ndbi_Mean_5km', 'ndbi_Std_5km', 'ndbsi_Mean_5km', 'ndbsi_Std_5km',
        'ndsi_Mean_5km', 'ndsi_Std_5km', 'ndvi_Mean_5km', 'ndvi_Std_5km', 'ndwi_Mean_5km', 'ndwi_Std_5km',
        'savi_Mean_5km', 'savi_Std_5km', 'ui_Mean_5km', 'ui_Std_5km'
    ]
    X_gnn_df = df[gnn_feature_cols]

    # Scale MLP and GNN features
    mlp_scaler = StandardScaler()
    X_mlp_input = mlp_scaler.fit_transform(X_mlp_df)
    
    gnn_scaler = StandardScaler()
    X_gnn_node_features = gnn_scaler.fit_transform(X_gnn_df)
    
    # Corrected CNN input preparation
    X_cnn_input = prepare_cnn_input(df, raster_base_path, patch_size)
    
    # Prepare GNN input using the specific features and location data
    X_gnn_node_features_tensor, adjacency_matrix_sparse_tensor = prepare_gnn_input(
        df,
        node_features=X_gnn_node_features,
        distance_threshold=gnn_distance_threshold
    )
    
    # --- GNN Encoding Step ---
    # This step is performed once on the full, non-batched graph data.
    print("\n--- Running GNN Encoder on the full graph ---")
    gnn_encoder_layer = CustomGraphConv(32)
    # The `build` method needs to be called manually since we are not using a Model.
    gnn_encoder_layer.build([X_gnn_node_features_tensor.shape, adjacency_matrix_sparse_tensor.shape])
    gnn_outputs_full_graph = gnn_encoder_layer.call([tf.constant(X_gnn_node_features_tensor, dtype=tf.float32), adjacency_matrix_sparse_tensor])
    print(f"GNN full graph output shape: {gnn_outputs_full_graph.shape}")

    # --- Data Splitting and Slicing ---
    # Split all data into training and testing sets.
    # Note: We split the indices and use them to slice the data.
    all_indices = np.arange(num_nodes)
    train_indices, test_indices, _, _ = train_test_split(
        all_indices, y_target, test_size=0.2, random_state=42
    )

    X_mlp_train = X_mlp_input[train_indices]
    X_mlp_test = X_mlp_input[test_indices]
    X_cnn_train = X_cnn_input[train_indices]
    X_cnn_test = X_cnn_input[test_indices]
    y_train = y_target[train_indices]
    y_test = y_target[test_indices]

    # Slice the pre-computed GNN outputs for the training and test sets
    gnn_train_output = gnn_outputs_full_graph.numpy()[train_indices]
    gnn_test_output = gnn_outputs_full_graph.numpy()[test_indices]
    
    # --- 4. Building the Main Fusion Model ---
    
    # This model takes batched inputs for MLP and CNN, and the sliced GNN output.
    mlp_input = layers.Input(shape=(X_mlp_input.shape[1],), name='mlp_input')
    cnn_input = layers.Input(shape=(X_cnn_input.shape[1], X_cnn_input.shape[2], X_cnn_input.shape[3]), name='cnn_input')
    gnn_output_for_batch = layers.Input(shape=(32,), name='gnn_output_for_batch') # 32 is the GNN units

    # MLP Branch
    mlp_branch = layers.Dense(64, activation='relu')(mlp_input)
    mlp_branch = layers.Dropout(0.3)(mlp_branch)
    mlp_branch_output = layers.Dense(32, activation='relu', name='mlp_encoder_output')(mlp_branch)
    
    # CNN Branch
    cnn_branch = layers.Conv2D(32, (3, 3), activation='relu')(cnn_input)
    cnn_branch = layers.MaxPooling2D((2, 2))(cnn_branch)
    cnn_branch = layers.Conv2D(64, (3, 3), activation='relu')(cnn_branch)
    cnn_branch = layers.MaxPooling2D((2, 2))(cnn_branch)
    cnn_branch = layers.Flatten()(cnn_branch)
    cnn_branch_output = layers.Dense(32, activation='relu', name='cnn_encoder_output')(cnn_branch)

    # Fusion Layer
    combined_features = layers.concatenate([mlp_branch_output, cnn_branch_output, gnn_output_for_batch])
    fusion_layer = layers.Dense(64, activation='relu')(combined_features)
    fusion_layer = layers.Dropout(0.3)(fusion_layer)
    output_layer = layers.Dense(1, activation='linear', name='final_prediction')(fusion_layer)
    
    fusion_model = Model(
        inputs=[mlp_input, cnn_input, gnn_output_for_batch],
        outputs=output_layer,
        name='Fusion_Model'
    )

    # --- 5. Compile and Train the Model ---
    fusion_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    
    print("\n--- Fusion Model Summary ---")
    print(fusion_model.summary())

    # Now we train the fusion model with the correctly batched inputs
    train_inputs = [X_mlp_train, X_cnn_train, gnn_train_output]
    
    print("\n--- Training the Fused Model ---")
    history = fusion_model.fit(
        train_inputs,
        y_train,
        epochs=50,
        batch_size=16,
        verbose=2
    )

    # --- 6. Evaluation ---
    print("\n--- Evaluating the Model on Test Data ---")
    test_inputs = [X_mlp_test, X_cnn_test, gnn_test_output]
    y_pred = fusion_model.predict(test_inputs)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"R-squared: {r2:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")


Successfully loaded Combined_Features_Rainy.csv

Preparing CNN input...
CNN Input shape: (17, 32, 32, 17)

Preparing GNN input...
GNN graph data prepared.

--- Running GNN Encoder on the full graph ---
GNN full graph output shape: (17, 32)

--- Fusion Model Summary ---


None

--- Training the Fused Model ---
Epoch 1/50
1/1 - 1s - 1s/step - loss: 224.6854 - mae: 14.1572
Epoch 2/50
1/1 - 0s - 23ms/step - loss: 222.0718 - mae: 14.0579
Epoch 3/50
1/1 - 0s - 21ms/step - loss: 215.6759 - mae: 13.8507
Epoch 4/50
1/1 - 0s - 27ms/step - loss: 210.0634 - mae: 13.6386
Epoch 5/50
1/1 - 0s - 22ms/step - loss: 206.7099 - mae: 13.4269
Epoch 6/50
1/1 - 0s - 23ms/step - loss: 201.9163 - mae: 13.3596
Epoch 7/50
1/1 - 0s - 22ms/step - loss: 207.4121 - mae: 13.4599
Epoch 8/50
1/1 - 0s - 24ms/step - loss: 197.1828 - mae: 13.0887
Epoch 9/50
1/1 - 0s - 22ms/step - loss: 196.4373 - mae: 13.1219
Epoch 10/50
1/1 - 0s - 23ms/step - loss: 195.1902 - mae: 13.0914
Epoch 11/50
1/1 - 0s - 22ms/step - loss: 193.9972 - mae: 12.9646
Epoch 12/50
1/1 - 0s - 24ms/step - loss: 186.4460 - mae: 12.7409
Epoch 13/50
1/1 - 0s - 22ms/step - loss: 182.1155 - mae: 12.5187
Epoch 14/50
1/1 - 0s - 29ms/step - loss: 176.5131 - mae: 12.2794
Epoch 15/50
1/1 - 0s - 23ms/step - loss: 179.6440 - mae: 12.37

In [37]:
# Now we train the fusion model with the correctly batched inputs
train_inputs = [X_mlp_train, X_cnn_train, gnn_train_output]

print("\n--- Training the Fused Model ---")
history = fusion_model.fit(
    train_inputs,
    y_train,
    epochs=100,
    batch_size=16,
    verbose=2
)

# --- 6. Evaluation ---
print("\n--- Evaluating the Model on Test Data ---")
test_inputs = [X_mlp_test, X_cnn_test, gnn_test_output]
y_pred = fusion_model.predict(test_inputs)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R-squared: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")


--- Training the Fused Model ---
Epoch 1/100
1/1 - 0s - 38ms/step - loss: 22.1592 - mae: 4.0710
Epoch 2/100
1/1 - 0s - 27ms/step - loss: 31.1077 - mae: 4.6858
Epoch 3/100
1/1 - 0s - 25ms/step - loss: 21.1776 - mae: 4.0621
Epoch 4/100
1/1 - 0s - 25ms/step - loss: 32.8325 - mae: 5.0469
Epoch 5/100
1/1 - 0s - 24ms/step - loss: 30.1702 - mae: 4.8398
Epoch 6/100
1/1 - 0s - 23ms/step - loss: 23.8372 - mae: 4.2356
Epoch 7/100
1/1 - 0s - 24ms/step - loss: 26.8431 - mae: 4.4767
Epoch 8/100
1/1 - 0s - 25ms/step - loss: 11.2028 - mae: 2.8276
Epoch 9/100
1/1 - 0s - 24ms/step - loss: 19.8303 - mae: 3.5380
Epoch 10/100
1/1 - 0s - 25ms/step - loss: 15.0831 - mae: 3.1395
Epoch 11/100
1/1 - 0s - 23ms/step - loss: 5.3190 - mae: 1.8403
Epoch 12/100
1/1 - 0s - 25ms/step - loss: 18.2786 - mae: 3.5139
Epoch 13/100
1/1 - 0s - 64ms/step - loss: 11.0948 - mae: 2.7391
Epoch 14/100
1/1 - 0s - 23ms/step - loss: 16.7505 - mae: 3.5256
Epoch 15/100
1/1 - 0s - 23ms/step - loss: 19.5893 - mae: 4.0080
Epoch 16/100
1/1

## K-Fold Cross Validation

In [38]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout

# It's important to assume the data preparation steps from the user's previous output are complete.
# We will create placeholder data that matches the shapes from the user's output.
# In a real scenario, this data would come from your pre-processing script.

# Placeholder data matching the user's output shapes
# Total samples are 17, as per the user's log.
num_samples = 17

# CNN Input: (17, 32, 32, 17)
# This would be the raster data for each of the 17 stations.
cnn_input_data = np.random.rand(num_samples, 32, 32, 17)

# MLP Input: (17, 34)
# This would be the tabular data for each of the 17 stations.
mlp_input_data = np.random.rand(num_samples, 34)

# GNN Output: (17, 32)
# This is the output from the GNN encoder for all 17 stations.
gnn_output_data = np.random.rand(num_samples, 32)

# Target variable (e.g., heavy metal concentration)
# This is the value we are trying to predict.
y = np.random.rand(num_samples, 1) * 20 + 5 # Create some random target values for demonstration

# Define the model architecture. This is a factory function so we can create a new model
# for each fold, preventing data leakage.
def create_fusion_model():
    """
    Creates and compiles the CNN-GNN-MLP fusion model.
    """
    # CNN branch
    cnn_input = Input(shape=(32, 32, 17), name='cnn_input')
    conv1 = Conv2D(32, kernel_size=(3, 3), activation='relu')(cnn_input)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
    conv2 = Conv2D(64, kernel_size=(3, 3), activation='relu')(pool1)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
    flatten = Flatten()(pool2)
    cnn_encoder_output = Dense(32, activation='relu', name='cnn_encoder_output')(flatten)

    # MLP branch
    mlp_input = Input(shape=(34,), name='mlp_input')
    dense1 = Dense(64, activation='relu')(mlp_input)
    dropout1 = Dropout(0.2)(dense1)
    mlp_encoder_output = Dense(32, activation='relu', name='mlp_encoder_output')(dropout1)

    # GNN output branch (the output from the pre-computed GNN encoder)
    gnn_output_for_batch = Input(shape=(32,), name='gnn_output_for_batch')

    # Fusion of outputs
    combined = Concatenate(axis=-1)([mlp_encoder_output, cnn_encoder_output, gnn_output_for_batch])

    # Fusion MLP head
    dense2 = Dense(64, activation='relu')(combined)
    dropout2 = Dropout(0.2)(dense2)
    final_prediction = Dense(1, name='final_prediction')(dropout2)

    # Create the model
    model = Model(inputs=[cnn_input, mlp_input, gnn_output_for_batch], outputs=final_prediction, name='Fusion_Model')
    
    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    
    return model

# Set up k-fold cross-validation
n_splits = 5  # Using 5 folds
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store the scores from each fold
r2_scores = []
mse_scores = []

print(f"--- Starting {n_splits}-fold Cross-Validation ---")

# The K-Fold loop
for fold, (train_index, val_index) in enumerate(kf.split(cnn_input_data)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")

    # Split the data for the current fold
    X_cnn_train, X_cnn_val = cnn_input_data[train_index], cnn_input_data[val_index]
    X_mlp_train, X_mlp_val = mlp_input_data[train_index], mlp_input_data[val_index]
    X_gnn_train, X_gnn_val = gnn_output_data[train_index], gnn_output_data[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Create a fresh instance of the model for this fold
    model = create_fusion_model()

    # Train the model
    # Use a small number of epochs for demonstration given the small dataset
    history = model.fit(
        [X_cnn_train, X_mlp_train, X_gnn_train],
        y_train,
        epochs=50,
        batch_size=1, # batch_size=1 is necessary for small datasets
        verbose=0 # Turn off verbose output to keep the log clean
    )

    # Evaluate the model on the validation data for this fold
    predictions = model.predict([X_cnn_val, X_mlp_val, X_gnn_val])
    
    # Calculate performance metrics
    r2 = r2_score(y_val, predictions)
    mse = mean_squared_error(y_val, predictions)
    
    print(f"Fold {fold + 1} Evaluation:")
    print(f"  R-squared: {r2:.4f}")
    print(f"  Mean Squared Error: {mse:.4f}")
    
    # Store the scores
    r2_scores.append(r2)
    mse_scores.append(mse)

# Calculate and print the final average scores
print("\n--- Cross-Validation Complete ---")
print(f"Average R-squared across {n_splits} folds: {np.mean(r2_scores):.4f} +/- {np.std(r2_scores):.4f}")
print(f"Average Mean Squared Error across {n_splits} folds: {np.mean(mse_scores):.4f} +/- {np.std(mse_scores):.4f}")


--- Starting 5-fold Cross-Validation ---

--- Fold 1/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Fold 1 Evaluation:
  R-squared: -0.4008
  Mean Squared Error: 53.2645

--- Fold 2/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Fold 2 Evaluation:
  R-squared: -0.0553
  Mean Squared Error: 45.9918

--- Fold 3/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Fold 3 Evaluation:
  R-squared: -0.4446
  Mean Squared Error: 30.8905

--- Fold 4/5 ---




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Fold 4 Evaluation:
  R-squared: 0.0102
  Mean Squared Error: 58.2206

--- Fold 5/5 ---




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Fold 5 Evaluation:
  R-squared: -0.7775
  Mean Squared Error: 35.0571

--- Cross-Validation Complete ---
Average R-squared across 5 folds: -0.3336 +/- 0.2862
Average Mean Squared Error across 5 folds: 44.6849 +/- 10.4068


In [39]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Flatten

# Re-using the placeholder data from the previous run.
num_samples = 17
cnn_input_data = np.random.rand(num_samples, 32, 32, 17)
mlp_input_data = np.random.rand(num_samples, 34)
gnn_output_data = np.random.rand(num_samples, 32)
y = np.random.rand(num_samples, 1) * 20 + 5

# Define the new, simplified model architecture. This factory function
# will be called for each fold.
def create_simplified_fusion_model():
    """
    Creates a simplified fusion model with fewer parameters to prevent overfitting
    on a small dataset.
    """
    # CNN branch (simplified to just a flatten and a dense layer)
    cnn_input = Input(shape=(32, 32, 17), name='cnn_input')
    flatten_cnn = Flatten()(cnn_input)
    cnn_output_simplified = Dense(16, activation='relu', name='cnn_simplified_output')(flatten_cnn)

    # MLP branch
    mlp_input = Input(shape=(34,), name='mlp_input')
    mlp_encoder_output = Dense(16, activation='relu', name='mlp_encoder_output')(mlp_input)

    # GNN output branch
    gnn_output_for_batch = Input(shape=(32,), name='gnn_output_for_batch')

    # Fusion of all three branches
    combined = Concatenate(axis=-1)([mlp_encoder_output, cnn_output_simplified, gnn_output_for_batch])

    # Simplified fusion MLP head with fewer neurons
    dense_combined = Dense(32, activation='relu')(combined)
    final_prediction = Dense(1, name='final_prediction')(dense_combined)

    # Create the model
    model = Model(inputs=[cnn_input, mlp_input, gnn_output_for_batch], outputs=final_prediction, name='Simplified_Fusion_Model')
    
    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    
    return model

# Set up k-fold cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store the scores from each fold
r2_scores = []
mse_scores = []

print(f"--- Starting {n_splits}-fold Cross-Validation with Simplified Model ---")

# The K-Fold loop
for fold, (train_index, val_index) in enumerate(kf.split(cnn_input_data)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")

    # Split the data for the current fold
    X_cnn_train, X_cnn_val = cnn_input_data[train_index], cnn_input_data[val_index]
    X_mlp_train, X_mlp_val = mlp_input_data[train_index], mlp_input_data[val_index]
    X_gnn_train, X_gnn_val = gnn_output_data[train_index], gnn_output_data[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Create a fresh instance of the SIMPLIFIED model for this fold
    model = create_simplified_fusion_model()

    # Train the model
    # Fewer epochs are used here as well to prevent overfitting
    history = model.fit(
        [X_cnn_train, X_mlp_train, X_gnn_train],
        y_train,
        epochs=30,
        batch_size=1,
        verbose=0
    )

    # Evaluate the model on the validation data for this fold
    predictions = model.predict([X_cnn_val, X_mlp_val, X_gnn_val])
    
    # Calculate performance metrics
    r2 = r2_score(y_val, predictions)
    mse = mean_squared_error(y_val, predictions)
    
    print(f"Fold {fold + 1} Evaluation:")
    print(f"  R-squared: {r2:.4f}")
    print(f"  Mean Squared Error: {mse:.4f}")
    
    # Store the scores
    r2_scores.append(r2)
    mse_scores.append(mse)

# Calculate and print the final average scores
print("\n--- Cross-Validation Complete ---")
print(f"Average R-squared across {n_splits} folds: {np.mean(r2_scores):.4f} +/- {np.std(r2_scores):.4f}")
print(f"Average Mean Squared Error across {n_splits} folds: {np.mean(mse_scores):.4f} +/- {np.std(mse_scores):.4f}")


--- Starting 5-fold Cross-Validation with Simplified Model ---

--- Fold 1/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Fold 1 Evaluation:
  R-squared: 0.0317
  Mean Squared Error: 40.6759

--- Fold 2/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Fold 2 Evaluation:
  R-squared: -2.0545
  Mean Squared Error: 49.7319

--- Fold 3/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Fold 3 Evaluation:
  R-squared: -0.3538
  Mean Squared Error: 16.8414

--- Fold 4/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Fold 4 Evaluation:
  R-squared: 0.0275
  Mean Squared Error: 43.8860

--- Fold 5/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Fold 5 Evaluation:
  R-squared: -0.4175
  Mean Squared Error: 1.9751

--- Cross-Validation Complete ---
Average R-squared across 5 folds: -0.5533 +/- 0.7735
Average Mean Squared Error across 5 folds: 30.6220 +

In [40]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

# Placeholder for the dataset. We'll use the same dummy data as before
# for this demonstration. In a real-world scenario, you would load your
# actual LULC, Hydrological, and chemical data here.
num_samples = 17
cnn_input_data = np.random.rand(num_samples, 32, 32, 17) # Represents LULC variations
mlp_input_data = np.random.rand(num_samples, 34) # Represents Hydrological Properties
gnn_output_data = np.random.rand(num_samples, 32) # Represents GNN embeddings
y = np.random.rand(num_samples, 1) * 20 + 5 # The target heavy metal concentration

def create_simplified_fusion_model():
    """
    Creates a simplified fusion model with fewer parameters to prevent overfitting
    on a small dataset.
    """
    # CNN branch for LULC data
    cnn_input = Input(shape=(32, 32, 17), name='cnn_input')
    flatten_cnn = Flatten()(cnn_input)
    cnn_output_simplified = Dense(16, activation='relu', name='cnn_simplified_output')(flatten_cnn)

    # MLP branch for Hydrological Properties
    mlp_input = Input(shape=(34,), name='mlp_input')
    mlp_encoder_output = Dense(16, activation='relu', name='mlp_encoder_output')(mlp_input)

    # GNN output branch for spatial network features
    gnn_output_for_batch = Input(shape=(32,), name='gnn_output_for_batch')

    # Fusion of all three branches
    combined = Concatenate(axis=-1)([mlp_encoder_output, cnn_output_simplified, gnn_output_for_batch])

    # Simplified fusion MLP head
    dense_combined = Dense(32, activation='relu')(combined)
    final_prediction = Dense(1, name='final_prediction')(dense_combined)

    # Create the model
    model = Model(inputs=[cnn_input, mlp_input, gnn_output_for_batch], outputs=final_prediction, name='Simplified_Fusion_Model')

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    return model

def analyze_feature_importance(model, cnn_input, mlp_input, gnn_output, y):
    """
    Analyzes the relative importance of each input branch (CNN, MLP, GNN)
    by training a simple linear model on the latent features.
    
    Args:
        model: The trained deep learning model.
        cnn_input: The input data for the CNN branch.
        mlp_input: The input data for the MLP branch.
        gnn_output: The input data for the GNN output branch.
        y: The target values.

    Returns:
        A dictionary with the percentage importance of each branch.
    """
    # Extract the latent feature models
    latent_cnn_model = Model(inputs=model.input, outputs=model.get_layer('cnn_simplified_output').output)
    latent_mlp_model = Model(inputs=model.input, outputs=model.get_layer('mlp_encoder_output').output)
    
    # Get the latent features from the trained model
    latent_cnn_features = latent_cnn_model.predict([cnn_input, mlp_input, gnn_output])
    latent_mlp_features = latent_mlp_model.predict([cnn_input, mlp_input, gnn_output])
    latent_gnn_features = gnn_output  # GNN output is already a latent feature

    # Combine the latent features
    X_latent = np.concatenate([latent_cnn_features, latent_mlp_features, latent_gnn_features], axis=-1)

    # Define a simple linear model to find the importance of each feature group
    # We will use this to get the weights and interpret them as importance scores.
    input_latent = Input(shape=(X_latent.shape[1],))
    output_latent = Dense(1, activation='linear')(input_latent)
    importance_model = Model(inputs=input_latent, outputs=output_latent)
    
    importance_model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')
    
    # Train the importance model to get the weights
    importance_model.fit(X_latent, y, epochs=100, verbose=0)
    
    # Extract the weights from the trained linear model
    weights = importance_model.get_weights()[0].flatten()

    # Calculate the total importance for each branch based on the weights
    # The weights for each branch correspond to the size of its latent space
    cnn_weight_sum = np.sum(np.abs(weights[:latent_cnn_features.shape[1]]))
    mlp_weight_sum = np.sum(np.abs(weights[latent_cnn_features.shape[1] : latent_cnn_features.shape[1] + latent_mlp_features.shape[1]]))
    gnn_weight_sum = np.sum(np.abs(weights[latent_cnn_features.shape[1] + latent_mlp_features.shape[1]:]))

    total_weight_sum = cnn_weight_sum + mlp_weight_sum + gnn_weight_sum

    # Calculate the percentage contribution of each branch
    if total_weight_sum > 0:
        cnn_importance = (cnn_weight_sum / total_weight_sum) * 100
        mlp_importance = (mlp_weight_sum / total_weight_sum) * 100
        gnn_importance = (gnn_weight_sum / total_weight_sum) * 100
    else:
        cnn_importance = mlp_importance = gnn_importance = 0
        
    return {
        "CNN_Importance": cnn_importance,
        "MLP_Importance": mlp_importance,
        "GNN_Importance": gnn_importance
    }


# Set up k-fold cross-validation (same as before)
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

all_importance_scores = []

print(f"--- Starting {n_splits}-fold Cross-Validation for Source Apportionment ---")

for fold, (train_index, val_index) in enumerate(kf.split(cnn_input_data)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")

    X_cnn_train, X_cnn_val = cnn_input_data[train_index], cnn_input_data[val_index]
    X_mlp_train, X_mlp_val = mlp_input_data[train_index], mlp_input_data[val_index]
    X_gnn_train, X_gnn_val = gnn_output_data[train_index], gnn_output_data[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Create and train a fresh model for this fold
    model = create_simplified_fusion_model()
    model.fit(
        [X_cnn_train, X_mlp_train, X_gnn_train],
        y_train,
        epochs=30,
        batch_size=1,
        verbose=0
    )

    # Analyze the feature importance for this fold's model
    importance_scores = analyze_feature_importance(model, X_cnn_val, X_mlp_val, X_gnn_val, y_val)
    all_importance_scores.append(importance_scores)
    
    print("Source Apportionment for this Fold:")
    for key, value in importance_scores.items():
        print(f"  {key}: {value:.2f}%")

# Calculate and print the final average scores
print("\n--- Cross-Validation Complete ---")
avg_cnn_importance = np.mean([s['CNN_Importance'] for s in all_importance_scores])
avg_mlp_importance = np.mean([s['MLP_Importance'] for s in all_importance_scores])
avg_gnn_importance = np.mean([s['GNN_Importance'] for s in all_importance_scores])

print("Average Source Apportionment across all 5 folds:")
print(f"  CNN (LULC Data): {avg_cnn_importance:.2f}%")
print(f"  MLP (Hydrological Data): {avg_mlp_importance:.2f}%")
print(f"  GNN (Spatial Network Data): {avg_gnn_importance:.2f}%")

--- Starting 5-fold Cross-Validation for Source Apportionment ---

--- Fold 1/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Source Apportionment for this Fold:
  CNN_Importance: 22.22%
  MLP_Importance: 22.87%
  GNN_Importance: 54.91%

--- Fold 2/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Source Apportionment for this Fold:
  CNN_Importance: 25.31%
  MLP_Importance: 24.93%
  GNN_Importance: 49.76%

--- Fold 3/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Source Apportionment for this Fold:
  CNN_Importance: 17.87%
  MLP_Importance: 24.27%
  GNN_Importance: 57.87%

--- Fold 4/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━

In [41]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

# --- 1. Enhanced Placeholder Dataset with Specific Variables ---
num_samples = 17

# CNN input data: LULC variations. No change from the previous version.
cnn_input_data = np.random.rand(num_samples, 32, 32, 17)

# GNN output data: GNN embeddings. No change from the previous version.
gnn_output_data = np.random.rand(num_samples, 32)

# MLP input data: Now a pandas DataFrame with specific variable names.
# This makes the variables explicit and easier to analyze.
mlp_data_columns = [
    'distance_from_factories',
    'distance_from_brick_fields',
    'hydrological_index_1',
    'hydrological_index_2',
    'land_use_index',
    'soil_type_index'
]
# Create a dummy DataFrame with more specific and varied data.
mlp_input_data = pd.DataFrame(np.random.rand(num_samples, len(mlp_data_columns)), columns=mlp_data_columns)

# The target heavy metal concentration
y = np.random.rand(num_samples, 1) * 20 + 5

# --- 2. Simplified Fusion Model (Same as before) ---
def create_simplified_fusion_model():
    """
    Creates a simplified fusion model with a clear structure to aid interpretability.
    """
    # CNN branch for LULC data
    cnn_input = Input(shape=(32, 32, 17), name='cnn_input')
    flatten_cnn = Flatten()(cnn_input)
    cnn_output_simplified = Dense(16, activation='relu', name='cnn_simplified_output')(flatten_cnn)

    # MLP branch for Hydrological Properties (now with a defined input size)
    mlp_input = Input(shape=(mlp_input_data.shape[1],), name='mlp_input')
    mlp_encoder_output = Dense(16, activation='relu', name='mlp_encoder_output')(mlp_input)

    # GNN output branch for spatial network features
    gnn_output_for_batch = Input(shape=(32,), name='gnn_output_for_batch')

    # Fusion of all three branches
    combined = Concatenate(axis=-1)([mlp_encoder_output, cnn_output_simplified, gnn_output_for_batch])

    # Simplified fusion MLP head
    dense_combined = Dense(32, activation='relu')(combined)
    final_prediction = Dense(1, name='final_prediction')(dense_combined)

    # Create the model
    model = Model(inputs=[cnn_input, mlp_input, gnn_output_for_batch], outputs=final_prediction, name='Simplified_Fusion_Model')

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    return model

# --- 3. Refined Importance Analysis Function ---
def analyze_feature_importance(model, cnn_input, mlp_input, gnn_output, y, mlp_feature_names):
    """
    Analyzes the relative importance of each input branch and also the
    individual features within the MLP branch.
    
    Args:
        model: The trained deep learning model.
        cnn_input: The input data for the CNN branch.
        mlp_input: The input data for the MLP branch.
        gnn_output: The input data for the GNN output branch.
        y: The target values.
        mlp_feature_names: A list of names for the MLP features.

    Returns:
        A dictionary with the percentage importance of each branch and
        a list of the top MLP features.
    """
    # Extract the latent feature models
    latent_cnn_model = Model(inputs=model.input, outputs=model.get_layer('cnn_simplified_output').output)
    latent_mlp_model = Model(inputs=model.input, outputs=model.get_layer('mlp_encoder_output').output)
    
    # Get the latent features from the trained model
    latent_cnn_features = latent_cnn_model.predict([cnn_input, mlp_input, gnn_output])
    latent_mlp_features = latent_mlp_model.predict([cnn_input, mlp_input, gnn_output])
    latent_gnn_features = gnn_output  # GNN output is already a latent feature

    # Combine the latent features for the final linear model
    X_latent = np.concatenate([latent_cnn_features, latent_mlp_features, latent_gnn_features], axis=-1)

    # Define a simple linear model to find the importance of each latent feature group
    input_latent = Input(shape=(X_latent.shape[1],))
    output_latent = Dense(1, activation='linear')(input_latent)
    importance_model = Model(inputs=input_latent, outputs=output_latent)
    
    importance_model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')
    importance_model.fit(X_latent, y, epochs=100, verbose=0)
    
    weights = importance_model.get_weights()[0].flatten()

    # Calculate the total importance for each branch
    cnn_weight_sum = np.sum(np.abs(weights[:latent_cnn_features.shape[1]]))
    mlp_weight_sum = np.sum(np.abs(weights[latent_cnn_features.shape[1] : latent_cnn_features.shape[1] + latent_mlp_features.shape[1]]))
    gnn_weight_sum = np.sum(np.abs(weights[latent_cnn_features.shape[1] + latent_mlp_features.shape[1]:]))

    total_weight_sum = cnn_weight_sum + mlp_weight_sum + gnn_weight_sum

    if total_weight_sum > 0:
        cnn_importance = (cnn_weight_sum / total_weight_sum) * 100
        mlp_importance = (mlp_weight_sum / total_weight_sum) * 100
        gnn_importance = (gnn_weight_sum / total_weight_sum) * 100
    else:
        cnn_importance = mlp_importance = gnn_importance = 0
    
    # --- New section: Analyze individual MLP feature importance ---
    # We get the weights of the first dense layer of the MLP branch.
    # The magnitude of these weights indicates how strongly each input feature
    # influences the hidden neurons of that branch.
    mlp_weights = model.get_layer('mlp_encoder_output').get_weights()[0]
    
    # Sum the absolute values of the weights for each input feature.
    # The result is a single importance score for each original MLP feature.
    feature_importance_scores = np.sum(np.abs(mlp_weights), axis=1)

    # Create a DataFrame for easy sorting and display
    mlp_importance_df = pd.DataFrame({
        'Feature': mlp_feature_names,
        'Importance': feature_importance_scores
    })

    # Sort the features by their importance
    top_mlp_features = mlp_importance_df.sort_values(by='Importance', ascending=False)
    
    return {
        "CNN_Importance": cnn_importance,
        "MLP_Importance": mlp_importance,
        "GNN_Importance": gnn_importance,
        "Top_MLP_Features": top_mlp_features
    }


# --- 4. Main execution loop with new analysis function ---
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

all_importance_scores = []
all_mlp_importance_dfs = []

print(f"--- Starting {n_splits}-fold Cross-Validation for Source Apportionment ---")

for fold, (train_index, val_index) in enumerate(kf.split(cnn_input_data)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")

    X_cnn_train, X_cnn_val = cnn_input_data[train_index], cnn_input_data[val_index]
    X_mlp_train, X_mlp_val = mlp_input_data.iloc[train_index], mlp_input_data.iloc[val_index]
    X_gnn_train, X_gnn_val = gnn_output_data[train_index], gnn_output_data[val_index]
    y_train, y_val = y[train_index], y[val_index]

    model = create_simplified_fusion_model()
    model.fit(
        [X_cnn_train, X_mlp_train, X_gnn_train],
        y_train,
        epochs=30,
        batch_size=1,
        verbose=0
    )

    importance_scores = analyze_feature_importance(
        model, 
        X_cnn_val, 
        X_mlp_val, 
        X_gnn_val, 
        y_val, 
        mlp_input_data.columns.tolist()
    )
    all_importance_scores.append(importance_scores)
    all_mlp_importance_dfs.append(importance_scores['Top_MLP_Features'])
    
    print("Source Apportionment for this Fold:")
    for key, value in importance_scores.items():
        if key != "Top_MLP_Features":
            print(f"  {key}: {value:.2f}%")
    print("\n  Top MLP Variables for this Fold:")
    print(importance_scores['Top_MLP_Features'].to_string(index=False))

# Calculate and print the final average scores
print("\n--- Cross-Validation Complete ---")
avg_cnn_importance = np.mean([s['CNN_Importance'] for s in all_importance_scores])
avg_mlp_importance = np.mean([s['MLP_Importance'] for s in all_importance_scores])
avg_gnn_importance = np.mean([s['GNN_Importance'] for s in all_importance_scores])

# Average the MLP feature importances across all folds
avg_mlp_importance_df = pd.concat(all_mlp_importance_dfs).groupby('Feature').mean().sort_values(by='Importance', ascending=False)


print("Average Source Apportionment across all 5 folds:")
print(f"  CNN (LULC Data): {avg_cnn_importance:.2f}%")
print(f"  MLP (Hydrological Data): {avg_mlp_importance:.2f}%")
print(f"  GNN (Spatial Network Data): {avg_gnn_importance:.2f}%")

print("\nAverage Importance of Individual MLP Variables:")
print(avg_mlp_importance_df.to_string())



--- Starting 5-fold Cross-Validation for Source Apportionment ---

--- Fold 1/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Source Apportionment for this Fold:
  CNN_Importance: 10.14%
  MLP_Importance: 31.61%
  GNN_Importance: 58.25%

  Top MLP Variables for this Fold:
                   Feature  Importance
distance_from_brick_fields    4.490038
           soil_type_index    4.392446
            land_use_index    4.170262
   distance_from_factories    4.098063
      hydrological_index_1    3.626208
      hydrological_index_2    3.313287

--- Fold 2/5 ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Source Apportionment for this Fold:
  CNN_Importance: 18.98%
  MLP_Importance: 19.39%
  GNN_Importance: 61.63%

  Top MLP Variables for this Fold:
                   Feature  Importance
dist