In [1]:
import rasterio
import rasterio.windows
import rasterio.features 
import numpy as np
import os
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points 
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from scipy.spatial.distance import cdist # For calculating distances between points
import networkx as nx 

# --- Configuration ---
# Base directory for all your GIS data files
GIS_BASE_DIR = "/Users/rakibhhridoy/Five_Rivers/gis" # <--- IMPORTANT: Update this path to your GIS data directory

# Subdirectories for LULC and CalIndices TIFF files
LULC_DIR = os.path.join(GIS_BASE_DIR, "LULCMerged")
CAL_INDICES_DIR = os.path.join(GIS_BASE_DIR, "CalIndices")

# Paths to your point shapefiles
SAMPLING_POINTS_PATH = os.path.join(GIS_BASE_DIR, "sampling_point.shp")
BRICK_FIELD_POINTS_PATH = os.path.join(GIS_BASE_DIR, "brick_field_point.shp")
INDUSTRY_POINTS_PATH = os.path.join(GIS_BASE_DIR, "industry_point.shp")

# Output CSV file paths for intermediate and final processed data
OUTPUT_LULC_VARIATIONS_CSV = os.path.join(GIS_BASE_DIR, "LULC_5km_Variations.csv")
OUTPUT_HYDRO_PROPERTIES_CSV = os.path.join(GIS_BASE_DIR, "Hydrological_Properties.csv")
OUTPUT_RASTER_FEATURES_CSV = os.path.join(GIS_BASE_DIR, "Raster_Derived_Features.csv")
OUTPUT_COMBINED_FEATURES_CSV = os.path.join(GIS_BASE_DIR, "Combined_Features_Scaled.csv")

In [2]:
LULC_YEARS = list(range(2017, 2023)) # Covers 2017, 2018, 2019, 2020, 2021, 2022
UNIFORM_BUFFER_RADIUS_METERS = 5000 # 5 km

# CNN Patch Size (pixels) - for extracting image patches around points for CNN branch
# This is a pixel dimension, not a real-world radius.
CNN_PATCH_SIZE = 32 # e.g., 32x32 pixels

# GNN Edge Definition: Distance threshold for connecting nodes (sampling stations)
# This creates a proximity graph as an approximation of hydrological connectivity.
GNN_EDGE_DISTANCE_THRESHOLD_METERS = 5000 # 5 km for connecting stations

# List of raster indices to extract neighborhood statistics from
RASTER_INDICES_TO_EXTRACT = [
    "awei.tif", "bui.tif", "evi.tif", "mndwi.tif", "ndbi.tif",
    "ndbsi.tif", "ndsi.tif", "ndvi.tif", "ndwi.tif", "savi.tif", "ui.tif"
]
# Path to DEM file
DEM_PATH = os.path.join(GIS_BASE_DIR, "DEMF.tif")

In [3]:
# --- Helper Functions ---
# These functions are designed to encapsulate common geospatial operations,
# making the main pipeline cleaner and more modular.

def load_raster(filepath, reference_crs):
    """
    Loads a single raster file. If its CRS does not match the reference_crs,
    it reprojects the raster to the reference_crs.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Raster file not found: {filepath}")
    
    with rasterio.open(filepath) as src:
        if src.crs != reference_crs:
            print(f"Reprojecting {os.path.basename(filepath)} from {src.crs} to {reference_crs}...")
            from rasterio.warp import reproject, Resampling
            
            # Calculate new transform and dimensions for reprojection
            transform, width, height = rasterio.windows.calculate_default_transform(
                src.crs, reference_crs, src.width, src.height, *src.bounds
            )
            reprojected_array = np.empty((height, width), dtype=src.dtypes)
            reproject(
                source=rasterio.band(src, 1),
                destination=reprojected_array,
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=reference_crs,
                resampling=Resampling.nearest # Use nearest for categorical LULC, bilinear for continuous DEM/indices
            )
            profile = src.profile.copy()
            profile.update({
                'crs': reference_crs,
                'transform': transform,
                'width': width,
                'height': height
            })
            return reprojected_array, profile
        else:
            return src.read(1), src.profile

def extract_neighborhood_stats(raster_array, raster_profile, point_geom, buffer_meters, stat_type='mean'):
    """
    Extracts neighborhood statistics (mean, stddev) from a raster for a given point buffer.
    This function crops the raster to the buffer's bounding box and then masks it with the circular buffer.
    """
    buffer_geom = point_geom.buffer(buffer_meters)
    
    # Get the window (bounding box in pixel coordinates) for the buffer
    window = rasterio.windows.from_bounds(*buffer_geom.bounds, transform=raster_profile['transform'])
    
    # Clamp window to raster dimensions to prevent out-of-bounds access
    row_start, row_stop = int(window.row_off), int(window.row_off + window.height)
    col_start, col_stop = int(window.col_off), int(window.col_off + window.width)
    
    row_start = max(0, row_start)
    row_stop = min(raster_array.shape[0], row_stop)
    col_start = max(0, col_start)
    col_stop = min(raster_array.shape[1], col_stop)

    if row_stop <= row_start or col_stop <= col_start:
        return np.nan # No valid overlap
    
    # Extract the subset of the raster array (rectangular crop)
    cropped_array = raster_array[row_start:row_stop, col_start:col_stop]
    # Create a new transform for this cropped array's extent
    cropped_transform = rasterio.windows.transform(window, raster_profile['transform'])

    # Rasterize the buffer geometry onto a new array of the same shape as cropped_array
    pixel_mask_for_buffer = rasterio.features.rasterize(
        [buffer_geom],
        out_shape=cropped_array.shape,
        transform=cropped_transform,
        fill=0,
        all_touched=False, # Only pixels strictly inside
        dtype=np.uint8
    ).astype(bool) # Convert to boolean mask (True for inside, False for outside)

    # Apply the pixel mask to the cropped array to get pixels within the circular buffer
    pixels_within_buffer = cropped_array[pixel_mask_for_buffer]
    
    if pixels_within_buffer.size > 0:
        if stat_type == 'mean':
            return np.mean(pixels_within_buffer)
        elif stat_type == 'std':
            return np.std(pixels_within_buffer)
        else:
            raise ValueError("Unsupported stat_type. Use 'mean' or 'std'.")
    else:
        return np.nan # No valid pixels in buffer

def extract_raster_patch(raster_array, raster_profile, point_geom, patch_size):
    """
    Extracts a square image patch around a point for CNN input.
    Handles boundary conditions by padding the raster.
    """
    # Convert point_geom to pixel coordinates
    col, row = raster_profile['transform'](point_geom.x, point_geom.y, op=~raster_profile['transform'])
    col, row = int(col), int(row)

    half_patch = patch_size // 2

    # Create a padded array to handle points near edges gracefully
    padded_array = np.pad(raster_array, half_patch, mode='reflect') 
    
    # Adjust coordinates for the padded array
    padded_row_start = row + half_patch - half_patch
    padded_row_end = row + half_patch + half_patch
    padded_col_start = col + half_patch - half_patch
    padded_col_end = col + half_patch + half_patch

    # Extract patch from padded array
    patch = padded_array[padded_row_start:padded_row_end, padded_col_start:padded_col_end]
    
    # Ensure the patch is exactly patch_size x patch_size
    if patch.shape != (patch_size, patch_size):
        print(f"Warning: Patch for point at ({point_geom.x}, {point_geom.y}) is not {patch_size}x{patch_size}. Returning NaN array.")
        return np.full((patch_size, patch_size), np.nan) # Return NaN array if extraction fails
    
    return patch

In [4]:
rainy_df = pd.read_csv("data/Hydro_LULC_Rainy.csv")
rainy_df.head()

Unnamed: 0,Stations,River,Lat,Long,geometry,hydrological_dist_to_nearest_BF,num_upstream_BF,hydrological_dist_to_nearest_IND,num_upstream_IND,CrR,...,MR,SandR,SiltR,ClayR,FeR,variation_17_18,variation_18_19,variation_19_20,variation_20_21,variation_21_22
0,S1,Dhaleshwari,23.91026,90.229845,POINT (10044340.399756001 2742476.70627368),0.0002,6,0.00679,2,92.69,...,32.47,22,43,35,26700,14.396884,8.830319,11.964628,10.162627,10.226519
1,S2,Dhaleshwari,23.858227,90.240038,POINT (10045475.079325657 2736141.9436627394),2e-05,1,0.0,0,88.4,...,30.11,11,62,26,34970,12.920722,9.676226,13.489252,10.422661,11.249439
2,S3,Dhaleshwari,23.802571,90.24539,POINT (10046070.861240383 2729368.914036628),0.0002,6,0.0021,4,66.92,...,30.58,47,26,23,23970,11.966286,10.48311,15.459976,10.687158,11.446218
3,S4,Dhaleshwari,23.754298,90.246581,POINT (10046203.442753918 2723496.704977303),0.073026,2,0.027234,6,55.56,...,31.62,59,27,18,23990,21.385412,15.447605,20.766508,13.685776,13.207538
4,S5,Dhaleshwari,23.702157,90.277077,POINT (10049598.24194515 2717156.4153029486),0.074113,15,0.122163,5,64.5,...,32.84,61,24,10,35130,18.108141,14.249969,17.320514,12.567463,11.587011


In [5]:
rainy_df.columns

Index(['Stations', 'River', 'Lat', 'Long', 'geometry',
       'hydrological_dist_to_nearest_BF', 'num_upstream_BF',
       'hydrological_dist_to_nearest_IND', 'num_upstream_IND', 'CrR', 'NiR',
       'CuR', 'AsR', 'CdR', 'PbR', 'MR', 'SandR', 'SiltR', 'ClayR', 'FeR',
       'variation_17_18', 'variation_18_19', 'variation_19_20',
       'variation_20_21', 'variation_21_22'],
      dtype='object')

In [6]:
hm_col = ['Stations', 'River', 'Lat', 'Long', 'CrR', 'NiR',
       'CuR', 'AsR', 'CdR', 'PbR', 'FeR']

In [7]:
heavy_metal_df = rainy_df[hm_col]
heavy_metal_df.set_index('Stations', inplace=True)
heavy_metal_df.head()

Unnamed: 0_level_0,River,Lat,Long,CrR,NiR,CuR,AsR,CdR,PbR,FeR
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S1,Dhaleshwari,23.91026,90.229845,92.69,19.18,40.34,7.96,2.66,50.73,26700
S2,Dhaleshwari,23.858227,90.240038,88.4,17.21,41.56,9.88,2.97,38.9,34970
S3,Dhaleshwari,23.802571,90.24539,66.92,37.52,49.47,15.48,2.1,32.79,23970
S4,Dhaleshwari,23.754298,90.246581,55.56,26.08,69.77,18.77,1.79,43.4,23990
S5,Dhaleshwari,23.702157,90.277077,64.5,30.62,73.19,20.96,1.45,53.55,35130


In [8]:
print("\n--- Step 1.2: Loading LULC Rasters and establishing Reference CRS/Transform ---")
lulc_rasters = {}
lulc_profiles = {}
reference_crs = None
reference_transform = None
full_raster_height = None
full_raster_width = None

try:
    # Use the first LULC file to get the reference CRS and transform
    first_lulc_file = os.path.join(LULC_DIR, "LULC2017.tif")
    if not os.path.exists(first_lulc_file):
        raise FileNotFoundError(f"Reference LULC file not found: {first_lulc_file}")

    with rasterio.open(first_lulc_file) as src:
        reference_crs = src.crs
        reference_transform = src.transform
        full_raster_height, full_raster_width = src.shape

    for year in LULC_YEARS:
        filepath = os.path.join(LULC_DIR, f"LULC{year}.tif")
        # Load and reproject if necessary using the helper function
        lulc_rasters[year], lulc_profiles[year] = load_raster(filepath, reference_crs)
        
    if not lulc_rasters:
        raise ValueError("No LULC rasters were loaded. Check LULC_DIR and file names.")
        
    print(f"Loaded LULC rasters for years: {list(lulc_rasters.keys())}")
    print(f"Reference CRS (from LULC): {reference_crs}")

except Exception as e:
    print(f"Error loading LULC rasters or establishing reference: {e}")
    exit()


--- Step 1.2: Loading LULC Rasters and establishing Reference CRS/Transform ---
Loaded LULC rasters for years: [2017, 2018, 2019, 2020, 2021, 2022]
Reference CRS (from LULC): EPSG:32646


In [9]:
# Step 1.3: Load Sampling Points (reprojected to reference_crs)
try:
    sampling_points_gdf = gpd.read_file(SAMPLING_POINTS_PATH)
    if 'Stations' not in sampling_points_gdf.columns:
        sampling_points_gdf['Stations'] = [f"S{i+1}" for i in range(len(sampling_points_gdf))]
    sampling_points_gdf.set_index('Stations', inplace=True)

    if sampling_points_gdf.crs != reference_crs:
        sampling_points_gdf = sampling_points_gdf.to_crs(reference_crs)
        print(f"Reprojected sampling points from {sampling_points_gdf.crs} to {reference_crs}.")
    print(f"Loaded {len(sampling_points_gdf)} sampling points.")

except Exception as e:
    print(f"Error loading or reprojecting sampling points: {e}")
    exit()

Reprojected sampling points from PROJCS["WGS 84 / UTM zone 46N",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",93],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","32646"]] to EPSG:32646.
Loaded 17 sampling points.


## HYDRO & LULC

In [10]:
# --- Step 2: Calculate Hydrological and LULC Features ---
print("\n--- Step 2: Calculating Hydrological Properties and LULC Variations ---")

# Load Brick Field and Industry Points (reprojected to reference_crs)
brick_field_points_gdf = None
industry_points_gdf = None
try:
    brick_field_points_gdf = gpd.read_file(BRICK_FIELD_POINTS_PATH)
    industry_points_gdf = gpd.read_file(INDUSTRY_POINTS_PATH)

    if brick_field_points_gdf.crs != reference_crs:
        brick_field_points_gdf = brick_field_points_gdf.to_crs(reference_crs)
    if industry_points_gdf.crs != reference_crs:
        industry_points_gdf = industry_points_gdf.to_crs(reference_crs)
    print(f"Loaded {len(brick_field_points_gdf)} brick field points and {len(industry_points_gdf)} industry points.")
except Exception as e:
    print(f"Error loading or reprojecting brick field/industry points: {e}")
    exit()


--- Step 2: Calculating Hydrological Properties and LULC Variations ---
Loaded 270 brick field points and 195 industry points.


In [11]:
# Load DEM Raster
dem_array = None
dem_profile = None
try:
    dem_array, dem_profile = load_raster(DEM_PATH, reference_crs)
    print(f"Loaded DEM raster with shape: {dem_array.shape}")
except Exception as e:
    print(f"Error loading DEM raster: {e}")
    exit()

# Generate Binary Change Maps (Year-to-Year)
change_maps = {}
for i in range(len(LULC_YEARS) - 1):
    year1 = LULC_YEARS[i]
    year2 = LULC_YEARS[i+1]
    
    lulc_t1 = lulc_rasters[year1]
    lulc_t2 = lulc_rasters[year2]
    
    if lulc_t1.shape != lulc_t2.shape:
        print(f"Warning: LULC raster shapes mismatch for {year1} and {year2}. Skipping change map for this interval.")
        continue
    change_map_array = (lulc_t1 != lulc_t2).astype(np.uint8)
    change_maps[f'{year1}-{year2}'] = change_map_array

Loaded DEM raster with shape: (6266, 5764)


In [13]:
lulc_variation_results = []
hydro_properties_results = []
UNIFORM_BUFFER_RADIUS_METERS = 5000
OUTPUT_LULC_VARIATIONS_CSV = 'LULC_5km_Variations.csv'
OUTPUT_HYDRO_PROPERTIES_CSV = 'Hydrological_Properties.csv'


for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    
    point_lulc_results = {'Stations': station_id}
    point_hydro_results = {'Stations': station_id}

    # Calculate LULC Variations
    for interval, change_map_array in change_maps.items():
        try:
            proportion_changed = extract_neighborhood_stats(
                change_map_array, lulc_profiles[LULC_YEARS[0]], 
                point_geom, UNIFORM_BUFFER_RADIUS_METERS, stat_type='mean'
            )
            point_lulc_results[f'variation_{interval}'] = proportion_changed * 100
        except Exception as e:
            print(f"Error processing LULC for point {station_id} for interval {interval}: {e}")
            point_lulc_results[f'variation_{interval}'] = np.nan
    lulc_variation_results.append(point_lulc_results)

    # Calculate Hydrological Properties
    try:
        # Correctly get DEM Point Value using the inverse transform
        dem_transform_inv = ~dem_profile['transform']
        col, row = dem_transform_inv * (point_geom.x, point_geom.y)

        if 0 <= row < dem_array.shape[0] and 0 <= col < dem_array.shape[1]:
            point_hydro_results['dem_point_value'] = dem_array[int(row), int(col)]
        else:
            point_hydro_results['dem_point_value'] = np.nan

        # DEM Mean and StdDev in 5km radius
        mean_dem = extract_neighborhood_stats(dem_array, dem_profile, point_geom, UNIFORM_BUFFER_RADIUS_METERS, 'mean')
        std_dem = extract_neighborhood_stats(dem_array, dem_profile, point_geom, UNIFORM_BUFFER_RADIUS_METERS, 'std')
        point_hydro_results['dem_mean_5km'] = mean_dem
        point_hydro_results['dem_std_5km'] = std_dem

        # Euclidean Distance to Nearest Brick Field
        if not brick_field_points_gdf.empty:
            nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
            distance_bf = point_geom.distance(nearest_bf_geom)
            point_hydro_results['dist_to_nearest_BF'] = distance_bf
        else:
            point_hydro_results['dist_to_nearest_BF'] = np.nan

        # Euclidean Distance to Nearest Industry
        if not industry_points_gdf.empty:
            nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
            distance_ind = point_geom.distance(nearest_ind_geom)
            point_hydro_results['dist_to_nearest_IND'] = distance_ind
        else:
            point_hydro_results['dist_to_nearest_IND'] = np.nan

        # Count sources within 5km circular influence radius
        # This serves as a proxy for "num_upstream" without a true hydrological network.
        bf_in_radius = brick_field_points_gdf[brick_field_points_gdf.within(point_geom.buffer(UNIFORM_BUFFER_RADIUS_METERS))]
        point_hydro_results['num_within_5km_BF'] = len(bf_in_radius)
        
        ind_in_radius = industry_points_gdf[industry_points_gdf.within(point_geom.buffer(UNIFORM_BUFFER_RADIUS_METERS))]
        point_hydro_results['num_within_5km_IND'] = len(ind_in_radius)

    except Exception as e:
        print(f"Error processing hydrological features for point {station_id}: {e}")
        for key in ['dem_point_value', 'dem_mean_5km', 'dem_std_5km', 'dist_to_nearest_BF', 'dist_to_nearest_IND', 'num_within_5km_BF', 'num_within_5km_IND']:
            point_hydro_results[key] = np.nan
    
    hydro_properties_results.append(point_hydro_results)


lulc_variations_df_calculated = pd.DataFrame(lulc_variation_results).set_index('Stations')
lulc_variations_df_calculated.to_csv(OUTPUT_LULC_VARIATIONS_CSV)
print(f"LULC Variations calculated and saved to {OUTPUT_LULC_VARIATIONS_CSV}")

hydro_properties_df_calculated = pd.DataFrame(hydro_properties_results).set_index('Stations')
hydro_properties_df_calculated.to_csv(OUTPUT_HYDRO_PROPERTIES_CSV)
print(f"Hydrological Properties calculated and saved to {OUTPUT_HYDRO_PROPERTIES_CSV}")
print("\nFirst 5 rows of Hydrological Properties:")
print(hydro_properties_df_calculated.head())
hydro_properties_df_calculated

  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_

LULC Variations calculated and saved to LULC_5km_Variations.csv
Hydrological Properties calculated and saved to Hydrological_Properties.csv

First 5 rows of Hydrological Properties:
          dem_point_value  dem_mean_5km  dem_std_5km  dist_to_nearest_BF  \
Stations                                                                   
S1                  -45.0    -44.633938     3.938536         2647.402248   
S2                  -51.0    -45.282639     3.612707         3113.557901   
S3                  -52.0    -46.590000     2.837222         1841.694727   
S4                  -53.0    -47.396420     2.501793         2737.503779   
S5                  -47.0    -48.142467     2.406125         1637.673006   

          dist_to_nearest_IND  num_within_5km_BF  num_within_5km_IND  
Stations                                                              
S1                 522.326002                  5                  11  
S2                3000.625941                  4                   3  
S

  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]
  nearest_bf_geom = nearest_points(point_geom, brick_field_points_gdf.unary_union)[1]
  nearest_ind_geom = nearest_points(point_geom, industry_points_gdf.unary_union)[1]


Unnamed: 0_level_0,dem_point_value,dem_mean_5km,dem_std_5km,dist_to_nearest_BF,dist_to_nearest_IND,num_within_5km_BF,num_within_5km_IND
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1,-45.0,-44.633938,3.938536,2647.402248,522.326002,5,11
S2,-51.0,-45.282639,3.612707,3113.557901,3000.625941,4,3
S3,-52.0,-46.59,2.837222,1841.694727,1298.585986,11,7
S4,-53.0,-47.39642,2.501793,2737.503779,317.052883,8,5
S5,-47.0,-48.142467,2.406125,1637.673006,3307.441888,1,3
S6,-51.0,-48.655735,2.278458,3880.16797,1519.752716,4,4
S7,-50.0,-49.021744,2.115314,311.431998,4735.791556,34,1
S8,-50.0,-45.429237,4.920071,719.967325,936.147517,6,7
S9,-51.0,-45.847088,5.13817,1772.54035,3219.036625,11,2
S10,-45.0,-46.211437,4.946003,1448.820004,803.257359,32,3


In [20]:
# --- Step 4: Extract Raster-Derived Environmental Indices (5km radius) ---
print(f"\n--- Step 4: Extracting Raster-Derived Environmental Indices within {UNIFORM_BUFFER_RADIUS_METERS/1000}km radius ---")

raster_features_results = []

# Load all specified raster indices
loaded_indices = {}
for index_file in RASTER_INDICES_TO_EXTRACT:
    try:
        index_path = os.path.join(CAL_INDICES_DIR, index_file)
        
        # Use rasterio's built-in reprojection
        with rasterio.open(index_path) as src:
            # Reproject to target CRS if needed
            if src.crs != reference_crs:
                transform, width, height = rasterio.warp.calculate_default_transform(
                    src.crs, reference_crs, src.width, src.height, *src.bounds
                )
                array = np.empty((height, width))
                rasterio.warp.reproject(
                    src.read(1),
                    array,
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=reference_crs,
                    resampling=rasterio.warp.Resampling.bilinear
                )
                profile = src.profile.copy()
                profile.update({
                    'crs': reference_crs,
                    'transform': transform,
                    'width': width,
                    'height': height
                })
            else:
                array = src.read(1)
                profile = src.profile
            
        loaded_indices[index_file] = {'array': array, 'profile': profile}
        print(f"Successfully loaded {index_file}")
    except Exception as e:
        print(f"Error loading {index_file}: {str(e)}. Skipping.")
        continue

# Make sure sampling_points_gdf has a 'Stations' column
if 'Stations' not in sampling_points_gdf.columns:
    sampling_points_gdf['Stations'] = [f'S{i+1}' for i in range(len(sampling_points_gdf))]

for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    station_name = point_row['Stations']  # Get station name from the row
    
    point_results = {'Stations': station_name}  # Use the station name from the DataFrame

    for index_file, data in loaded_indices.items():
        index_name = os.path.splitext(index_file)[0]  # Get base name without extension
        
        try:
            # Extract mean and std values
            mean_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'mean'
            )
            std_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'std'
            )
            
            point_results[f'{index_name}_Mean_5km'] = mean_val
            point_results[f'{index_name}_Std_5km'] = std_val
            
        except Exception as e:
            print(f"Error extracting {index_name} features for station {station_name}: {str(e)}")
            point_results[f'{index_name}_Mean_5km'] = np.nan
            point_results[f'{index_name}_Std_5km'] = np.nan
            
    raster_features_results.append(point_results)

# Create DataFrame and set 'Stations' as index
raster_features_df = pd.DataFrame(raster_features_results)
if not raster_features_df.empty:
    raster_features_df.set_index('Stations', inplace=True)
    raster_features_df.to_csv(OUTPUT_RASTER_FEATURES_CSV)
    print(f"Raster-Derived Features calculated and saved to {OUTPUT_RASTER_FEATURES_CSV}")
    print("\nFirst 5 rows of Raster-Derived Features:")
    print(raster_features_df.head())
else:
    print("Warning: No raster features were extracted. Output file not created.")


--- Step 4: Extracting Raster-Derived Environmental Indices within 5.0km radius ---
Successfully loaded awei.tif
Successfully loaded bui.tif
Successfully loaded evi.tif
Successfully loaded mndwi.tif
Successfully loaded ndbi.tif
Successfully loaded ndbsi.tif
Successfully loaded ndsi.tif
Successfully loaded ndvi.tif
Successfully loaded ndwi.tif
Successfully loaded savi.tif
Successfully loaded ui.tif
Raster-Derived Features calculated and saved to /Users/rakibhhridoy/Five_Rivers/gis/Raster_Derived_Features.csv

First 5 rows of Raster-Derived Features:
          awei_Mean_5km  awei_Std_5km  bui_Mean_5km  bui_Std_5km  \
Stations                                                           
S1            -0.970974      0.121782     -0.398687     0.266561   
S2            -0.977016      0.117145     -0.409619     0.264413   
S3            -0.972220      0.125640     -0.358253     0.291229   
S4            -0.971724      0.122795     -0.419392     0.285069   
S5            -0.974956      0.11621

In [19]:
raster_features_df

Unnamed: 0_level_0,awei_Mean_5km,awei_Std_5km,bui_Mean_5km,bui_Std_5km,evi_Mean_5km,evi_Std_5km,mndwi_Mean_5km,mndwi_Std_5km,ndbi_Mean_5km,ndbi_Std_5km,...,ndsi_Mean_5km,ndsi_Std_5km,ndvi_Mean_5km,ndvi_Std_5km,ndwi_Mean_5km,ndwi_Std_5km,savi_Mean_5km,savi_Std_5km,ui_Mean_5km,ui_Std_5km
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S1,-0.970974,0.121782,-0.398687,0.266561,0.375045,0.194776,-0.261435,0.082982,-0.067305,0.110697,...,-0.261435,0.082982,0.331498,0.164419,-0.319363,0.137496,0.272334,0.136775,-0.200377,0.143458
S2,-0.977016,0.117145,-0.409619,0.264413,0.388628,0.19155,-0.271119,0.081485,-0.06873,0.108466,...,-0.271119,0.081485,0.340927,0.164087,-0.329941,0.136942,0.282308,0.135079,-0.20012,0.144412
S3,-0.97222,0.12564,-0.358253,0.291229,0.361353,0.210176,-0.269373,0.083794,-0.045821,0.115648,...,-0.269373,0.083794,0.312673,0.182824,-0.306228,0.156592,0.262196,0.150437,-0.160538,0.157549
S4,-0.971724,0.122795,-0.419392,0.285069,0.40822,0.206966,-0.27849,0.079952,-0.068137,0.110953,...,-0.27849,0.079952,0.351437,0.179951,-0.33486,0.155128,0.29598,0.147714,-0.19894,0.148451
S5,-0.974956,0.116216,-0.388471,0.278087,0.387919,0.207207,-0.275202,0.076475,-0.054336,0.104939,...,-0.275202,0.076475,0.334205,0.177889,-0.319534,0.150067,0.279881,0.147159,-0.183206,0.14157
S6,-0.97281,0.113159,-0.431385,0.272123,0.410806,0.205271,-0.269834,0.073287,-0.075617,0.108192,...,-0.269834,0.073287,0.355979,0.169932,-0.333875,0.142876,0.295819,0.143758,-0.212475,0.143652
S7,-0.963771,0.132233,-0.368749,0.276718,0.372262,0.205278,-0.26765,0.082475,-0.047322,0.110454,...,-0.26765,0.082475,0.321499,0.173728,-0.306672,0.147324,0.267411,0.144539,-0.1668,0.149097
S8,-0.920681,0.20401,-0.277171,0.280538,0.294065,0.218499,-0.230704,0.123064,-0.023973,0.116259,...,-0.230704,0.123064,0.253701,0.182819,-0.249957,0.162306,0.209311,0.151779,-0.123356,0.158084
S9,-0.927573,0.192626,-0.262372,0.288161,0.279277,0.217475,-0.226294,0.113976,-0.02037,0.120301,...,-0.226294,0.113976,0.242361,0.182765,-0.242052,0.159821,0.19977,0.152163,-0.119157,0.163844
S10,-0.933213,0.184802,-0.197565,0.287528,0.233909,0.212133,-0.219427,0.105856,0.004982,0.117966,...,-0.219427,0.105856,0.20275,0.181691,-0.211225,0.15893,0.167244,0.149108,-0.076646,0.162718


In [26]:
# --- Step 4: Extract Raster-Derived Environmental Indices (5km radius) ---
UNIFORM_BUFFER_RADIUS_METERS = 5000

raster_features_results = []

# Load all specified raster indices
loaded_indices = {}
for index_file in RASTER_INDICES_TO_EXTRACT:
    try:
        index_path = os.path.join(CAL_INDICES_DIR, index_file)
        
        # Use rasterio's built-in reprojection
        with rasterio.open(index_path) as src:
            # Reproject to target CRS if needed
            if src.crs != reference_crs:
                transform, width, height = rasterio.warp.calculate_default_transform(
                    src.crs, reference_crs, src.width, src.height, *src.bounds
                )
                array = np.empty((height, width))
                rasterio.warp.reproject(
                    src.read(1),
                    array,
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=reference_crs,
                    resampling=rasterio.warp.Resampling.bilinear
                )
                profile = src.profile.copy()
                profile.update({
                    'crs': reference_crs,
                    'transform': transform,
                    'width': width,
                    'height': height
                })
            else:
                array = src.read(1)
                profile = src.profile
            
        loaded_indices[index_file] = {'array': array, 'profile': profile}
        print(f"Successfully loaded {index_file}")
    except Exception as e:
        print(f"Error loading {index_file}: {str(e)}. Skipping.")
        continue

# Make sure sampling_points_gdf has a 'Stations' column
if 'Stations' not in sampling_points_gdf.columns:
    sampling_points_gdf['Stations'] = [f'S{i+1}' for i in range(len(sampling_points_gdf))]

for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    station_name = point_row['Stations']  # Get station name from the row
    
    point_results = {'Stations': station_name}  # Use the station name from the DataFrame

    for index_file, data in loaded_indices.items():
        index_name = os.path.splitext(index_file)[0]  # Get base name without extension
        
        try:
            # Extract mean and std values
            mean_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'mean'
            )
            std_val = extract_neighborhood_stats(
                data['array'], 
                data['profile'], 
                point_geom, 
                UNIFORM_BUFFER_RADIUS_METERS, 
                'std'
            )
            
            point_results[f'{index_name}_Mean_5km'] = mean_val
            point_results[f'{index_name}_Std_5km'] = std_val
            
        except Exception as e:
            print(f"Error extracting {index_name} features for station {station_name}: {str(e)}")
            point_results[f'{index_name}_Mean_5km'] = np.nan
            point_results[f'{index_name}_Std_5km'] = np.nan
            
    raster_features_results.append(point_results)

# Create DataFrame and set 'Stations' as index
raster_features_df = pd.DataFrame(raster_features_results)
if not raster_features_df.empty:
    raster_features_df.set_index('Stations', inplace=True)
    raster_features_df.to_csv(OUTPUT_RASTER_FEATURES_CSV)
    print(f"Raster-Derived Features calculated and saved to {OUTPUT_RASTER_FEATURES_CSV}")
    print("\nFirst 5 rows of Raster-Derived Features:")
    print(raster_features_df.head())
else:
    print("Warning: No raster features were extracted. Output file not created.")
raster_features_df

Successfully loaded awei.tif
Successfully loaded bui.tif
Successfully loaded evi.tif
Successfully loaded mndwi.tif
Successfully loaded ndbi.tif
Successfully loaded ndbsi.tif
Successfully loaded ndsi.tif
Successfully loaded ndvi.tif
Successfully loaded ndwi.tif
Successfully loaded savi.tif
Successfully loaded ui.tif
Raster-Derived Features calculated and saved to /Users/rakibhhridoy/Five_Rivers/gis/Raster_Derived_Features.csv

First 5 rows of Raster-Derived Features:
          awei_Mean_5km  awei_Std_5km  bui_Mean_5km  bui_Std_5km  \
Stations                                                           
S1            -0.969734      0.125220     -0.392400     0.268581   
S2            -0.975545      0.121653     -0.404656     0.270563   
S3            -0.969130      0.133851     -0.344786     0.295459   
S4            -0.966623      0.133369     -0.413579     0.287817   
S5            -0.975206      0.115340     -0.385491     0.277192   

          evi_Mean_5km  evi_Std_5km  mndwi_Mean_5km 

Unnamed: 0_level_0,awei_Mean_5km,awei_Std_5km,bui_Mean_5km,bui_Std_5km,evi_Mean_5km,evi_Std_5km,mndwi_Mean_5km,mndwi_Std_5km,ndbi_Mean_5km,ndbi_Std_5km,...,ndsi_Mean_5km,ndsi_Std_5km,ndvi_Mean_5km,ndvi_Std_5km,ndwi_Mean_5km,ndwi_Std_5km,savi_Mean_5km,savi_Std_5km,ui_Mean_5km,ui_Std_5km
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S1,-0.969734,0.12522,-0.3924,0.268581,0.370143,0.196729,-0.258778,0.084377,-0.065579,0.110679,...,-0.258778,0.084377,0.326918,0.16653,-0.315127,0.139971,0.268463,0.138414,-0.196683,0.144285
S2,-0.975545,0.121653,-0.404656,0.270563,0.38359,0.195119,-0.267662,0.083385,-0.068138,0.11101,...,-0.267662,0.083385,0.336557,0.167914,-0.325971,0.140518,0.278609,0.137878,-0.197708,0.148793
S3,-0.96913,0.133851,-0.344786,0.295459,0.353004,0.212681,-0.268019,0.086924,-0.040037,0.116289,...,-0.268019,0.086924,0.304957,0.186484,-0.299539,0.160875,0.256057,0.152663,-0.15111,0.159738
S4,-0.966623,0.133369,-0.413579,0.287817,0.404575,0.208671,-0.277133,0.084082,-0.065932,0.110034,...,-0.277133,0.084082,0.347748,0.18341,-0.331451,0.160435,0.293582,0.149312,-0.19633,0.147066
S5,-0.975206,0.11534,-0.385491,0.277192,0.386739,0.206494,-0.27688,0.075112,-0.052333,0.103177,...,-0.27688,0.075112,0.333129,0.178004,-0.319351,0.150169,0.279081,0.14661,-0.18114,0.139715
S6,-0.970859,0.116927,-0.420915,0.275077,0.403468,0.207132,-0.26724,0.074591,-0.0718,0.109332,...,-0.26724,0.074591,0.349287,0.171751,-0.327956,0.144789,0.290144,0.145079,-0.206604,0.145446
S7,-0.962687,0.135294,-0.3697,0.276092,0.373285,0.203899,-0.267627,0.08263,-0.047726,0.108989,...,-0.267627,0.08263,0.321981,0.174021,-0.307016,0.147871,0.268282,0.143934,-0.167008,0.147687
S8,-0.912257,0.214371,-0.273639,0.281524,0.289075,0.219483,-0.225617,0.12709,-0.024594,0.1174,...,-0.225617,0.12709,0.249634,0.183916,-0.245642,0.164066,0.205987,0.152365,-0.123514,0.158304
S9,-0.911569,0.209866,-0.268629,0.297243,0.27909,0.224655,-0.214845,0.119808,-0.02806,0.125581,...,-0.214845,0.119808,0.241047,0.187538,-0.237883,0.165049,0.19898,0.156682,-0.127595,0.169854
S10,-0.926832,0.191991,-0.186091,0.295221,0.22411,0.216209,-0.21266,0.106769,0.006925,0.123143,...,-0.21266,0.106769,0.193279,0.18446,-0.202531,0.161284,0.15981,0.151603,-0.070946,0.168767


In [28]:
# --- Step 4: Consolidate All Features, Scale Data, and Create Multi-Modal Inputs ---
print("\n--- Step 4: Consolidating All Features and Creating Multi-Modal Inputs ---")

# Step 4.1: Consolidate all dataframes
combined_features_df = heavy_metal_df.copy()
combined_features_df = combined_features_df.merge(lulc_variations_df_calculated, left_index=True, right_index=True, how='left')
combined_features_df = combined_features_df.merge(hydro_properties_df_calculated, left_index=True, right_index=True, how='left')
combined_features_df = combined_features_df.merge(raster_features_df, left_index=True, right_index=True, how='left')
combined_features_df.to_csv(OUTPUT_COMBINED_FEATURES_CSV)

print(f"Combined features shape: {combined_features_df.shape}")
print("\nFirst 5 rows of Combined Features:")
print(combined_features_df)

# Step 4.2: Prepare Inputs for each model type
# Split data into features (X) and target (y)
X_df = combined_features_df.drop(columns=['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', "FeR"]) # Example: Dropping heavy metal columns
y_df = combined_features_df[['CrR', 'NiR', 'CuR', 'AsR', 'CdR', 'PbR', "FeR"]]
print(f"Features (X) shape: {X_df.shape}, Targets (y) shape: {y_df.shape}")


--- Step 4: Consolidating All Features and Creating Multi-Modal Inputs ---
Combined features shape: (17, 44)

First 5 rows of Combined Features:
                 River        Lat       Long    CrR    NiR     CuR    AsR  \
Stations                                                                    
S1         Dhaleshwari  23.910260  90.229845  92.69  19.18   40.34   7.96   
S2         Dhaleshwari  23.858227  90.240038  88.40  17.21   41.56   9.88   
S3         Dhaleshwari  23.802571  90.245390  66.92  37.52   49.47  15.48   
S4         Dhaleshwari  23.754298  90.246581  55.56  26.08   69.77  18.77   
S5         Dhaleshwari  23.702157  90.277077  64.50  30.62   73.19  20.96   
S6         Dhaleshwari  23.657826  90.317763  60.35  11.72   98.28  10.42   
S7         Dhaleshwari  23.628000  90.388647  69.37  14.95   96.72  19.88   
S8               Turag  23.877666  90.350573  15.70  40.76   73.87  11.36   
S9               Turag  23.826986  90.342201  12.74  39.43   87.20  10.78   
S10    

In [30]:
# MLP Input: Scaled Tabular Data
# Drop non-numerical columns for scaling
X_tabular_numeric = X_df.select_dtypes(include=np.number)
scaler = StandardScaler()
X_mlp_input = scaler.fit_transform(X_tabular_numeric)
print(f"MLP Input (Scaled Tabular Features) shape: {X_mlp_input.shape}")

MLP Input (Scaled Tabular Features) shape: (17, 36)


In [34]:
def extract_raster_patch(raster_array, raster_profile, point_geom, patch_size):
    """
    Extracts a square image patch around a point for CNN input.
    Handles boundary conditions by padding the raster.
    """
    # Convert point_geom to pixel coordinates
    transform = raster_profile['transform']
    col, row = ~transform * (point_geom.x, point_geom.y)  # Correct coordinate transformation
    col, row = int(round(col)), int(round(row))
    
    half_patch = patch_size // 2
    
    # Get array dimensions
    height, width = raster_array.shape
    
    # Calculate bounds with padding if needed
    row_start = max(0, row - half_patch)
    row_end = min(height, row + half_patch + 1)
    col_start = max(0, col - half_patch)
    col_end = min(width, col + half_patch + 1)
    
    # Extract the patch
    patch = raster_array[row_start:row_end, col_start:col_end]
    
    # Pad if necessary (when near edges)
    pad_width = (
        (max(0, half_patch - row), max(0, row + half_patch + 1 - height)),
        (max(0, half_patch - col), max(0, col + half_patch + 1 - width))
    )
    
    if any(p > 0 for p in sum(pad_width, ())):
        patch = np.pad(patch, pad_width, mode='constant', constant_values=0)
    
    return patch

# CNN Input: Stacked Raster Patches
X_cnn_input = {}
for station_id, point_row in sampling_points_gdf.iterrows():
    point_geom = point_row.geometry
    patches = []
    
    # Create a patch for each raster (DEM, LULC2022, all indices)
    for year in LULC_YEARS:
        try:
            lulc_array = lulc_rasters[year]
            lulc_profile = lulc_profiles[year]
            patch = extract_raster_patch(lulc_array, lulc_profile, point_geom, CNN_PATCH_SIZE)
            patches.append(patch)
        except Exception as e:
            print(f"Error extracting LULC {year} patch for station {station_id}: {str(e)}")
            patches.append(np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE)))
    
    try:
        dem_patch = extract_raster_patch(dem_array, dem_profile, point_geom, CNN_PATCH_SIZE)
        patches.append(dem_patch)
    except Exception as e:
        print(f"Error extracting DEM patch for station {station_id}: {str(e)}")
        patches.append(np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE)))
    
    for index_file, data in loaded_indices.items():
        try:
            index_patch = extract_raster_patch(data['array'], data['profile'], point_geom, CNN_PATCH_SIZE)
            patches.append(index_patch)
        except Exception as e:
            print(f"Error extracting {index_file} patch for station {station_id}: {str(e)}")
            patches.append(np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE)))
    
    # Stack the patches to create a multi-channel image input
    try:
        stacked_patches = np.stack(patches, axis=-1)
        # Handle NaN values if any patches failed to extract correctly
        stacked_patches = np.nan_to_num(stacked_patches, nan=0.0)
        X_cnn_input[station_id] = stacked_patches
    except Exception as e:
        print(f"Error stacking patches for station {station_id}: {str(e)}")
        X_cnn_input[station_id] = np.zeros((CNN_PATCH_SIZE, CNN_PATCH_SIZE, len(patches)))

# Convert to numpy array
X_cnn_input_list = np.array(list(X_cnn_input.values()))
print(f"CNN Input (Stacked Raster Patches) shape: {X_cnn_input_list.shape}")

CNN Input (Stacked Raster Patches) shape: (17, 33, 33, 18)


In [35]:
# GNN Input: Graph Structure and Node Features
# Create a proximity graph based on GNN_EDGE_DISTANCE_THRESHOLD_METERS
G = nx.Graph()
station_points = {station: point.coords[0] for station, point in sampling_points_gdf['geometry'].items()}

# Add nodes with their features
for station_id, features in X_tabular_numeric.iterrows():
    G.add_node(station_id, features=features.to_dict())

# Add edges based on proximity
station_ids = list(station_points.keys())
distances = cdist(list(station_points.values()), list(station_points.values()))
for i in range(len(station_ids)):
    for j in range(i + 1, len(station_ids)):
        if distances[i, j] <= GNN_EDGE_DISTANCE_THRESHOLD_METERS:
            G.add_edge(station_ids[i], station_ids[j], weight=distances[i, j])

print(f"GNN Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


GNN Graph created with 17 nodes and 2 edges.


In [44]:
y_df

Unnamed: 0_level_0,CrR,NiR,CuR,AsR,CdR,PbR,FeR
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S1,92.69,19.18,40.34,7.96,2.66,50.73,26700
S2,88.4,17.21,41.56,9.88,2.97,38.9,34970
S3,66.92,37.52,49.47,15.48,2.1,32.79,23970
S4,55.56,26.08,69.77,18.77,1.79,43.4,23990
S5,64.5,30.62,73.19,20.96,1.45,53.55,35130
S6,60.35,11.72,98.28,10.42,0.82,37.88,38700
S7,69.37,14.95,96.72,19.88,0.97,33.47,43480
S8,15.7,40.76,73.87,11.36,2.17,98.1,45600
S9,12.74,39.43,87.2,10.78,3.6,170.4,47900
S10,11.13,47.01,80.78,18.9,2.79,108.6,50395


In [42]:
X_df = X_df.drop(columns=["River", "Lat", "Long"])
X_df.head()

Unnamed: 0_level_0,variation_2017-2018,variation_2018-2019,variation_2019-2020,variation_2020-2021,variation_2021-2022,dem_point_value,dem_mean_5km,dem_std_5km,dist_to_nearest_BF,dist_to_nearest_IND,...,ndsi_Mean_5km,ndsi_Std_5km,ndvi_Mean_5km,ndvi_Std_5km,ndwi_Mean_5km,ndwi_Std_5km,savi_Mean_5km,savi_Std_5km,ui_Mean_5km,ui_Std_5km
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S1,14.396884,8.830319,11.964628,10.162627,10.226519,-45.0,-44.633938,3.938536,2647.402248,522.326002,...,-0.258778,0.084377,0.326918,0.16653,-0.315127,0.139971,0.268463,0.138414,-0.196683,0.144285
S2,12.920722,9.676226,13.489252,10.422661,11.249439,-51.0,-45.282639,3.612707,3113.557901,3000.625941,...,-0.267662,0.083385,0.336557,0.167914,-0.325971,0.140518,0.278609,0.137878,-0.197708,0.148793
S3,11.966286,10.48311,15.459976,10.687158,11.446218,-52.0,-46.59,2.837222,1841.694727,1298.585986,...,-0.268019,0.086924,0.304957,0.186484,-0.299539,0.160875,0.256057,0.152663,-0.15111,0.159738
S4,21.385412,15.447605,20.766508,13.685776,13.207538,-53.0,-47.39642,2.501793,2737.503779,317.052883,...,-0.277133,0.084082,0.347748,0.18341,-0.331451,0.160435,0.293582,0.149312,-0.19633,0.147066
S5,18.108141,14.249969,17.320514,12.567463,11.587011,-47.0,-48.142467,2.406125,1637.673006,3307.441888,...,-0.27688,0.075112,0.333129,0.178004,-0.319351,0.150169,0.279081,0.14661,-0.18114,0.139715


In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# --- Placeholder Tabular Data (REPLACE THIS WITH YOUR ACTUAL DATA) ---
# This data is for demonstration and should be replaced with your loaded X_df and y_df.
# Example: 100 samples and 20 features
X_df = pd.DataFrame(np.random.rand(100, 20), columns=[f'feature_{i}' for i in range(20)])
y_df = pd.DataFrame(np.random.rand(100, 1) * 100, columns=['target'])

# --- Data Preparation ---
# CRITICAL FIX: Ensure X_df and y_df have the same number of samples
if len(X_df) != len(y_df):
    raise ValueError("X_df and y_df must have the same number of rows.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Scale the training and test features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Modeling with Tabular Data ---

# 1. Classical Machine Learning Model (Random Forest)
print("--- Training and Evaluating: Random Forest ---")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# The .ravel() call is correct for scikit-learn's target variable format
rf_model.fit(X_train_scaled, y_train.values.ravel())
rf_predictions = rf_model.predict(X_test_scaled)

print(f"R-squared: {r2_score(y_test, rf_predictions):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, rf_predictions):.4f}")

# 2. Hybrid ML + PMF Model (Conceptual)
print("\n--- Training and Evaluating: Hybrid RF + PMF ---")
# Replace this placeholder `pmf_factors` with your actual PMF results.
pmf_factors = np.random.rand(len(X_df), 3) # Assume 3 PMF factors as new features

# CRITICAL FIX: Ensure the number of rows in pmf_factors matches X_df
if len(X_df) != len(pmf_factors):
    raise ValueError("X_df and pmf_factors must have the same number of rows.")

X_hybrid = np.hstack([X_df.values, pmf_factors])

X_hybrid_train, X_hybrid_test, _, _ = train_test_split(X_hybrid, y_df, test_size=0.2, random_state=42)

rf_hybrid = RandomForestRegressor(n_estimators=100, random_state=42)
rf_hybrid.fit(X_hybrid_train, y_train.values.ravel())
rf_hybrid_predictions = rf_hybrid.predict(X_hybrid_test)

print(f"R-squared: {r2_score(y_test, rf_hybrid_predictions):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, rf_hybrid_predictions):.4f}")

--- Training and Evaluating: Random Forest ---
R-squared: -0.0732
Mean Squared Error: 764.1065

--- Training and Evaluating: Hybrid RF + PMF ---
R-squared: -0.1457
Mean Squared Error: 815.6963
