In [2]:
import geopandas as gpd
import pandas as pd

# Load the new statistics
wards = gpd.read_file('../data/processed/kolkata_wards_fabdem_complete.gpkg')

print("Columns in dataset:")
print(wards.columns.tolist())

# Check the stats
stats_summary = wards[['WARD', 'elev_fabdem_mean', 'flow_fabdem_max']].describe()
print("\nStatistics summary:")
print(stats_summary)

# Find most flood-prone wards based on new data
wards['flood_risk_score'] = (
    (1 - wards['elev_fabdem_mean']/wards['elev_fabdem_mean'].max()) * 0.5 +
    (wards['flow_fabdem_max']/wards['flow_fabdem_max'].max()) * 0.5
)

print("\nTop 10 highest risk wards (FABDEM-based):")
top_risk = wards.nlargest(10, 'flood_risk_score')[['WARD', 'elev_fabdem_mean', 'flow_fabdem_max', 'flood_risk_score']]
print(top_risk)

# Do these match our known flooding wards?
known_flood_wards = [93, 66, 68, 107, 111, 130, 73, 91, 65, 108]
print(f"\nKnown flood wards in top risk: {[w for w in known_flood_wards if w in top_risk['WARD'].values]}")

Columns in dataset:
['WARD', 'elev_fabdem_mean', 'elev_fabdem_min', 'elev_fabdem_max', 'flow_fabdem_sum', 'flow_fabdem_mean', 'flow_fabdem_max', 'geometry']

Statistics summary:
       elev_fabdem_mean  flow_fabdem_max
count        141.000000       141.000000
mean           6.091961      5643.602340
std            1.849904     10554.260188
min            1.861379        64.989901
25%            4.628756       831.961169
50%            6.158298      2225.461107
75%            7.246805      5657.916676
max           12.585321     85685.108132

Top 10 highest risk wards (FABDEM-based):
      WARD  elev_fabdem_mean  flow_fabdem_max  flood_risk_score
49   109\n          1.861379     85685.108132          0.926050
17   108\n          1.905779     56988.392796          0.756831
58   127\n          3.715842     50692.749415          0.648182
119  107\n          2.510570     16865.035639          0.498671
69   124\n          3.761244     15204.973323          0.439296
140   58\n          3.8580

In [3]:
# Investigate the outlier
import numpy as np

# Check if it's a statistical outlier
Q1 = wards['flow_fabdem_max'].quantile(0.25)
Q3 = wards['flow_fabdem_max'].quantile(0.75)
IQR = Q3 - Q1
outlier_threshold = Q3 + 3 * IQR

print(f"Outlier threshold: {outlier_threshold:.0f}")
print(f"Ward 109 value: {wards[wards['WARD'] == 109]['flow_fabdem_max'].values[0]:.0f}")

# Check other statistics for Ward 109
ward_109 = wards[wards['WARD'] == 109]
print(f"\nWard 109 details:")
print(f"Elevation: {ward_109['elev_fabdem_mean'].values[0]:.2f}m")
print(f"Flow mean: {ward_109['flow_fabdem_mean'].values[0]:.0f}")
print(f"Flow max: {ward_109['flow_fabdem_max'].values[0]:.0f}")

# Compare with neighbors
neighbors = [108, 110, 107]  # Assuming these are nearby
for w in neighbors:
    w_data = wards[wards['WARD'] == w]
    if not w_data.empty:
        print(f"Ward {w}: flow_max = {w_data['flow_fabdem_max'].values[0]:.0f}")

Outlier threshold: 20136


IndexError: index 0 is out of bounds for axis 0 with size 0

In [4]:
# First, let's see what's actually in the WARD column
print("Unique ward values (first 10):")
print(sorted(wards['WARD'].unique())[:10])

# The issue is the \n characters - let's clean properly
wards['WARD'] = wards['WARD'].str.replace('\n', '').astype(int)

# Now check Ward 109
ward_109_data = wards[wards['WARD'] == 109]
if not ward_109_data.empty:
    print(f"\nWard 109 flow_max: {ward_109_data['flow_fabdem_max'].values[0]:.0f}")
else:
    print("Ward 109 not found in dataset")

# Check which wards have highest flow
print("\nTop 5 wards by flow accumulation:")
top_flow = wards.nlargest(5, 'flow_fabdem_max')[['WARD', 'flow_fabdem_max']]
print(top_flow)

Unique ward values (first 10):
['1\n', '10\n', '100\n', '101\n', '102\n', '103\n', '104\n', '105\n', '106\n', '107\n']

Ward 109 flow_max: 85685

Top 5 wards by flow accumulation:
    WARD  flow_fabdem_max
49   109     85685.108132
17   108     56988.392796
58   127     50692.749415
85    21     29114.367130
32    75     26775.713509


In [5]:
# Don't cap! Instead, use log transformation for modeling
import numpy as np

# Add a feature flag for extreme drainage zones
wards['is_wetland_interface'] = wards['WARD'].isin([109, 108, 58, 107])

# Log transform for model training (preserves relationships)
wards['flow_log'] = np.log10(wards['flow_fabdem_max'] + 1)

# Keep original for risk assessment
print("Critical drainage wards (wetland interface):")
critical_wards = wards[wards['flow_fabdem_max'] > 30000][['WARD', 'flow_fabdem_max', 'elev_fabdem_mean']]
print(critical_wards)

# These wards SHOULD show extreme flood risk
print("\nWard 109 characteristics:")
print("- Natural drainage collector for eastern Kolkata")
print("- Gateway to East Kolkata Wetlands")
print("- Extreme flood risk is REAL, not an artifact")

Critical drainage wards (wetland interface):
    WARD  flow_fabdem_max  elev_fabdem_mean
17   108     56988.392796          1.905779
49   109     85685.108132          1.861379
58   127     50692.749415          3.715842

Ward 109 characteristics:
- Natural drainage collector for eastern Kolkata
- Gateway to East Kolkata Wetlands
- Extreme flood risk is REAL, not an artifact
