In [3]:
# notebooks/07_feature_engineering.ipynb

import pandas as pd
import geopandas as gpd
import numpy as np
import rasterio
from rasterstats import zonal_stats
from datetime import datetime
from abc import ABC, abstractmethod
from pathlib import Path

print("="*60)
print("KOLKATA FLOOD PREDICTION - FEATURE ENGINEERING")
print("="*60)

# ============================================
# MODULAR DATA SOURCE DESIGN
# ============================================

class FloodDataSource(ABC):
    """Abstract base class for flood data sources"""
    @abstractmethod
    def get_flood_labels(self, date_range=None):
        pass

class NewsBasedFloodData(FloodDataSource):
    """Current implementation using news-validated floods"""
    def __init__(self, csv_path):
        self.csv_path = csv_path
    
    def get_flood_labels(self, date_range=None):
        df = pd.read_csv(self.csv_path)
        df['date'] = pd.to_datetime(df['date'])
        if date_range:
            df = df[(df['date'] >= date_range[0]) & (df['date'] <= date_range[1])]
        return df

class SatelliteFloodData(FloodDataSource):
    """Future implementation using Sentinel-2"""
    def get_flood_labels(self, date_range=None):
        # TODO: Implement satellite-based flood detection
        raise NotImplementedError("Satellite integration coming in next phase!")

# ============================================
# DATA LOADING
# ============================================

print("\n1. Loading data assets...")

# Initialize flood data source
flood_source = NewsBasedFloodData('../data/processed/kolkata_flood_events_2023_2025.csv')

# Load FABDEM-processed ward statistics
print("   Loading FABDEM ward statistics...")
wards = gpd.read_file('../data/processed/kolkata_wards_fabdem_complete.gpkg')

# Clean ward IDs
wards['WARD'] = wards['WARD'].str.replace('\n', '').astype(int)

# Add log-transformed flow for better ML scaling
wards['flow_log'] = np.log10(wards['flow_fabdem_max'] + 1)

# Flag wetland interface zones (legitimate extreme flow)
wetland_interface_wards = [109, 108, 58, 107, 127]
wards['is_wetland_interface'] = wards['WARD'].isin(wetland_interface_wards).astype(int)

print(f"   ✓ Loaded {len(wards)} wards")
print(f"   ✓ Identified {sum(wards['is_wetland_interface'])} wetland interface wards")

# Load flood events
print("   Loading historical flood events...")
flood_events = flood_source.get_flood_labels()
print(f"   ✓ Loaded {len(flood_events)} flood records")
print(f"   ✓ Date range: {flood_events['date'].min().date()} to {flood_events['date'].max().date()}")

# ============================================
# RAINFALL DATA EXTRACTION
# ============================================

print("\n2. Processing rainfall data...")

# Check if rainfall file exists
rainfall_path = Path('../data/rainfall/kolkata_rainfall_2025_monsoon.tif')

if rainfall_path.exists():
    print("   Extracting rainfall statistics per ward...")
    
    # Extract stats for each monsoon month
    rain_stats = {}
    for band, month in enumerate(['june', 'july', 'aug', 'sept'], 1):
        stats = zonal_stats(
            wards.geometry, 
            str(rainfall_path), 
            band=band, 
            stats=['mean', 'max', 'std']
        )
        rain_stats[f'rain_{month}_mean'] = [s['mean'] if s else 0 for s in stats]
        rain_stats[f'rain_{month}_max'] = [s['max'] if s else 0 for s in stats]
        rain_stats[f'rain_{month}_std'] = [s['std'] if s else 0 for s in stats]
    
    # Add to ward dataframe
    for col, values in rain_stats.items():
        wards[col] = values
    
    print("   ✓ Added rainfall statistics for 4 months")
else:
    print("   ⚠ Rainfall file not found. Using synthetic data for now.")
    for month in ['june', 'july', 'aug', 'sept']:
        wards[f'rain_{month}_mean'] = np.random.uniform(200, 800, len(wards))
        wards[f'rain_{month}_max'] = np.random.uniform(300, 900, len(wards))
        wards[f'rain_{month}_std'] = np.random.uniform(50, 150, len(wards))

# ============================================
# CREATE ML TRAINING DATASET
# ============================================

print("\n3. Creating training dataset...")

training_samples = []

# For each flood event
for event_date in flood_events['date'].unique():
    event_floods = flood_events[flood_events['date'] == event_date]
    flooded_wards = set(event_floods[event_floods['flooded'] == 1]['ward'].values)
    
    # Map to appropriate rainfall month
    month_map = {5: 'may', 6: 'june', 7: 'july', 8: 'aug', 9: 'sept', 10: 'oct'}
    rain_month = month_map.get(event_date.month, 'sept')
    
    # Create sample for each ward
    for _, ward in wards.iterrows():
        ward_id = ward['WARD']
        
        # Build feature vector - NO MANUAL WEIGHTS!
        sample = {
            # Identifiers
            'ward_id': ward_id,
            'date': event_date,
            'year': event_date.year,
            'month': event_date.month,
            
            # Elevation features (FABDEM)
            'elevation_mean': ward.get('elev_fabdem_mean', 0),
            'elevation_min': ward.get('elev_fabdem_min', 0),
            'elevation_max': ward.get('elev_fabdem_max', 0),
            'elevation_range': ward.get('elev_fabdem_max', 0) - ward.get('elev_fabdem_min', 0),
            
            # Flow accumulation features
            'flow_max': ward.get('flow_fabdem_max', 0),
            'flow_mean': ward.get('flow_fabdem_mean', 0),
            'flow_sum': ward.get('flow_fabdem_sum', 0),
            'flow_log': ward.get('flow_log', 0),  # Log-scaled for ML
            
            # Rainfall features
            'rainfall_mean': ward.get(f'rain_{rain_month}_mean', 0),
            'rainfall_max': ward.get(f'rain_{rain_month}_max', 0),
            'rainfall_std': ward.get(f'rain_{rain_month}_std', 0),
            
            # Special flags
            'is_wetland_interface': ward.get('is_wetland_interface', 0),
            
            # Target variable
            'flooded': 1 if ward_id in flooded_wards else 0
        }
        training_samples.append(sample)

# Convert to DataFrame
df = pd.DataFrame(training_samples)
print(f"   ✓ Created {len(df)} training samples")

# ============================================
# DATA QUALITY ANALYSIS
# ============================================

print("\n4. Data quality check...")

# Class balance
flood_rate = df['flooded'].mean()
print(f"   Class balance: {flood_rate:.1%} positive (flooded)")
print(f"   Flood events: {df['flooded'].sum()}")
print(f"   Non-flood events: {(df['flooded'] == 0).sum()}")

# Feature correlations with flooding
print("\n5. Feature importance (correlation with flooding):")
print("-" * 50)

feature_cols = [
    'elevation_mean', 'elevation_min', 'elevation_range',
    'flow_max', 'flow_mean', 'flow_log',
    'rainfall_mean', 'rainfall_max',
    'is_wetland_interface'
]

correlations = df[feature_cols + ['flooded']].corr()['flooded'].sort_values(ascending=False)
for feat, corr in correlations.items():
    if feat != 'flooded':
        print(f"   {feat:25s}: {corr:+.3f}")

# ============================================
# SPECIAL ANALYSIS: HIGH-RISK WARDS
# ============================================

print("\n6. High-risk ward analysis...")

# Wards that flood most frequently
flood_frequency = df.groupby('ward_id')['flooded'].mean().sort_values(ascending=False)
print("\n   Top 10 most frequently flooded wards:")
for ward, freq in flood_frequency.head(10).items():
    special_note = " [Wetland Interface]" if ward in wetland_interface_wards else ""
    print(f"   Ward {ward:3d}: {freq:5.1%} flood rate{special_note}")

# Ward 109 special analysis (extreme flow accumulation)
ward_109_data = df[df['ward_id'] == 109]
if not ward_109_data.empty:
    print("\n   Ward 109 Special Analysis (Extreme Flow Zone):")
    print(f"   - Flow accumulation: {ward_109_data['flow_max'].iloc[0]:,.0f} (legitimate - wetland drainage)")
    print(f"   - Elevation: {ward_109_data['elevation_mean'].iloc[0]:.2f}m (lowest in city)")
    print(f"   - Flood rate: {ward_109_data['flooded'].mean():.1%}")

# ============================================
# SAVE DATASETS
# ============================================

print("\n7. Saving processed data...")

# Save full training dataset
output_path = '../data/processed/ml_training_data.csv'
df.to_csv(output_path, index=False)
print(f"   ✓ Saved training data: {output_path}")

# Save feature matrix only (for ML modeling)
feature_matrix = df[feature_cols + ['flooded', 'ward_id', 'date']]
feature_matrix.to_csv('../data/processed/feature_matrix.csv', index=False)
print(f"   ✓ Saved feature matrix: {len(feature_cols)} features")

# Save ward static features for predictions
ward_features = wards[['WARD', 'elev_fabdem_mean', 'elev_fabdem_min', 
                       'flow_fabdem_max', 'flow_fabdem_mean', 'flow_log',
                       'is_wetland_interface']]
ward_features.to_csv('../data/processed/ward_static_features.csv', index=False)
print(f"   ✓ Saved ward static features")

# ============================================
# SUMMARY STATISTICS
# ============================================

print("\n" + "="*60)
print("FEATURE ENGINEERING COMPLETE")
print("="*60)

print(f"""
Summary:
- Total samples: {len(df):,}
- Unique dates: {df['date'].nunique()}
- Unique wards: {df['ward_id'].nunique()}
- Features: {len(feature_cols)}
- Target balance: {flood_rate:.1%} positive

Ready for:
1. Baseline ML model (Random Forest)
2. Satellite data integration 
3. Deep learning model training

Next step: Run notebook 09_sentinel_extraction.ipynb
""")

KOLKATA FLOOD PREDICTION - FEATURE ENGINEERING

1. Loading data assets...
   Loading FABDEM ward statistics...
   ✓ Loaded 141 wards
   ✓ Identified 5 wetland interface wards
   Loading historical flood events...
   ✓ Loaded 184 flood records
   ✓ Date range: 2023-09-02 to 2025-10-10

2. Processing rainfall data...
   Extracting rainfall statistics per ward...




   ✓ Added rainfall statistics for 4 months

3. Creating training dataset...
   ✓ Created 1128 training samples

4. Data quality check...
   Class balance: 9.2% positive (flooded)
   Flood events: 104
   Non-flood events: 1024

5. Feature importance (correlation with flooding):
--------------------------------------------------
   rainfall_mean            : +0.138
   rainfall_max             : +0.137
   flow_log                 : +0.131
   elevation_range          : +0.055
   flow_max                 : +0.038
   is_wetland_interface     : +0.022
   flow_mean                : +0.005
   elevation_mean           : -0.072
   elevation_min            : -0.085

6. High-risk ward analysis...

   Top 10 most frequently flooded wards:
   Ward 130: 87.5% flood rate
   Ward  68: 87.5% flood rate
   Ward  93: 87.5% flood rate
   Ward  66: 75.0% flood rate
   Ward  65: 75.0% flood rate
   Ward  73: 62.5% flood rate
   Ward  96: 62.5% flood rate
   Ward  91: 62.5% flood rate
   Ward  99: 62.5% flood

In [2]:
!pip install rasterstats

Collecting rasterstats
  Downloading rasterstats-0.20.0-py3-none-any.whl.metadata (4.2 kB)
Collecting fiona (from rasterstats)
  Downloading fiona-1.10.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (56 kB)
Collecting simplejson (from rasterstats)
  Downloading simplejson-3.20.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading rasterstats-0.20.0-py3-none-any.whl (17 kB)
Downloading fiona-1.10.1-cp313-cp313-macosx_11_0_arm64.whl (14.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.7/14.7 MB[0m [31m31.7 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading simplejson-3.20.2-cp313-cp313-macosx_11_0_arm64.whl (75 kB)
Installing collected packages: simplejson, fiona, rasterstats
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [rasterstats][0m [rasterstats]
[1A[2KSuccessfully installed fiona-1.10.1 rasterstats-0.20.0 simplejson-3.20.2
