In [3]:
import ee
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
import time

# Initialize Earth Engine
ee.Initialize(project='kolkata-flood-mapping')

# Paths
DATA_DIR = Path("../data")
PROCESSED_DIR = DATA_DIR / "processed"
MODIS_DIR = DATA_DIR / "modis"
MODIS_DIR.mkdir(exist_ok=True)

# Load ward boundaries from GEE asset
wards_fc = ee.FeatureCollection("projects/kolkata-flood-mapping/assets/kmc_wards")

print(f"Loaded {wards_fc.size().getInfo()} wards from GEE asset")

# Get ward IDs
ward_ids = wards_fc.aggregate_array('WARD').getInfo()
print(f"Ward column: WARD")
print(f"Sample ward IDs: {ward_ids[:5]}")

# Date range
START_DATE = '2014-01-01'
END_DATE = '2025-11-24'

print(f"Date range: {START_DATE} to {END_DATE}")
print("Setup complete!")

Loaded 141 wards from GEE asset
Ward column: WARD
Sample ward IDs: ['93', '61', '86', '90', '26']
Date range: 2014-01-01 to 2025-11-24
Setup complete!


In [4]:
# MODIS Vegetation Indices (250m, 16-day)
modis_vi = ee.ImageCollection("MODIS/061/MOD13Q1") \
    .filterDate(START_DATE, END_DATE) \
    .filterBounds(wards_fc)

# MODIS Land Surface Temperature (1km, 8-day)
modis_lst = ee.ImageCollection("MODIS/061/MOD11A2") \
    .filterDate(START_DATE, END_DATE) \
    .filterBounds(wards_fc)

print(f"MODIS VI images: {modis_vi.size().getInfo()}")
print(f"MODIS LST images: {modis_lst.size().getInfo()}")

MODIS VI images: 273
MODIS LST images: 546


In [5]:
def extract_vi_for_ward(image, ward_feature):
    """Extract NDVI and EVI for a single ward from a single image."""
    # Scale factors
    ndvi = image.select('NDVI').multiply(0.0001)
    evi = image.select('EVI').multiply(0.0001)

    # Get date
    date = ee.Date(image.get('system:time_start')).format('YYYY-MM-dd')

    # Calculate stats for ward
    stats = ndvi.addBands(evi).reduceRegion(
        reducer=ee.Reducer.mean().combine(
            reducer2=ee.Reducer.stdDev(),
            sharedInputs=True
        ),
        geometry=ward_feature.geometry(),
        scale=250,
        maxPixels=1e9
    )

    return ee.Feature(None, {
        'ward_id': ward_feature.get('ward_id'),
        'date': date,
        'ndvi_mean': stats.get('NDVI_mean'),
        'ndvi_std': stats.get('NDVI_stdDev'),
        'evi_mean': stats.get('EVI_mean'),
        'evi_std': stats.get('EVI_stdDev')
    })


def extract_lst_for_ward(image, ward_feature):
    """Extract LST for a single ward from a single image."""
    # Scale factor and convert to Celsius
    lst_day = image.select('LST_Day_1km').multiply(0.02).subtract(273.15)
    lst_night = image.select('LST_Night_1km').multiply(0.02).subtract(273.15)

    date = ee.Date(image.get('system:time_start')).format('YYYY-MM-dd')

    stats = lst_day.addBands(lst_night).reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=ward_feature.geometry(),
        scale=1000,
        maxPixels=1e9
    )

    return ee.Feature(None, {
        'ward_id': ward_feature.get('ward_id'),
        'date': date,
        'lst_day_c': stats.get('LST_Day_1km'),
        'lst_night_c': stats.get('LST_Night_1km')
    })

In [9]:
def extract_vi_for_ward(image, ward_feature):
    """Extract NDVI and EVI for a single ward from a single image."""
    # Scale factors
    ndvi = image.select('NDVI').multiply(0.0001)
    evi = image.select('EVI').multiply(0.0001)

    # Get date
    date = ee.Date(image.get('system:time_start')).format('YYYY-MM-dd')

    # Calculate stats for ward
    stats = ndvi.addBands(evi).reduceRegion(
        reducer=ee.Reducer.mean().combine(
            reducer2=ee.Reducer.stdDev(),
            sharedInputs=True
        ),
        geometry=ward_feature.geometry(),
        scale=250,
        maxPixels=1e9
    )

    return ee.Feature(None, {
        'ward_id': ward_feature.get('WARD'),  # Fixed: WARD not ward_id
        'date': date,
        'ndvi_mean': stats.get('NDVI_mean'),
        'ndvi_std': stats.get('NDVI_stdDev'),
        'evi_mean': stats.get('EVI_mean'),
        'evi_std': stats.get('EVI_stdDev')
    })


def extract_lst_for_ward(image, ward_feature):
    """Extract LST for a single ward from a single image."""
    # Scale factor and convert to Celsius
    lst_day = image.select('LST_Day_1km').multiply(0.02).subtract(273.15)
    lst_night = image.select('LST_Night_1km').multiply(0.02).subtract(273.15)

    date = ee.Date(image.get('system:time_start')).format('YYYY-MM-dd')

    stats = lst_day.addBands(lst_night).reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=ward_feature.geometry(),
        scale=1000,
        maxPixels=1e9
    )

    return ee.Feature(None, {
        'ward_id': ward_feature.get('WARD'),  # Fixed: WARD not ward_id
        'date': date,
        'lst_day_c': stats.get('LST_Day_1km'),
        'lst_night_c': stats.get('LST_Night_1km')
    })

In [7]:
def extract_modis_batch(image_collection, wards_fc, extract_func, batch_name):
    """Extract MODIS data for all wards across all images."""

    images = image_collection.toList(image_collection.size())
    n_images = image_collection.size().getInfo()
    wards_list = wards_fc.toList(wards_fc.size())
    n_wards = wards_fc.size().getInfo()

    print(f"Extracting {batch_name}: {n_images} images × {n_wards} wards")

    all_results = []

    # Process in batches of images
    batch_size = 10
    for i in range(0, n_images, batch_size):
        batch_end = min(i + batch_size, n_images)
        print(f"  Processing images {i+1}-{batch_end} of {n_images}...", end=" ")

        batch_features = []
        for img_idx in range(i, batch_end):
            image = ee.Image(images.get(img_idx))

            for ward_idx in range(n_wards):
                ward = ee.Feature(wards_list.get(ward_idx))
                feature = extract_func(image, ward)
                batch_features.append(feature)

        # Get batch results
        batch_fc = ee.FeatureCollection(batch_features)

        try:
            batch_results = batch_fc.getInfo()
            for feat in batch_results['features']:
                all_results.append(feat['properties'])
            print(f"✓ ({len(batch_results['features'])} records)")
        except Exception as e:
            print(f"✗ Error: {e}")

        time.sleep(1)  # Rate limiting

    return pd.DataFrame(all_results)

In [10]:
# Test with first 5 images and first 5 wards
test_vi = modis_vi.limit(5)
test_wards = wards_fc.limit(5)

print("Testing VI extraction...")
test_results = []

images = test_vi.toList(5)
wards = test_wards.toList(5)

for i in range(5):
    image = ee.Image(images.get(i))
    for j in range(5):
        ward = ee.Feature(wards.get(j))
        result = extract_vi_for_ward(image, ward)
        test_results.append(result)

test_fc = ee.FeatureCollection(test_results)
test_df = pd.DataFrame([f['properties'] for f in test_fc.getInfo()['features']])

print(f"Test results: {len(test_df)} rows")
print(test_df.head(10))

Testing VI extraction...
Test results: 25 rows
         date  evi_mean   evi_std  ndvi_mean  ndvi_std ward_id
0  2014-01-01  0.101786  0.037788   0.192013  0.062986      93
1  2014-01-01  0.050275  0.027720   0.114576  0.065240      61
2  2014-01-01  0.080727  0.018712   0.167369  0.038124      86
3  2014-01-01  0.091414  0.025776   0.224127  0.062606      90
4  2014-01-01  0.066318  0.011176   0.154721  0.014236      26
5  2014-01-17  0.092438  0.024776   0.206995  0.043226      93
6  2014-01-17  0.060769  0.013672   0.150419  0.050002      61
7  2014-01-17  0.072477  0.027344   0.175016  0.066419      86
8  2014-01-17  0.078603  0.032562   0.191611  0.077028      90
9  2014-01-17  0.059768  0.009832   0.164793  0.025171      26


In [11]:
# Full MODIS VI extraction
print("=" * 60)
print("EXTRACTING MODIS VEGETATION INDICES (NDVI/EVI)")
print("=" * 60)

images = modis_vi.toList(modis_vi.size())
n_images = modis_vi.size().getInfo()
wards_list = wards_fc.toList(wards_fc.size())
n_wards = wards_fc.size().getInfo()

print(f"Images: {n_images}")
print(f"Wards: {n_wards}")
print(f"Total extractions: {n_images * n_wards:,}")

all_results = []
start_time = time.time()

# Process image by image
for i in range(n_images):
    image = ee.Image(images.get(i))

    # Extract for all wards at once
    batch_features = []
    for j in range(n_wards):
        ward = ee.Feature(wards_list.get(j))
        feature = extract_vi_for_ward(image, ward)
        batch_features.append(feature)

    batch_fc = ee.FeatureCollection(batch_features)

    try:
        batch_results = batch_fc.getInfo()
        for feat in batch_results['features']:
            all_results.append(feat['properties'])

        # Progress update every 10 images
        if (i + 1) % 10 == 0 or i == 0:
            elapsed = time.time() - start_time
            rate = (i + 1) / elapsed * 60  # images per minute
            remaining = (n_images - i - 1) / rate if rate > 0 else 0
            print(f"  Image {i+1}/{n_images} | {len(all_results):,} records | {elapsed/60:.1f}min elapsed | ~{remaining:.0f}min remaining")

    except Exception as e:
        print(f"  Error on image {i+1}: {e}")

    time.sleep(0.5)  # Rate limiting

# Convert to DataFrame
vi_df = pd.DataFrame(all_results)

elapsed_total = time.time() - start_time
print(f"\n{'=' * 60}")
print(f"VI extraction complete!")
print(f"Total records: {len(vi_df):,}")
print(f"Time: {elapsed_total/60:.1f} minutes")

# Save
vi_df.to_csv(MODIS_DIR / "modis_vi_raw.csv", index=False)
print(f"Saved: {MODIS_DIR / 'modis_vi_raw.csv'}")

EXTRACTING MODIS VEGETATION INDICES (NDVI/EVI)
Images: 273
Wards: 141
Total extractions: 38,493
  Image 1/273 | 141 records | 0.0min elapsed | ~7min remaining
  Image 10/273 | 1,410 records | 0.3min elapsed | ~7min remaining
  Image 20/273 | 2,820 records | 0.5min elapsed | ~7min remaining
  Image 30/273 | 4,230 records | 0.8min elapsed | ~6min remaining
  Image 40/273 | 5,640 records | 1.1min elapsed | ~6min remaining
  Image 50/273 | 7,050 records | 1.3min elapsed | ~6min remaining
  Image 60/273 | 8,460 records | 1.6min elapsed | ~6min remaining
  Image 70/273 | 9,870 records | 1.9min elapsed | ~5min remaining
  Image 80/273 | 11,280 records | 2.1min elapsed | ~5min remaining
  Image 90/273 | 12,690 records | 2.4min elapsed | ~5min remaining
  Image 100/273 | 14,100 records | 2.6min elapsed | ~5min remaining
  Image 110/273 | 15,510 records | 2.9min elapsed | ~4min remaining
  Image 120/273 | 16,920 records | 3.2min elapsed | ~4min remaining
  Image 130/273 | 18,330 records | 3.4min

In [12]:
# Full MODIS LST extraction
print("=" * 60)
print("EXTRACTING MODIS LAND SURFACE TEMPERATURE (LST)")
print("=" * 60)

images_lst = modis_lst.toList(modis_lst.size())
n_images_lst = modis_lst.size().getInfo()

print(f"Images: {n_images_lst}")
print(f"Wards: {n_wards}")
print(f"Total extractions: {n_images_lst * n_wards:,}")

all_lst_results = []
start_time = time.time()

for i in range(n_images_lst):
    image = ee.Image(images_lst.get(i))

    batch_features = []
    for j in range(n_wards):
        ward = ee.Feature(wards_list.get(j))
        feature = extract_lst_for_ward(image, ward)
        batch_features.append(feature)

    batch_fc = ee.FeatureCollection(batch_features)

    try:
        batch_results = batch_fc.getInfo()
        for feat in batch_results['features']:
            all_lst_results.append(feat['properties'])

        if (i + 1) % 20 == 0 or i == 0:
            elapsed = time.time() - start_time
            rate = (i + 1) / elapsed * 60
            remaining = (n_images_lst - i - 1) / rate if rate > 0 else 0
            print(f"  Image {i+1}/{n_images_lst} | {len(all_lst_results):,} records | {elapsed/60:.1f}min elapsed | ~{remaining:.0f}min remaining")

    except Exception as e:
        print(f"  Error on image {i+1}: {e}")

    time.sleep(0.5)

lst_df = pd.DataFrame(all_lst_results)

elapsed_total = time.time() - start_time
print(f"\n{'=' * 60}")
print(f"LST extraction complete!")
print(f"Total records: {len(lst_df):,}")
print(f"Time: {elapsed_total/60:.1f} minutes")

lst_df.to_csv(MODIS_DIR / "modis_lst_raw.csv", index=False)
print(f"Saved: {MODIS_DIR / 'modis_lst_raw.csv'}")

EXTRACTING MODIS LAND SURFACE TEMPERATURE (LST)
Images: 546
Wards: 141
Total extractions: 76,986
  Image 1/546 | 141 records | 0.0min elapsed | ~13min remaining
  Image 20/546 | 2,820 records | 0.6min elapsed | ~15min remaining
  Image 40/546 | 5,640 records | 1.1min elapsed | ~14min remaining
  Image 60/546 | 8,460 records | 1.7min elapsed | ~14min remaining
  Image 80/546 | 11,280 records | 2.2min elapsed | ~13min remaining
  Image 100/546 | 14,100 records | 2.7min elapsed | ~12min remaining
  Image 120/546 | 16,920 records | 3.3min elapsed | ~12min remaining
  Image 140/546 | 19,740 records | 3.8min elapsed | ~11min remaining
  Image 160/546 | 22,560 records | 4.4min elapsed | ~11min remaining
  Image 180/546 | 25,380 records | 5.0min elapsed | ~10min remaining
  Image 200/546 | 28,200 records | 5.5min elapsed | ~10min remaining
  Image 220/546 | 31,020 records | 6.1min elapsed | ~9min remaining
  Image 240/546 | 33,840 records | 6.6min elapsed | ~8min remaining
  Image 260/546 | 36

In [13]:
# Load raw MODIS data
vi_df = pd.read_csv(MODIS_DIR / "modis_vi_raw.csv")
lst_df = pd.read_csv(MODIS_DIR / "modis_lst_raw.csv")

print("MODIS VI (Vegetation Indices):")
print(f"  Shape: {vi_df.shape}")
print(f"  Date range: {vi_df['date'].min()} to {vi_df['date'].max()}")
print(f"  Columns: {list(vi_df.columns)}")

print("\nMODIS LST (Land Surface Temperature):")
print(f"  Shape: {lst_df.shape}")
print(f"  Date range: {lst_df['date'].min()} to {lst_df['date'].max()}")
print(f"  Columns: {list(lst_df.columns)}")

# Check for nulls
print(f"\nVI nulls:\n{vi_df.isnull().sum()}")
print(f"\nLST nulls:\n{lst_df.isnull().sum()}")

MODIS VI (Vegetation Indices):
  Shape: (38493, 6)
  Date range: 2014-01-01 to 2025-11-01
  Columns: ['date', 'evi_mean', 'evi_std', 'ndvi_mean', 'ndvi_std', 'ward_id']

MODIS LST (Land Surface Temperature):
  Shape: (76986, 4)
  Date range: 2014-01-01 to 2025-11-09
  Columns: ['date', 'lst_day_c', 'lst_night_c', 'ward_id']

VI nulls:
date         0
evi_mean     0
evi_std      0
ndvi_mean    0
ndvi_std     0
ward_id      0
dtype: int64

LST nulls:
date               0
lst_day_c      18593
lst_night_c    17920
ward_id            0
dtype: int64


In [14]:
# Convert dates
vi_df['date'] = pd.to_datetime(vi_df['date'])
lst_df['date'] = pd.to_datetime(lst_df['date'])

# LST is 8-day, VI is 16-day - need to align
# Strategy: For each VI date, find closest LST observation

print("Merging VI and LST data...")

# First, let's see the temporal mismatch
print(f"VI unique dates: {vi_df['date'].nunique()}")
print(f"LST unique dates: {lst_df['date'].nunique()}")

# Merge using merge_asof (nearest date match)
vi_df = vi_df.sort_values(['ward_id', 'date'])
lst_df = lst_df.sort_values(['ward_id', 'date'])

# For each ward, merge LST to nearest VI date
modis_merged = []

for ward in vi_df['ward_id'].unique():
    vi_ward = vi_df[vi_df['ward_id'] == ward].copy()
    lst_ward = lst_df[lst_df['ward_id'] == ward].copy()

    merged = pd.merge_asof(
        vi_ward,
        lst_ward[['date', 'lst_day_c', 'lst_night_c']],
        on='date',
        direction='nearest',
        tolerance=pd.Timedelta('8 days')
    )
    modis_merged.append(merged)

modis_df = pd.concat(modis_merged, ignore_index=True)

print(f"Merged shape: {modis_df.shape}")
print(f"Columns: {list(modis_df.columns)}")
modis_df.head()

Merging VI and LST data...
VI unique dates: 273
LST unique dates: 546
Merged shape: (38493, 8)
Columns: ['date', 'evi_mean', 'evi_std', 'ndvi_mean', 'ndvi_std', 'ward_id', 'lst_day_c', 'lst_night_c']


Unnamed: 0,date,evi_mean,evi_std,ndvi_mean,ndvi_std,ward_id,lst_day_c,lst_night_c
0,2014-01-01,0.091977,0.038423,0.206405,0.088525,1,23.862126,16.226782
1,2014-01-17,0.066463,0.04134,0.154803,0.08844,1,23.918448,17.123448
2,2014-02-02,0.087158,0.031568,0.190124,0.068027,1,27.872069,18.661954
3,2014-02-18,0.088154,0.042096,0.227033,0.064462,1,27.079425,19.202299
4,2014-03-06,0.099543,0.038788,0.240397,0.101015,1,32.029598,23.119023


In [15]:
# Create derived features
print("Engineering MODIS features...")

modis_df = modis_df.sort_values(['ward_id', 'date'])

# 1. Temporal lags (previous observation)
for col in ['ndvi_mean', 'evi_mean', 'lst_day_c']:
    modis_df[f'{col}_prev'] = modis_df.groupby('ward_id')[col].shift(1)
    modis_df[f'{col}_change'] = modis_df[col] - modis_df[f'{col}_prev']

# 2. LST day-night difference (urban heat indicator)
modis_df['lst_diurnal_range'] = modis_df['lst_day_c'] - modis_df['lst_night_c']

# 3. Vegetation anomaly (deviation from ward's historical mean)
ward_means = modis_df.groupby('ward_id')[['ndvi_mean', 'evi_mean']].transform('mean')
modis_df['ndvi_anomaly'] = modis_df['ndvi_mean'] - ward_means['ndvi_mean']
modis_df['evi_anomaly'] = modis_df['evi_mean'] - ward_means['evi_mean']

# 4. Seasonal indicators
modis_df['month'] = modis_df['date'].dt.month
modis_df['is_monsoon'] = modis_df['month'].isin([6, 7, 8, 9]).astype(int)

print(f"Features created: {list(modis_df.columns)}")
print(f"Shape: {modis_df.shape}")

Engineering MODIS features...
Features created: ['date', 'evi_mean', 'evi_std', 'ndvi_mean', 'ndvi_std', 'ward_id', 'lst_day_c', 'lst_night_c', 'ndvi_mean_prev', 'ndvi_mean_change', 'evi_mean_prev', 'evi_mean_change', 'lst_day_c_prev', 'lst_day_c_change', 'lst_diurnal_range', 'ndvi_anomaly', 'evi_anomaly', 'month', 'is_monsoon']
Shape: (38493, 19)


In [16]:
# MODIS is 16-day composite, but our model needs daily features
# Strategy: Forward-fill to create daily values

print("Interpolating to daily frequency...")

daily_modis = []

for ward in modis_df['ward_id'].unique():
    ward_data = modis_df[modis_df['ward_id'] == ward].set_index('date')

    # Create daily date range
    daily_range = pd.date_range(start=START_DATE, end=END_DATE, freq='D')

    # Reindex to daily and forward-fill
    ward_daily = ward_data.reindex(daily_range).ffill()
    ward_daily['ward_id'] = ward
    ward_daily['date'] = ward_daily.index

    daily_modis.append(ward_daily.reset_index(drop=True))

modis_daily = pd.concat(daily_modis, ignore_index=True)

print(f"Daily MODIS shape: {modis_daily.shape}")
print(f"Date range: {modis_daily['date'].min()} to {modis_daily['date'].max()}")
print(f"Expected rows: {141 * 3981} (141 wards × ~3981 days)")

Interpolating to daily frequency...
Daily MODIS shape: (612786, 19)
Date range: 2014-01-01 00:00:00 to 2025-11-24 00:00:00
Expected rows: 561321 (141 wards × ~3981 days)


In [17]:
# Select final feature columns
modis_feature_cols = [
    'date', 'ward_id',
    'ndvi_mean', 'ndvi_std', 'ndvi_anomaly', 'ndvi_mean_change',
    'evi_mean', 'evi_std', 'evi_anomaly', 'evi_mean_change',
    'lst_day_c', 'lst_night_c', 'lst_diurnal_range', 'lst_day_c_change',
    'is_monsoon'
]

# Keep only columns that exist
modis_feature_cols = [c for c in modis_feature_cols if c in modis_daily.columns]

modis_features = modis_daily[modis_feature_cols].copy()

# Save
modis_features.to_csv(PROCESSED_DIR / "modis_features_daily.csv", index=False)

print(f"Saved: {PROCESSED_DIR / 'modis_features_daily.csv'}")
print(f"Shape: {modis_features.shape}")
print(f"Columns: {list(modis_features.columns)}")

# Summary stats
print("\n--- Feature Summary ---")
for col in modis_features.columns:
    if col not in ['date', 'ward_id']:
        print(f"{col}: mean={modis_features[col].mean():.3f}, std={modis_features[col].std():.3f}")

Saved: ../data/processed/modis_features_daily.csv
Shape: (612786, 15)
Columns: ['date', 'ward_id', 'ndvi_mean', 'ndvi_std', 'ndvi_anomaly', 'ndvi_mean_change', 'evi_mean', 'evi_std', 'evi_anomaly', 'evi_mean_change', 'lst_day_c', 'lst_night_c', 'lst_diurnal_range', 'lst_day_c_change', 'is_monsoon']

--- Feature Summary ---
ndvi_mean: mean=0.292, std=0.091
ndvi_std: mean=0.062, std=0.036
ndvi_anomaly: mean=0.000, std=0.067
ndvi_mean_change: mean=0.000, std=0.074
evi_mean: mean=0.158, std=0.056
evi_std: mean=0.040, std=0.025
evi_anomaly: mean=0.000, std=0.039
evi_mean_change: mean=0.000, std=0.040
lst_day_c: mean=31.437, std=4.583
lst_night_c: mean=24.116, std=3.623
lst_diurnal_range: mean=7.484, std=2.605
lst_day_c_change: mean=-0.165, std=2.923
is_monsoon: mean=0.353, std=0.478


In [18]:
# Load MODIS features
modis_features = pd.read_csv(PROCESSED_DIR / "modis_features_daily.csv")
modis_features['date'] = pd.to_datetime(modis_features['date'])

print(f"MODIS features: {modis_features.shape}")

# Load ward-level training data
train_ward = pd.read_csv(PROCESSED_DIR / "ward_level_training_data.csv")
train_ward['date'] = pd.to_datetime(train_ward['date'])

print(f"Training data: {train_ward.shape}")

# Merge MODIS features
train_with_modis = train_ward.merge(
    modis_features,
    on=['date', 'ward_id'],
    how='left'
)

print(f"After MODIS merge: {train_with_modis.shape}")
print(f"New columns: {[c for c in train_with_modis.columns if c not in train_ward.columns]}")

MODIS features: (612786, 15)
Training data: (38916, 122)
After MODIS merge: (38916, 135)
New columns: ['ndvi_mean', 'ndvi_std', 'ndvi_anomaly', 'ndvi_mean_change', 'evi_mean', 'evi_std', 'evi_anomaly', 'evi_mean_change', 'lst_day_c', 'lst_night_c', 'lst_diurnal_range', 'lst_day_c_change', 'is_monsoon']


In [19]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, classification_report

# Prepare features (same exclusions as before, plus MODIS)
exclude_cols = [
    'date', 'flooded', 'event_type', 'source', 'notes',
    'ward_name', 'ward_name_lc', 'ward_name_soil', 'named_canals',
    'primary_drainage_type', 'soil_type', 'boundary_position', 'WARD'
]

feature_cols_modis = []
for c in train_with_modis.columns:
    if c in exclude_cols or c == 'ward_id':
        continue
    if train_with_modis[c].dtype in ['object', 'datetime64[ns]']:
        continue
    feature_cols_modis.append(c)

print(f"Total features: {len(feature_cols_modis)}")

# Prepare X and y
X_modis = train_with_modis[feature_cols_modis].copy()
y_modis = train_with_modis['flooded'].copy()

# Temporal split
train_mask = train_with_modis['date'] < '2025-01-01'
test_mask = train_with_modis['date'] >= '2025-01-01'

X_train_m = X_modis[train_mask]
X_test_m = X_modis[test_mask]
y_train_m = y_modis[train_mask]
y_test_m = y_modis[test_mask]

print(f"Train: {len(X_train_m):,} | Test: {len(X_test_m):,}")

# Train
model_modis = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)

model_modis.fit(X_train_m, y_train_m)

y_pred_m = model_modis.predict(X_test_m)
y_prob_m = model_modis.predict_proba(X_test_m)[:, 1]

print("Training complete!")

Total features: 121
Train: 36,801 | Test: 2,115
Training complete!


In [20]:
# Metrics
f1_m = f1_score(y_test_m, y_pred_m)
precision_m = precision_score(y_test_m, y_pred_m)
recall_m = recall_score(y_test_m, y_pred_m)
auc_m = roc_auc_score(y_test_m, y_prob_m)

print("=" * 60)
print("MODEL COMPARISON: Physical vs Physical + MODIS")
print("=" * 60)
print(f"{'Metric':<12} {'Physical Only':<15} {'+ MODIS':<15} {'Change':<10}")
print("-" * 52)
print(f"{'F1-Score':<12} {0.869:<15.3f} {f1_m:<15.3f} {f1_m - 0.869:+.3f}")
print(f"{'Precision':<12} {0.965:<15.3f} {precision_m:<15.3f} {precision_m - 0.965:+.3f}")
print(f"{'Recall':<12} {0.790:<15.3f} {recall_m:<15.3f} {recall_m - 0.790:+.3f}")
print(f"{'AUC-ROC':<12} {0.908:<15.3f} {auc_m:<15.3f} {auc_m - 0.908:+.3f}")

# Feature importance - did MODIS features matter?
importance_m = pd.DataFrame({
    'feature': feature_cols_modis,
    'importance': model_modis.feature_importances_
}).sort_values('importance', ascending=False)

print("\n--- Top 20 Features (with MODIS) ---")
print(importance_m.head(20).to_string(index=False))

# MODIS contribution
modis_cols = ['ndvi_mean', 'ndvi_std', 'ndvi_anomaly', 'ndvi_mean_change',
              'evi_mean', 'evi_std', 'evi_anomaly', 'evi_mean_change',
              'lst_day_c', 'lst_night_c', 'lst_diurnal_range', 'lst_day_c_change',
              'is_monsoon']
modis_importance = importance_m[importance_m['feature'].isin(modis_cols)]['importance'].sum()

print(f"\n--- MODIS Contribution ---")
print(f"MODIS features importance: {modis_importance:.1%}")

MODEL COMPARISON: Physical vs Physical + MODIS
Metric       Physical Only   + MODIS         Change    
----------------------------------------------------
F1-Score     0.869           0.869           -0.000
Precision    0.965           0.965           -0.000
Recall       0.790           0.790           +0.000
AUC-ROC      0.908           0.908           -0.000

--- Top 20 Features (with MODIS) ---
                       feature  importance
               has_named_canal    0.090105
         canal_length_major_km    0.080950
     building_size_variability    0.065690
                      clay_std    0.053305
             canal_count_major    0.049618
                 ward_area_km2    0.045759
    neighbor_avg_canal_density    0.040017
                      silt_pct    0.038410
               mean_confidence    0.038391
       imperviousness_estimate    0.033698
           neighbor_avg_runoff    0.029916
                neighbor_count    0.024656
         building_coverage_pct    0.023