In [1]:
# ============================================================
# TRAINING LABELS NOTEBOOK - Cell 1: Setup
# ============================================================

import pandas as pd
import geopandas as gpd
from pathlib import Path

print("=" * 60)
print("TRAINING LABELS: NEWS-TO-WARD MAPPING")
print("=" * 60)

# Load ward boundaries for reference
wards_gdf = gpd.read_file('../data/wards/kmc_wards_gee_ready.geojson')
wards_gdf['WARD'] = wards_gdf['WARD'].str.strip()
print(f"‚úÖ Loaded {len(wards_gdf)} wards")

# ============================================================
# LOCALITY TO WARD MAPPING (Based on KMC Borough structure)
# ============================================================

# This mapping is based on KMC's 16 borough structure
# Multiple wards can share a locality name

LOCALITY_TO_WARDS = {
    # VERY HIGH FREQUENCY HOTSPOTS (8+ events)
    'Behala': ['120', '121', '122', '123', '124', '125', '126', '127', '128', '129'],
    'Jodhpur Park': ['86', '87'],
    'Garia': ['135', '136', '137', '138'],
    'Kamdahari': ['137', '138'],
    'College Street': ['44', '45', '46'],
    'Kalighat': ['82', '83', '84'],

    # HIGH FREQUENCY (5-7 events)
    'Ballygunge': ['63', '64', '65', '66', '67'],
    'Central Avenue': ['43', '44', '48'],
    'Tollygunge': ['99', '100', '101', '102', '103'],
    'Jadavpur': ['95', '96', '97', '98'],
    'Thanthania': ['47', '48'],
    'Salt Lake': ['106', '107', '108'],  # Note: partially outside KMC
    'New Alipore': ['117', '118', '119'],
    'Lake Gardens': ['88', '89', '90'],
    'Ultadanga': ['33', '34'],

    # MODERATE FREQUENCY
    'Shyambazar': ['6', '7', '8', '9'],
    'Lake Town': ['35', '36'],
    'Esplanade': ['49', '50', '51'],
    'Park Street': ['61', '62'],
    'EM Bypass': ['105', '106', '107', '108', '109'],
    'Kasba': ['92', '93', '94'],
    'New Town': [],  # Outside KMC
    'Dum Dum': ['1', '2', '3', '4', '5'],
    'Maniktala': ['27', '28', '29'],
    'Topsia': ['68', '69', '70'],
    'Tangra': ['57', '58', '59'],
    'Park Circus': ['71', '72', '73'],
    'Bhawanipur': ['74', '75', '76'],
    'Hazra': ['77', '78', '79'],
    'New Market': ['52', '53'],
    'Dharmatala': ['49', '50'],
    'VIP Road': ['5', '6', '35'],
    'Garden Reach': ['130', '131', '132', '133', '134'],
    'Cossipur': ['10', '11', '12'],
    'Regent Park': ['91'],
    'Beliaghata': ['37', '38', '39', '40'],

    # 2019 SPECIFIC WARDS (from your document)
    'Ward_111': ['111'],
    'Ward_114': ['114'],
    'Ward_109': ['109'],
    'Ward_122': ['122'],
    'Ward_127': ['127'],
}

# Reverse mapping for lookup
WARD_TO_LOCALITIES = {}
for locality, wards in LOCALITY_TO_WARDS.items():
    for ward in wards:
        if ward not in WARD_TO_LOCALITIES:
            WARD_TO_LOCALITIES[ward] = []
        WARD_TO_LOCALITIES[ward].append(locality)

print(f"‚úÖ Mapped {len(LOCALITY_TO_WARDS)} localities to wards")

TRAINING LABELS: NEWS-TO-WARD MAPPING
‚úÖ Loaded 141 wards
‚úÖ Mapped 41 localities to wards


In [2]:
# ============================================================
# Cell 2: FLOOD EVENTS DATABASE
# ============================================================

# Structured flood events from your document
FLOOD_EVENTS = [
    # 2015: Cyclone Komen
    {
        'event_id': '2015_KOMEN',
        'start_date': '2015-07-26',
        'end_date': '2015-08-02',
        'event_type': 'cyclone',
        'max_rainfall_mm': 87,
        'localities': ['Beliaghata', 'Central Avenue'],
        'confidence': 0.7,  # Tier 2: News geocoded
        'source': 'Cyclone Komen reports',
        'notes': 'Cholera outbreak followed'
    },

    # 2017: Major July Floods
    {
        'event_id': '2017_JULY_FLOOD',
        'start_date': '2017-07-20',
        'end_date': '2017-07-26',
        'event_type': 'monsoon',
        'max_rainfall_mm': 142,  # 48-hour peak
        'localities': ['Ultadanga', 'Central Avenue', 'Shyambazar', 'Garia',
                      'Esplanade', 'Lake Town', 'Behala', 'College Street', 'EM Bypass'],
        'confidence': 0.8,  # Tier 2: Multiple news sources
        'source': 'Government reports, news',
        'notes': '70+ wards inundated, 50+ deaths, ‚Çπ553 crore damage'
    },

    # 2018: June monsoon
    {
        'event_id': '2018_JUNE',
        'start_date': '2018-06-12',
        'end_date': '2018-06-12',
        'event_type': 'monsoon',
        'max_rainfall_mm': 100,  # Estimated from 419mm June total
        'localities': ['Bhawanipur', 'Central Avenue'],
        'confidence': 0.6,  # Tier 3: Area inference
        'source': 'News reports',
        'notes': 'High tide coincidence'
    },

    # 2019: August waterlogging (SPECIFIC WARDS!)
    {
        'event_id': '2019_AUG',
        'start_date': '2019-08-16',
        'end_date': '2019-08-17',
        'event_type': 'monsoon',
        'max_rainfall_mm': 80,  # Estimated
        'localities': ['Ward_111', 'Ward_114', 'Ward_109', 'Ward_122', 'Ward_127'],
        'confidence': 1.0,  # Tier 1: Ward explicit!
        'source': 'KMC records',
        'notes': 'Churial Canal undredged due to contractor insolvency'
    },

    # 2019: Cyclone Bulbul
    {
        'event_id': '2019_BULBUL',
        'start_date': '2019-11-09',
        'end_date': '2019-11-10',
        'event_type': 'cyclone',
        'max_rainfall_mm': 166,
        'localities': ['Ballygunge'],  # Tree fall fatality location
        'confidence': 0.7,
        'source': 'Cyclone reports',
        'notes': 'Wind damage primary, 1 death in Kolkata'
    },

    # 2020: CYCLONE AMPHAN (MAJOR)
    {
        'event_id': '2020_AMPHAN',
        'start_date': '2020-05-20',
        'end_date': '2020-05-21',
        'event_type': 'cyclone',
        'max_rainfall_mm': 236,
        'localities': ['Behala', 'College Street', 'New Alipore', 'Salt Lake',
                      'Ultadanga', 'Central Avenue', 'Shyambazar', 'Garia',
                      'Esplanade', 'Lake Town', 'Regent Park', 'Ballygunge'],
        'confidence': 0.9,  # Tier 1-2: Extensive documentation
        'source': 'Government reports, satellite imagery',
        'notes': 'Costliest cyclone in N Indian Ocean, 6.6 sq.km inundated, 19 deaths in Kolkata'
    },

    # 2021: Cyclone Yaas
    {
        'event_id': '2021_YAAS',
        'start_date': '2021-05-26',
        'end_date': '2021-05-27',
        'event_type': 'cyclone',
        'max_rainfall_mm': 90,  # Estimated
        'localities': ['Behala', 'Garden Reach'],  # Coastal areas
        'confidence': 0.6,
        'source': 'Cyclone reports',
        'notes': 'Landfall in Odisha, less impact on Kolkata proper'
    },

    # 2021: September monsoon
    {
        'event_id': '2021_SEP',
        'start_date': '2021-09-20',
        'end_date': '2021-09-21',
        'event_type': 'monsoon',
        'max_rainfall_mm': 142,
        'localities': ['Salt Lake', 'College Street', 'Central Avenue', 'Kalighat'],
        'confidence': 0.8,
        'source': 'IMD data, news',
        'notes': 'Heaviest September rain since 2007, 24-30h waterlogging'
    },

    # 2023: August flooding
    {
        'event_id': '2023_AUG',
        'start_date': '2023-08-15',
        'end_date': '2023-08-17',
        'event_type': 'monsoon',
        'max_rainfall_mm': 191,
        'localities': ['Tollygunge', 'Behala', 'Jadavpur', 'Garia'],
        'confidence': 0.7,
        'source': 'News reports',
        'notes': 'South/southwest Kolkata most affected, 48h+ waterlogging'
    },

    # 2024: Cyclone Remal
    {
        'event_id': '2024_REMAL',
        'start_date': '2024-05-26',
        'end_date': '2024-05-27',
        'event_type': 'cyclone',
        'max_rainfall_mm': 260,
        'localities': ['Park Street', 'Esplanade'],  # Metro stations flooded
        'confidence': 0.8,
        'source': 'News, metro reports',
        'notes': 'Metro tunnels flooded, 294 trees uprooted'
    },

    # 2024: Cyclone Dana aftermath
    {
        'event_id': '2024_DANA',
        'start_date': '2024-10-25',
        'end_date': '2024-10-26',
        'event_type': 'cyclone',
        'max_rainfall_mm': 190,  # Jodhpur Park
        'localities': ['Bhawanipur', 'New Market', 'Hazra', 'Dharmatala', 'Behala',
                      'Jodhpur Park', 'Lake Gardens', 'Kalighat', 'Thanthania',
                      'VIP Road', 'Park Circus', 'Dum Dum'],
        'confidence': 0.8,
        'source': 'News reports',
        'notes': '2 electrocution deaths, knee-deep flooding'
    },

    # 2025: July event
    {
        'event_id': '2025_JULY',
        'start_date': '2025-07-08',
        'end_date': '2025-07-08',
        'event_type': 'monsoon',
        'max_rainfall_mm': 195,  # Jodhpur Park
        'localities': ['Jodhpur Park', 'Dum Dum', 'Salt Lake', 'Maniktala',
                      'Thanthania', 'Central Avenue', 'College Street', 'Kasba',
                      'Tollygunge', 'Garia', 'Jadavpur', 'Behala', 'EM Bypass'],
        'confidence': 0.8,
        'source': 'IMD, news',
        'notes': 'Orange alert for 5 south Bengal districts'
    },

    # 2025: SEPTEMBER CLOUDBURST (MOST SEVERE NON-CYCLONE)
    {
        'event_id': '2025_SEP_CLOUDBURST',
        'start_date': '2025-09-23',
        'end_date': '2025-09-25',
        'event_type': 'cloudburst',
        'max_rainfall_mm': 332,  # Garia Kamdahari
        'localities': ['Garia', 'Kamdahari', 'Jodhpur Park', 'Kalighat', 'Topsia',
                      'Ballygunge', 'Jadavpur', 'Behala', 'Tollygunge'],
        'confidence': 0.9,
        'source': 'IMD, news, fatality reports',
        'notes': '6th highest rainfall in 137 years, 11-12 deaths, Ballygunge pump failure'
    },
]

print(f"‚úÖ Defined {len(FLOOD_EVENTS)} flood events")

‚úÖ Defined 13 flood events


In [3]:
# ============================================================
# Cell 3: GENERATE WARD-LEVEL LABELS
# ============================================================

def expand_event_to_wards(event, locality_mapping):
    """Expand a flood event to individual ward-date labels"""
    labels = []

    # Get all affected wards
    affected_wards = set()
    for locality in event['localities']:
        if locality in locality_mapping:
            affected_wards.update(locality_mapping[locality])

    # Generate date range
    start = pd.to_datetime(event['start_date'])
    end = pd.to_datetime(event['end_date'])
    dates = pd.date_range(start, end, freq='D')

    # Create labels for each ward-date combination
    for date in dates:
        for ward in affected_wards:
            labels.append({
                'date': date.strftime('%Y-%m-%d'),
                'ward_id': ward,
                'flooded': 1,
                'event_id': event['event_id'],
                'event_type': event['event_type'],
                'confidence': event['confidence'],
                'max_rainfall_mm': event['max_rainfall_mm'],
                'source': event['source'],
                'notes': event['notes']
            })

    return labels

# Generate all labels
all_labels = []
for event in FLOOD_EVENTS:
    event_labels = expand_event_to_wards(event, LOCALITY_TO_WARDS)
    all_labels.extend(event_labels)
    print(f"  {event['event_id']}: {len(event_labels)} ward-date labels")

df_flood_labels = pd.DataFrame(all_labels)

print(f"\n‚úÖ FLOOD LABELS GENERATED")
print(f"   Total positive labels: {len(df_flood_labels)}")
print(f"   Unique dates: {df_flood_labels['date'].nunique()}")
print(f"   Unique wards: {df_flood_labels['ward_id'].nunique()}")

# Summary by event
print("\nüìä LABELS BY EVENT:")
print(df_flood_labels.groupby('event_id').size().sort_values(ascending=False))

  2015_KOMEN: 56 ward-date labels
  2017_JULY_FLOOD: 245 ward-date labels
  2018_JUNE: 6 ward-date labels
  2019_AUG: 10 ward-date labels
  2019_BULBUL: 10 ward-date labels
  2020_AMPHAN: 84 ward-date labels
  2021_YAAS: 30 ward-date labels
  2021_SEP: 22 ward-date labels
  2023_AUG: 69 ward-date labels
  2024_REMAL: 10 ward-date labels
  2024_DANA: 80 ward-date labels
  2025_JULY: 47 ward-date labels
  2025_SEP_CLOUDBURST: 108 ward-date labels

‚úÖ FLOOD LABELS GENERATED
   Total positive labels: 777
   Unique dates: 37
   Unique wards: 100

üìä LABELS BY EVENT:
event_id
2017_JULY_FLOOD        245
2025_SEP_CLOUDBURST    108
2020_AMPHAN             84
2024_DANA               80
2023_AUG                69
2015_KOMEN              56
2025_JULY               47
2021_YAAS               30
2021_SEP                22
2019_AUG                10
2019_BULBUL             10
2024_REMAL              10
2018_JUNE                6
dtype: int64


In [4]:
# ============================================================
# Cell 4: ADD NEGATIVE LABELS (Non-flood days)
# ============================================================

# Get all ward IDs
all_wards = [str(i) for i in range(1, 142)]

# Get all dates from precipitation features
precip_dates = pd.read_csv(
    '../data/processed/precipitation_features_2014_2025.csv',
    usecols=['date']
)['date'].unique()

print(f"Total dates available: {len(precip_dates)}")

# Identify flood dates
flood_dates = set(df_flood_labels['date'].unique())
print(f"Flood dates: {len(flood_dates)}")

# Sample negative labels (dry season + low-rainfall monsoon days)
# Strategy: 3:1 ratio of negatives to positives for balanced training

import numpy as np
np.random.seed(42)

# Get dates with low rainfall (< 20mm) that aren't flood events
precip_df = pd.read_csv('../data/processed/precipitation_features_2014_2025.csv')
precip_df['date'] = pd.to_datetime(precip_df['date']).dt.strftime('%Y-%m-%d')

# City-wide daily average
daily_avg = precip_df.groupby('date')['rain_d0'].mean().reset_index()
low_rain_dates = daily_avg[daily_avg['rain_d0'] < 20]['date'].values

# Exclude flood dates
safe_dates = [d for d in low_rain_dates if d not in flood_dates]
print(f"Safe non-flood dates (< 20mm rain): {len(safe_dates)}")

# Sample negative labels
n_positives = len(df_flood_labels)
n_negatives_needed = n_positives * 3  # 3:1 ratio

# Sample dates and wards
negative_labels = []
dates_per_sample = n_negatives_needed // len(all_wards) + 1
sampled_dates = np.random.choice(safe_dates, size=min(dates_per_sample, len(safe_dates)), replace=False)

for date in sampled_dates:
    for ward in all_wards:
        negative_labels.append({
            'date': date,
            'ward_id': ward,
            'flooded': 0,
            'event_id': 'NO_FLOOD',
            'event_type': 'none',
            'confidence': 0.9,  # High confidence for dry days
            'max_rainfall_mm': 0,
            'source': 'GPM verification',
            'notes': 'Low rainfall day'
        })

df_negative = pd.DataFrame(negative_labels)

# Combine
df_all_labels = pd.concat([df_flood_labels, df_negative], ignore_index=True)

print(f"\n‚úÖ COMPLETE TRAINING LABELS")
print(f"   Positive (flooded): {len(df_flood_labels)}")
print(f"   Negative (no flood): {len(df_negative)}")
print(f"   Total: {len(df_all_labels)}")
print(f"   Class ratio: 1:{len(df_negative)//len(df_flood_labels)}")

Total dates available: 4346
Flood dates: 37
Safe non-flood dates (< 20mm rain): 3711

‚úÖ COMPLETE TRAINING LABELS
   Positive (flooded): 777
   Negative (no flood): 2397
   Total: 3174
   Class ratio: 1:3


In [5]:
# ============================================================
# Cell 5: SAVE AND SUMMARIZE
# ============================================================

# Save labels
OUTPUT_PATH = Path('../data/processed/training_labels_v1.csv')
df_all_labels.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ Saved: {OUTPUT_PATH}")

# Also save just flood events for reference
df_flood_labels.to_csv('../data/processed/flood_events_positive_labels.csv', index=False)
print(f"‚úÖ Saved: flood_events_positive_labels.csv")

# Summary statistics
print("\n" + "=" * 60)
print("üìä TRAINING LABELS SUMMARY")
print("=" * 60)

print("\nüéØ BY CONFIDENCE TIER:")
print(df_flood_labels.groupby('confidence').size())

print("\nüéØ BY EVENT TYPE:")
print(df_flood_labels.groupby('event_type').size())

print("\nüéØ TOP 10 MOST FLOODED WARDS:")
ward_counts = df_flood_labels['ward_id'].value_counts().head(10)
print(ward_counts)

print("\nüéØ TEMPORAL DISTRIBUTION:")
df_flood_labels['year'] = pd.to_datetime(df_flood_labels['date']).dt.year
print(df_flood_labels.groupby('year').size())

‚úÖ Saved: ../data/processed/training_labels_v1.csv
‚úÖ Saved: flood_events_positive_labels.csv

üìä TRAINING LABELS SUMMARY

üéØ BY CONFIDENCE TIER:
confidence
0.6     36
0.7    135
0.8    404
0.9    192
1.0     10
dtype: int64

üéØ BY EVENT TYPE:
event_type
cloudburst    108
cyclone       270
monsoon       399
dtype: int64

üéØ TOP 10 MOST FLOODED WARDS:
ward_id
48     23
122    22
127    22
43     21
44     21
120    20
129    20
123    20
124    20
128    20
Name: count, dtype: int64

üéØ TEMPORAL DISTRIBUTION:
year
2015     56
2017    245
2018      6
2019     20
2020     84
2021     52
2023     69
2024     90
2025    155
dtype: int64


In [6]:
# ============================================================
# Cell 6: PHYSICS-BASED LABEL EXPANSION
# ============================================================

# Load precipitation features
precip_df = pd.read_csv('../data/processed/precipitation_features_2014_2025.csv')
precip_df['date'] = pd.to_datetime(precip_df['date'])

print("=" * 60)
print("PHYSICS-BASED LABEL EXPANSION")
print("=" * 60)

# KMC Drainage Capacity Threshold
DRAINAGE_CAPACITY_MM = 150  # 6mm/hour √ó 24 hours

# CHRONIC HOTSPOTS (from your document - flood in 5+ events)
CHRONIC_WARDS = {
    # Very High Frequency (8+ events)
    'Behala': ['120', '121', '122', '123', '124', '125', '126', '127', '128', '129'],
    'Jodhpur Park': ['86', '87'],
    'Garia': ['135', '136', '137', '138'],
    'College Street': ['44', '45', '46'],
    'Kalighat': ['82', '83', '84'],

    # High Frequency (5-7 events)
    'Ballygunge': ['63', '64', '65', '66', '67'],
    'Central Avenue': ['43', '44', '48'],
    'Tollygunge': ['99', '100', '101', '102', '103'],
    'Jadavpur': ['95', '96', '97', '98'],
    'Thanthania': ['47', '48'],
    'Salt Lake': ['106', '107', '108'],
    'New Alipore': ['117', '118', '119'],
    'Lake Gardens': ['88', '89', '90'],
    'Ultadanga': ['33', '34'],
    'Tangra-Topsia': ['57', '58', '59', '68', '69', '70'],
}

# Flatten to set of chronic ward IDs
chronic_ward_ids = set()
for wards in CHRONIC_WARDS.values():
    chronic_ward_ids.update(wards)

print(f"Chronic hotspot wards: {len(chronic_ward_ids)}")

# ============================================================
# RULE 1: Heavy rain (>150mm) + Chronic ward = FLOODED
# ============================================================

# Find days where city average exceeded drainage capacity
daily_city_avg = precip_df.groupby('date')['rain_d0'].mean().reset_index()
heavy_rain_dates = daily_city_avg[daily_city_avg['rain_d0'] >= DRAINAGE_CAPACITY_MM]['date']

print(f"\nDays exceeding {DRAINAGE_CAPACITY_MM}mm threshold: {len(heavy_rain_dates)}")

# Generate labels for chronic wards on heavy rain days
physics_labels_heavy = []
for date in heavy_rain_dates:
    date_str = date.strftime('%Y-%m-%d')
    for ward in chronic_ward_ids:
        physics_labels_heavy.append({
            'date': date_str,
            'ward_id': ward,
            'flooded': 1,
            'event_id': f'PHYSICS_HEAVY_{date_str}',
            'event_type': 'physics_inferred',
            'confidence': 0.7,  # High confidence - physics threshold
            'max_rainfall_mm': daily_city_avg[daily_city_avg['date'] == date]['rain_d0'].values[0],
            'source': 'Physics threshold + chronic hotspot',
            'notes': f'Rain exceeded {DRAINAGE_CAPACITY_MM}mm drainage capacity'
        })

print(f"Rule 1 (Heavy + Chronic): {len(physics_labels_heavy)} labels")

# ============================================================
# RULE 2: Moderate rain (>100mm) + Very High Frequency wards = FLOODED
# ============================================================

VERY_HIGH_FREQ_WARDS = set()
for locality in ['Behala', 'Jodhpur Park', 'Garia', 'College Street', 'Kalighat']:
    VERY_HIGH_FREQ_WARDS.update(CHRONIC_WARDS[locality])

moderate_rain_dates = daily_city_avg[
    (daily_city_avg['rain_d0'] >= 100) &
    (daily_city_avg['rain_d0'] < DRAINAGE_CAPACITY_MM)
]['date']

print(f"Days with 100-150mm: {len(moderate_rain_dates)}")

physics_labels_moderate = []
for date in moderate_rain_dates:
    date_str = date.strftime('%Y-%m-%d')
    for ward in VERY_HIGH_FREQ_WARDS:
        physics_labels_moderate.append({
            'date': date_str,
            'ward_id': ward,
            'flooded': 1,
            'event_id': f'PHYSICS_MODERATE_{date_str}',
            'event_type': 'physics_inferred',
            'confidence': 0.5,  # Lower confidence
            'max_rainfall_mm': daily_city_avg[daily_city_avg['date'] == date]['rain_d0'].values[0],
            'source': 'Moderate rain + very high frequency hotspot',
            'notes': 'Rain 100-150mm in historically vulnerable ward'
        })

print(f"Rule 2 (Moderate + VeryHigh): {len(physics_labels_moderate)} labels")

# ============================================================
# RULE 3: 7-day accumulation > 400mm = Saturated ground flooding
# ============================================================

# Check 7-day accumulation
daily_7day = precip_df.groupby('date')['rain_7day'].mean().reset_index()
saturated_dates = daily_7day[daily_7day['rain_7day'] >= 400]['date']

print(f"Days with 7-day accumulation > 400mm: {len(saturated_dates)}")

physics_labels_saturated = []
for date in saturated_dates:
    date_str = date.strftime('%Y-%m-%d')
    # Only very high frequency wards for saturation flooding
    for ward in VERY_HIGH_FREQ_WARDS:
        physics_labels_saturated.append({
            'date': date_str,
            'ward_id': ward,
            'flooded': 1,
            'event_id': f'PHYSICS_SATURATED_{date_str}',
            'event_type': 'physics_inferred',
            'confidence': 0.4,  # Lower confidence - indirect indicator
            'max_rainfall_mm': daily_7day[daily_7day['date'] == date]['rain_7day'].values[0],
            'source': 'Ground saturation inference',
            'notes': '7-day accumulation exceeded 400mm'
        })

print(f"Rule 3 (Saturation): {len(physics_labels_saturated)} labels")

PHYSICS-BASED LABEL EXPANSION
Chronic hotspot wards: 56

Days exceeding 150mm threshold: 13
Rule 1 (Heavy + Chronic): 728 labels
Days with 100-150mm: 48
Rule 2 (Moderate + VeryHigh): 1056 labels
Days with 7-day accumulation > 400mm: 47
Rule 3 (Saturation): 1034 labels


In [7]:
# ============================================================
# Cell 7: COMBINE ALL LABELS
# ============================================================

# Load existing news-based labels
df_news = pd.read_csv('../data/processed/flood_events_positive_labels.csv')
df_news['date'] = pd.to_datetime(df_news['date']).dt.strftime('%Y-%m-%d')

# Combine physics-based labels
df_physics = pd.concat([
    pd.DataFrame(physics_labels_heavy),
    pd.DataFrame(physics_labels_moderate),
    pd.DataFrame(physics_labels_saturated)
], ignore_index=True)

# Merge, keeping higher confidence when duplicates exist
df_all_positive = pd.concat([df_news, df_physics], ignore_index=True)

# Remove duplicates, keeping highest confidence
df_all_positive = df_all_positive.sort_values('confidence', ascending=False)
df_all_positive = df_all_positive.drop_duplicates(subset=['date', 'ward_id'], keep='first')

print("=" * 60)
print("üìä EXPANDED POSITIVE LABELS")
print("=" * 60)
print(f"\nNews-based labels: {len(df_news)}")
print(f"Physics-based labels: {len(df_physics)}")
print(f"Combined (deduplicated): {len(df_all_positive)}")

print("\nüéØ BY SOURCE:")
print(df_all_positive['source'].value_counts())

print("\nüéØ BY CONFIDENCE:")
print(df_all_positive['confidence'].value_counts().sort_index())

print("\nüéØ BY YEAR:")
df_all_positive['year'] = pd.to_datetime(df_all_positive['date']).dt.year
print(df_all_positive.groupby('year').size())

üìä EXPANDED POSITIVE LABELS

News-based labels: 777
Physics-based labels: 2818
Combined (deduplicated): 3441

üéØ BY SOURCE:
source
Moderate rain + very high frequency hotspot    1056
Ground saturation inference                     880
Physics threshold + chronic hotspot             728
Government reports, news                        245
News reports                                    155
IMD, news, fatality reports                     108
Government reports, satellite imagery            84
Cyclone Komen reports                            56
IMD, news                                        47
Cyclone reports                                  40
IMD data, news                                   22
KMC records                                      10
News, metro reports                              10
Name: count, dtype: int64

üéØ BY CONFIDENCE:
confidence
0.4     880
0.5    1056
0.6      36
0.7     863
0.8     404
0.9     192
1.0      10
Name: count, dtype: int64

üéØ BY YEAR:
year
2

In [8]:
# ============================================================
# Cell 8: CREATE BALANCED TRAINING SET
# ============================================================

# Load precipitation for negative sampling
precip_df = pd.read_csv('../data/processed/precipitation_features_2014_2025.csv')
precip_df['date'] = pd.to_datetime(precip_df['date']).dt.strftime('%Y-%m-%d')

# Get all flood dates
flood_dates = set(df_all_positive['date'].unique())

# Sample negatives: dry season days + low rainfall monsoon days
daily_avg = precip_df.groupby('date')['rain_d0'].mean().reset_index()

# Very safe negatives: < 10mm rain and NOT a flood date
very_safe_dates = daily_avg[
    (daily_avg['rain_d0'] < 10) &
    (~daily_avg['date'].isin(flood_dates))
]['date'].values

print(f"Very safe negative dates (<10mm, no flood): {len(very_safe_dates)}")

# Sample for balance (2:1 negative:positive ratio)
import numpy as np
np.random.seed(42)

n_positives = len(df_all_positive)
n_neg_dates_needed = (n_positives * 2) // 141 + 1  # Divide by wards

sampled_neg_dates = np.random.choice(
    very_safe_dates,
    size=min(n_neg_dates_needed, len(very_safe_dates)),
    replace=False
)

# Generate negative labels for ALL wards on safe dates
all_wards = [str(i) for i in range(1, 142)]
negative_labels = []

for date in sampled_neg_dates:
    for ward in all_wards:
        negative_labels.append({
            'date': date,
            'ward_id': ward,
            'flooded': 0,
            'event_id': 'NO_FLOOD',
            'event_type': 'none',
            'confidence': 0.95,
            'max_rainfall_mm': 0,
            'source': 'Dry day verification',
            'notes': f'Rain < 10mm'
        })

df_negative = pd.DataFrame(negative_labels)

# Combine final dataset
df_training = pd.concat([df_all_positive, df_negative], ignore_index=True)

print(f"\n‚úÖ FINAL TRAINING SET")
print(f"   Positive (flooded): {len(df_all_positive)}")
print(f"   Negative (no flood): {len(df_negative)}")
print(f"   Total: {len(df_training)}")
print(f"   Class ratio: 1:{len(df_negative)//len(df_all_positive):.1f}")

# Save
df_training.to_csv('../data/processed/training_labels_v2_physics.csv', index=False)
df_all_positive.to_csv('../data/processed/flood_events_positive_v2.csv', index=False)

print(f"\n‚úÖ Saved: training_labels_v2_physics.csv")
print(f"‚úÖ Saved: flood_events_positive_v2.csv")

Very safe negative dates (<10mm, no flood): 3362

‚úÖ FINAL TRAINING SET
   Positive (flooded): 3441
   Negative (no flood): 6909
   Total: 10350
   Class ratio: 1:2.0

‚úÖ Saved: training_labels_v2_physics.csv
‚úÖ Saved: flood_events_positive_v2.csv


In [9]:
# ============================================================
# THRESHOLD VALIDATION: What rainfall actually caused flooding?
# ============================================================

import pandas as pd
import numpy as np

# Load data
precip_df = pd.read_csv('../data/processed/precipitation_features_2014_2025.csv')
precip_df['date'] = pd.to_datetime(precip_df['date']).dt.strftime('%Y-%m-%d')

news_labels = pd.read_csv('../data/processed/flood_events_positive_labels.csv')

# Get unique flood dates from NEWS (verified events)
news_flood_dates = news_labels['date'].unique()

print("=" * 60)
print("RAINFALL ON VERIFIED FLOOD DAYS")
print("=" * 60)

# City-wide average rainfall on flood days
daily_avg = precip_df.groupby('date').agg({
    'rain_d0': 'mean',
    'rain_d1': 'mean',
    'rain_d2': 'mean',
    'rain_d3': 'mean',
    'rain_7day': 'mean'
}).reset_index()

flood_day_rain = daily_avg[daily_avg['date'].isin(news_flood_dates)].copy()
flood_day_rain = flood_day_rain.sort_values('rain_d0', ascending=False)

print(f"\nüìä VERIFIED FLOOD DAYS ({len(flood_day_rain)} days):")
print(flood_day_rain[['date', 'rain_d0', 'rain_7day']].head(20).to_string(index=False))

print(f"\nüìà RAINFALL STATISTICS ON FLOOD DAYS:")
print(f"   Same-day (rain_d0):")
print(f"      Min:    {flood_day_rain['rain_d0'].min():.1f} mm")
print(f"      25%:    {flood_day_rain['rain_d0'].quantile(0.25):.1f} mm")
print(f"      Median: {flood_day_rain['rain_d0'].median():.1f} mm")
print(f"      75%:    {flood_day_rain['rain_d0'].quantile(0.75):.1f} mm")
print(f"      Max:    {flood_day_rain['rain_d0'].max():.1f} mm")

print(f"\n   7-day accumulation:")
print(f"      Min:    {flood_day_rain['rain_7day'].min():.1f} mm")
print(f"      Median: {flood_day_rain['rain_7day'].median():.1f} mm")
print(f"      Max:    {flood_day_rain['rain_7day'].max():.1f} mm")

RAINFALL ON VERIFIED FLOOD DAYS

üìä VERIFIED FLOOD DAYS (37 days):
      date    rain_d0  rain_7day
2024-10-25 222.644533 314.753822
2019-11-09 160.317585 237.189072
2021-09-20 151.738224 594.295164
2017-07-23 150.850565 398.857084
2019-08-16 123.110423 386.146941
2015-08-01 122.958323 354.835198
2019-08-17 122.675458 503.460910
2020-05-20 117.894536 155.420280
2015-07-31 108.891487 321.409780
2017-07-22 102.069856 249.158080
2024-05-26  65.606453 114.891770
2024-05-27  61.734396 135.709997
2017-07-24  59.853261 458.002118
2025-09-25  56.573119 317.455666
2025-07-08  56.220283 217.221697
2015-07-29  51.688723 285.899000
2018-06-12  45.388864 157.680776
2017-07-20  41.082694 155.964678
2023-08-17  39.455176 162.833046
2015-08-02  36.803049 380.270729

üìà RAINFALL STATISTICS ON FLOOD DAYS:
   Same-day (rain_d0):
      Min:    0.0 mm
      25%:    14.9 mm
      Median: 39.5 mm
      75%:    102.1 mm
      Max:    222.6 mm

   7-day accumulation:
      Min:    59.6 mm
      Median: 277

In [10]:
# ============================================================
# FIND THE TRUE THRESHOLD
# ============================================================

print("\n" + "=" * 60)
print("THRESHOLD ANALYSIS")
print("=" * 60)

# What % of flood days had rainfall above various thresholds?
thresholds = [50, 75, 100, 125, 150, 175, 200]

print("\nüéØ SAME-DAY RAINFALL (rain_d0):")
for thresh in thresholds:
    pct = (flood_day_rain['rain_d0'] >= thresh).mean() * 100
    count = (flood_day_rain['rain_d0'] >= thresh).sum()
    print(f"   >= {thresh:3d}mm: {pct:5.1f}% ({count}/{len(flood_day_rain)} days)")

print("\nüéØ BUT WAIT - What about lagged rainfall?")
# Check if floods occurred with LOW same-day but HIGH previous days
low_sameday_floods = flood_day_rain[flood_day_rain['rain_d0'] < 50]
print(f"\nFlood days with < 50mm same-day rain: {len(low_sameday_floods)}")
if len(low_sameday_floods) > 0:
    print(low_sameday_floods[['date', 'rain_d0', 'rain_d1', 'rain_d2', 'rain_d3', 'rain_7day']].to_string(index=False))


THRESHOLD ANALYSIS

üéØ SAME-DAY RAINFALL (rain_d0):
   >=  50mm:  43.2% (16/37 days)
   >=  75mm:  27.0% (10/37 days)
   >= 100mm:  27.0% (10/37 days)
   >= 125mm:  10.8% (4/37 days)
   >= 150mm:  10.8% (4/37 days)
   >= 175mm:   2.7% (1/37 days)
   >= 200mm:   2.7% (1/37 days)

üéØ BUT WAIT - What about lagged rainfall?

Flood days with < 50mm same-day rain: 21
      date   rain_d0    rain_d1    rain_d2    rain_d3  rain_7day
2018-06-12 45.388864  18.991134  24.674397  23.777304 157.680776
2017-07-20 41.082694  77.265105   2.754752   0.708227 155.964678
2023-08-17 39.455176  20.795602  17.682127   0.310993 162.833046
2015-08-02 36.803049 122.958323 108.891487  26.215532 380.270729
2017-07-25 35.572623  59.853261 150.850565 102.069856 490.819989
2021-05-27 31.657943  14.923049  11.651205  25.341772  91.301062
2015-07-30 26.215532  51.688723  13.301205  20.412411 287.604603
2017-07-21 24.125886  41.082694  77.265105   2.754752 155.455174
2023-08-16 20.795602  17.682127   0.310993  14

In [11]:
# ============================================================
# COMPARE: FLOOD DAYS vs NON-FLOOD DAYS
# ============================================================

print("\n" + "=" * 60)
print("FLOOD vs NON-FLOOD DAY COMPARISON")
print("=" * 60)

non_flood_days = daily_avg[~daily_avg['date'].isin(news_flood_dates)]

print(f"\nüìä Rainfall Distribution:")
print(f"{'Metric':<20} {'Flood Days':>15} {'Non-Flood Days':>15}")
print("-" * 50)
print(f"{'Count':<20} {len(flood_day_rain):>15} {len(non_flood_days):>15}")
print(f"{'Mean rain_d0':<20} {flood_day_rain['rain_d0'].mean():>15.1f} {non_flood_days['rain_d0'].mean():>15.1f}")
print(f"{'Median rain_d0':<20} {flood_day_rain['rain_d0'].median():>15.1f} {non_flood_days['rain_d0'].median():>15.1f}")
print(f"{'Mean rain_7day':<20} {flood_day_rain['rain_7day'].mean():>15.1f} {non_flood_days['rain_7day'].mean():>15.1f}")

# How many non-flood days had HIGH rainfall? (potential missed floods)
print(f"\n‚ö†Ô∏è NON-FLOOD DAYS WITH HIGH RAINFALL (potential gaps):")
high_rain_non_flood = non_flood_days[non_flood_days['rain_d0'] >= 100]
print(f"   Days with >= 100mm but NO flood label: {len(high_rain_non_flood)}")
high_rain_non_flood_150 = non_flood_days[non_flood_days['rain_d0'] >= 150]
print(f"   Days with >= 150mm but NO flood label: {len(high_rain_non_flood_150)}")

if len(high_rain_non_flood_150) > 0:
    print("\n   These dates likely had unreported flooding:")
    print(high_rain_non_flood_150[['date', 'rain_d0', 'rain_7day']].head(10).to_string(index=False))


FLOOD vs NON-FLOOD DAY COMPARISON

üìä Rainfall Distribution:
Metric                    Flood Days  Non-Flood Days
--------------------------------------------------
Count                             37            4309
Mean rain_d0                    58.0             8.9
Median rain_d0                  39.5             0.1
Mean rain_7day                 275.1            63.7

‚ö†Ô∏è NON-FLOOD DAYS WITH HIGH RAINFALL (potential gaps):
   Days with >= 100mm but NO flood label: 51
   Days with >= 150mm but NO flood label: 9

   These dates likely had unreported flooding:
      date    rain_d0  rain_7day
2014-09-20 205.684534 223.705952
2015-07-09 216.035030 417.721976
2016-08-21 224.634889 307.646447
2017-06-19 169.701839 187.558931
2021-07-01 204.315414 307.451014
2021-07-29 298.747795 574.776796
2021-09-14 217.739073 303.120419
2024-05-06 163.297939 164.664109
2025-09-20 161.311202 285.521695


In [12]:
# ============================================================
# Cell: DATA-DRIVEN PHYSICS RULES
# ============================================================

import pandas as pd
import numpy as np

print("=" * 60)
print("DATA-DRIVEN FLOOD LABELING")
print("=" * 60)

# Load data
precip_df = pd.read_csv('../data/processed/precipitation_features_2014_2025.csv')
precip_df['date'] = pd.to_datetime(precip_df['date']).dt.strftime('%Y-%m-%d')

# Load existing news labels
news_labels = pd.read_csv('../data/processed/flood_events_positive_labels.csv')
news_flood_dates = set(news_labels['date'].unique())

# City-wide daily stats
daily_avg = precip_df.groupby('date').agg({
    'rain_d0': 'mean',
    'rain_d1': 'mean',
    'rain_d2': 'mean',
    'rain_d3': 'mean',
    'rain_7day': 'mean'
}).reset_index()

# ============================================================
# EMPIRICAL THRESHOLDS (from your flood day statistics)
# ============================================================

# From your data:
# - Flood day 7-day median: 277mm, 25th percentile: ~160mm
# - Non-flood day 7-day mean: 64mm
# - Clear separation around 200-250mm 7-day accumulation

RULE_THRESHOLDS = {
    'heavy_sameday': 150,      # Drainage capacity (engineering limit)
    'moderate_sameday': 100,   # Still significant
    'heavy_7day': 250,         # Above flood-day median
    'moderate_7day': 200,      # Between distributions
    'heavy_lag': 100,          # Significant prior-day rain
}

# Chronic wards (from document)
VERY_HIGH_FREQ_WARDS = set()
for locality in ['Behala', 'Jodhpur Park', 'Garia', 'College Street', 'Kalighat']:
    VERY_HIGH_FREQ_WARDS.update({
        '120', '121', '122', '123', '124', '125', '126', '127', '128', '129',  # Behala
        '86', '87',  # Jodhpur Park
        '135', '136', '137', '138',  # Garia
        '44', '45', '46',  # College Street
        '82', '83', '84',  # Kalighat
    })

HIGH_FREQ_WARDS = VERY_HIGH_FREQ_WARDS.union({
    '63', '64', '65', '66', '67',  # Ballygunge
    '43', '48',  # Central Avenue
    '99', '100', '101', '102', '103',  # Tollygunge
    '95', '96', '97', '98',  # Jadavpur
    '33', '34',  # Ultadanga
})

print(f"Very high frequency wards: {len(VERY_HIGH_FREQ_WARDS)}")
print(f"High frequency wards: {len(HIGH_FREQ_WARDS)}")

# ============================================================
# RULE 1: Heavy same-day (‚â•150mm) - ANY ward floods
# Engineering certainty: drainage capacity exceeded
# ============================================================

rule1_dates = daily_avg[daily_avg['rain_d0'] >= RULE_THRESHOLDS['heavy_sameday']]['date'].values
print(f"\nRule 1 dates (‚â•150mm same-day): {len(rule1_dates)}")

rule1_labels = []
all_wards = [str(i) for i in range(1, 142)]

for date in rule1_dates:
    if date not in news_flood_dates:  # Don't duplicate
        # All chronic wards flood when drainage exceeded
        for ward in HIGH_FREQ_WARDS:
            rule1_labels.append({
                'date': date,
                'ward_id': ward,
                'flooded': 1,
                'event_id': f'PHYSICS_R1_{date}',
                'event_type': 'physics_drainage_exceeded',
                'confidence': 0.85,
                'source': 'Drainage capacity exceeded (‚â•150mm)',
                'notes': f"Same-day: {daily_avg[daily_avg['date']==date]['rain_d0'].values[0]:.0f}mm"
            })

print(f"Rule 1 labels: {len(rule1_labels)}")

# ============================================================
# RULE 2: High 7-day accumulation (‚â•250mm) + chronic ward
# Empirical: median flood-day accumulation
# ============================================================

rule2_dates = daily_avg[
    (daily_avg['rain_7day'] >= RULE_THRESHOLDS['heavy_7day']) &
    (~daily_avg['date'].isin(news_flood_dates)) &
    (~daily_avg['date'].isin(rule1_dates))
]['date'].values

print(f"Rule 2 dates (‚â•250mm 7-day, not in R1): {len(rule2_dates)}")

rule2_labels = []
for date in rule2_dates:
    for ward in VERY_HIGH_FREQ_WARDS:
        rule2_labels.append({
            'date': date,
            'ward_id': ward,
            'flooded': 1,
            'event_id': f'PHYSICS_R2_{date}',
            'event_type': 'physics_saturation',
            'confidence': 0.7,
            'source': '7-day accumulation ‚â•250mm + chronic ward',
            'notes': f"7-day: {daily_avg[daily_avg['date']==date]['rain_7day'].values[0]:.0f}mm"
        })

print(f"Rule 2 labels: {len(rule2_labels)}")

# ============================================================
# RULE 3: Lag effect - Heavy rain 1-3 days ago + chronic ward
# Empirical: Sep 23, Oct 26 pattern
# ============================================================

# Find days where PREVIOUS days had heavy rain but same-day is low
daily_avg['max_lag'] = daily_avg[['rain_d1', 'rain_d2', 'rain_d3']].max(axis=1)

rule3_dates = daily_avg[
    (daily_avg['rain_d0'] < 50) &  # Low same-day
    (daily_avg['max_lag'] >= RULE_THRESHOLDS['heavy_lag']) &  # Heavy recent
    (daily_avg['rain_7day'] >= RULE_THRESHOLDS['moderate_7day']) &  # Accumulated
    (~daily_avg['date'].isin(news_flood_dates)) &
    (~daily_avg['date'].isin(rule1_dates)) &
    (~daily_avg['date'].isin(rule2_dates))
]['date'].values

print(f"Rule 3 dates (lag effect): {len(rule3_dates)}")

rule3_labels = []
for date in rule3_dates:
    for ward in VERY_HIGH_FREQ_WARDS:
        row = daily_avg[daily_avg['date'] == date].iloc[0]
        rule3_labels.append({
            'date': date,
            'ward_id': ward,
            'flooded': 1,
            'event_id': f'PHYSICS_R3_{date}',
            'event_type': 'physics_lag_effect',
            'confidence': 0.6,
            'source': 'Lag effect (heavy rain 1-3 days prior)',
            'notes': f"Same-day: {row['rain_d0']:.0f}mm, Max lag: {row['max_lag']:.0f}mm"
        })

print(f"Rule 3 labels: {len(rule3_labels)}")

DATA-DRIVEN FLOOD LABELING
Very high frequency wards: 22
High frequency wards: 40

Rule 1 dates (‚â•150mm same-day): 13
Rule 1 labels: 360
Rule 2 dates (‚â•250mm 7-day, not in R1): 205
Rule 2 labels: 4510
Rule 3 dates (lag effect): 25
Rule 3 labels: 550


In [13]:
# ============================================================
# Cell: COMBINE AND VALIDATE
# ============================================================

# Combine all physics labels
df_physics = pd.concat([
    pd.DataFrame(rule1_labels),
    pd.DataFrame(rule2_labels),
    pd.DataFrame(rule3_labels)
], ignore_index=True)

# Load news labels
df_news = pd.read_csv('../data/processed/flood_events_positive_labels.csv')

# Combine
df_all_positive = pd.concat([df_news, df_physics], ignore_index=True)
df_all_positive = df_all_positive.sort_values('confidence', ascending=False)
df_all_positive = df_all_positive.drop_duplicates(subset=['date', 'ward_id'], keep='first')

print("=" * 60)
print("üìä FINAL POSITIVE LABELS (DATA-DRIVEN)")
print("=" * 60)

print(f"\n‚úÖ News-based: {len(df_news)}")
print(f"‚úÖ Rule 1 (drainage exceeded): {len(rule1_labels)}")
print(f"‚úÖ Rule 2 (7-day saturation): {len(rule2_labels)}")
print(f"‚úÖ Rule 3 (lag effect): {len(rule3_labels)}")
print(f"‚úÖ Total (deduplicated): {len(df_all_positive)}")

print("\nüéØ BY SOURCE:")
print(df_all_positive['source'].value_counts())

print("\nüéØ BY CONFIDENCE:")
print(df_all_positive['confidence'].value_counts().sort_index())

# Validate: Check if we captured the 9 missing high-rain days
print("\n" + "=" * 60)
print("‚ö†Ô∏è VALIDATION: Did we capture the missing 150mm+ days?")
print("=" * 60)

missing_dates = [
    '2014-09-20', '2015-07-09', '2016-08-21', '2017-06-19',
    '2021-07-01', '2021-07-29', '2021-09-14', '2024-05-06', '2025-09-20'
]

physics_dates = set(df_all_positive['date'].unique())
for date in missing_dates:
    status = "‚úÖ CAPTURED" if date in physics_dates else "‚ùå MISSING"
    rain = daily_avg[daily_avg['date'] == date]['rain_d0'].values
    rain_str = f"{rain[0]:.0f}mm" if len(rain) > 0 else "N/A"
    print(f"   {date}: {rain_str} - {status}")

üìä FINAL POSITIVE LABELS (DATA-DRIVEN)

‚úÖ News-based: 777
‚úÖ Rule 1 (drainage exceeded): 360
‚úÖ Rule 2 (7-day saturation): 4510
‚úÖ Rule 3 (lag effect): 550
‚úÖ Total (deduplicated): 6197

üéØ BY SOURCE:
source
7-day accumulation ‚â•250mm + chronic ward    4510
Lag effect (heavy rain 1-3 days prior)       550
Drainage capacity exceeded (‚â•150mm)          360
Government reports, news                     245
News reports                                 155
IMD, news, fatality reports                  108
Government reports, satellite imagery         84
Cyclone Komen reports                         56
IMD, news                                     47
Cyclone reports                               40
IMD data, news                                22
KMC records                                   10
News, metro reports                           10
Name: count, dtype: int64

üéØ BY CONFIDENCE:
confidence
0.60     586
0.70    4645
0.80     404
0.85     360
0.90     192
1.00      10
Name:

In [14]:
# ============================================================
# Cell: CREATE BALANCED TRAINING SET
# ============================================================

# Safe negatives: low rain AND low accumulation
safe_neg_dates = daily_avg[
    (daily_avg['rain_d0'] < 10) &
    (daily_avg['rain_7day'] < 100) &
    (~daily_avg['date'].isin(df_all_positive['date'].unique()))
]['date'].values

print(f"\nSafe negative dates: {len(safe_neg_dates)}")

# Sample for 2:1 ratio
np.random.seed(42)
n_pos = len(df_all_positive)
n_neg_dates = (n_pos * 2) // 141 + 1

sampled_dates = np.random.choice(safe_neg_dates, size=min(n_neg_dates, len(safe_neg_dates)), replace=False)

negative_labels = []
all_wards = [str(i) for i in range(1, 142)]

for date in sampled_dates:
    for ward in all_wards:
        negative_labels.append({
            'date': date,
            'ward_id': ward,
            'flooded': 0,
            'event_id': 'NO_FLOOD',
            'event_type': 'none',
            'confidence': 0.95,
            'source': 'Verified dry (<10mm, <100mm 7-day)',
            'notes': ''
        })

df_negative = pd.DataFrame(negative_labels)
df_training = pd.concat([df_all_positive, df_negative], ignore_index=True)

print(f"\n‚úÖ FINAL TRAINING SET (DATA-DRIVEN)")
print(f"   Positive: {len(df_all_positive)}")
print(f"   Negative: {len(df_negative)}")
print(f"   Total: {len(df_training)}")
print(f"   Ratio: 1:{len(df_negative)/len(df_all_positive):.1f}")

# Save
df_training.to_csv('../data/processed/training_labels_v3_empirical.csv', index=False)
df_all_positive.to_csv('../data/processed/flood_events_positive_v3.csv', index=False)

print(f"\n‚úÖ Saved: training_labels_v3_empirical.csv")
print(f"‚úÖ Saved: flood_events_positive_v3.csv")


Safe negative dates: 2877

‚úÖ FINAL TRAINING SET (DATA-DRIVEN)
   Positive: 6197
   Negative: 12408
   Total: 18605
   Ratio: 1:2.0

‚úÖ Saved: training_labels_v3_empirical.csv
‚úÖ Saved: flood_events_positive_v3.csv
