In [1]:
import pandas as pd
import numpy as np

# Create comprehensive flood events dataset (2023-2025)
all_events = []

# 2023 Events
events_2023 = [
    {'date': '2023-09-02', 'wards': [68, 93]},
]

# 2024 Events  
events_2024 = [
    {'date': '2024-05-26', 'wards': [65, 68, 130, 93, 96, 99, 81, 74]},
    {'date': '2024-07-08', 'wards': [93, 65, 68, 73, 84, 87, 88, 67, 91, 107, 100, 110, 111, 
                                      93, 96, 99, 130, 12, 90, 91, 63, 66]},
    {'date': '2024-09-23', 'wards': [100, 110, 111, 93, 73, 84, 87, 88, 66, 65, 68, 130, 
                                      93, 96, 99, 81, 63, 46, 12, 67, 91, 107]},
    {'date': '2024-10-25', 'wards': [73, 46, 130, 66]},
]

# 2025 Events
events_2025 = [
    {'date': '2025-07-08', 'wards': [12, 63, 67, 91, 107, 90, 91, 93, 65, 68, 130, 93, 96, 
                                      99, 111, 66, 100, 110]},
    {'date': '2025-09-23', 'wards': [111, 93, 73, 84, 87, 88, 66, 65, 68, 74, 130, 93, 96, 
                                      99, 68, 90, 91, 67, 107, 65, 66, 67, 63, 12, 73]},
    {'date': '2025-10-10', 'wards': [63, 73, 84, 87, 88, 68, 74, 130, 93, 65, 67, 91, 107, 
                                      65, 66, 67]}
]

# Compile all events
for event_list, year in [(events_2023, 2023), (events_2024, 2024), (events_2025, 2025)]:
    for event in event_list:
        # Get unique ward numbers
        unique_wards = list(set(event['wards']))
        for ward in unique_wards:
            all_events.append({
                'date': event['date'],
                'ward': ward,
                'flooded': 1
            })

# Create DataFrame
flood_df = pd.DataFrame(all_events)

# Remove duplicates (same ward on same date)
flood_df = flood_df.drop_duplicates(subset=['date', 'ward'])

# Identify consistently flooding wards
flood_frequency = flood_df.groupby('ward').size().reset_index(name='flood_count')
print("Most frequently flooded wards (2023-2025):")
print(flood_frequency.sort_values('flood_count', ascending=False).head(10))

# Add some non-flooded examples for each event
# Assuming northern wards (1-40) generally don't flood as much
non_flooded_wards = []
for date in flood_df['date'].unique():
    flooded_on_date = flood_df[flood_df['date'] == date]['ward'].tolist()
    # Add some wards that didn't flood (from northern areas)
    safe_wards = [w for w in range(1, 40) if w not in flooded_on_date]
    # Sample 5-10 safe wards per event
    for ward in np.random.choice(safe_wards, min(10, len(safe_wards)), replace=False):
        non_flooded_wards.append({
            'date': date,
            'ward': ward,
            'flooded': 0
        })

non_flooded_df = pd.DataFrame(non_flooded_wards)

# Combine flooded and non-flooded
complete_flood_data = pd.concat([flood_df, non_flooded_df], ignore_index=True)

# Sort by date and ward
complete_flood_data = complete_flood_data.sort_values(['date', 'ward'])

# Save comprehensive dataset
complete_flood_data.to_csv('../data/processed/kolkata_flood_events_2023_2025.csv', index=False)

print(f"\nDataset Summary:")
print(f"Total records: {len(complete_flood_data)}")
print(f"Unique dates: {complete_flood_data['date'].nunique()}")
print(f"Flooded instances: {complete_flood_data['flooded'].sum()}")
print(f"Non-flooded instances: {(complete_flood_data['flooded'] == 0).sum()}")

# Identify high-risk wards (flooded in >50% of events)
total_events = complete_flood_data['date'].nunique()
ward_flood_rate = complete_flood_data.groupby('ward')['flooded'].mean()
high_risk_wards = ward_flood_rate[ward_flood_rate > 0.5].index.tolist()

print(f"\nHigh-risk wards (flooded >50% of events):")
print(sorted(high_risk_wards))

Most frequently flooded wards (2023-2025):
    ward  flood_count
22   130            7
6     68            7
15    93            7
3     65            6
4     66            6
14    91            5
2     63            5
5     67            5
7     73            5
19   107            5

Dataset Summary:
Total records: 184
Unique dates: 8
Flooded instances: 104
Non-flooded instances: 80

High-risk wards (flooded >50% of events):
[12, 46, 63, 65, 66, 67, 68, 73, 74, 81, 84, 87, 88, 90, 91, 93, 96, 99, 100, 107, 110, 111, 130]
