# Chronicle Flood Database Intersection with GHSL Built Area dataset

This notebook intersects the Chronicle urban flood dataset, which contains over 880,000 flood events worldwide from 2000–2025, with the GHSL built-up surface raster to estimate the built-up share of each flood polygon.


# Imports & Configuration

In [8]:
import pandas as pd
import geopandas as gpd
from shapely import wkt, geometry
from shapely.geometry import box
import rasterio
from rasterstats import zonal_stats
import numpy as np
import folium
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import math
# --- CONFIGURATION ---
# Every value a reader may need to adjust before running the notebook is set here.

# File Paths
# NOTE(review): both paths are absolute locations on one machine; edit them before running elsewhere.
# CSV of Chronicle flood events; the processing cell parses polygons from its `geometry_wkt` column.
chronicle_ds_csv_path = r"D:\Development\RESEARCH\urban_flood_database\chronicle\chronicle_preliminary_huji.csv"
# Raster whose cell values are summed under each flood polygon.
# Judging by the file name this is the GHS built-up surface layer (epoch 2020, ESRI:54009, 100 m) — TODO confirm.
ghs_urban_raster_path = r"D:\Development\RESEARCH\Raanana\data\esa_worldcover\GHS_BUILT_S_E2020_GLOBE_R2023A_54009_100_V1_0.tif"

# Processing Settings
# Number of CSV rows to read (passed to `pd.read_csv` as `nrows`).
# Set to an integer (e.g., 10000) for testing, or None for full dataset.
# Any falsy value (None or 0) loads the full dataset.
SAMPLE_SIZE = None

# Data Loading & Processing

In [None]:
# Pipeline for this cell: load the Chronicle CSV, turn its WKT column into
# polygons, reproject them to the raster CRS, sum the raster cells under each
# polygon, and derive the built-up share of every polygon.

# 1. Load Data
# A falsy SAMPLE_SIZE (None or 0) means "read the whole file".
if SAMPLE_SIZE:
    chronicle_raw_df = pd.read_csv(chronicle_ds_csv_path, nrows=SAMPLE_SIZE)
else:
    chronicle_raw_df = pd.read_csv(chronicle_ds_csv_path)

# 2. Parse Geometries
print("Parsing WKT geometries...")
chronicle_raw_df['geometry'] = chronicle_raw_df['geometry_wkt'].apply(wkt.loads)

# 3. Create GeoDataFrame
# The CRS is declared at construction time rather than patched in afterwards
# with set_crs(inplace=True).
# NOTE(review): this assumes the WKT coordinates are lon/lat in EPSG:4326 — confirm against the data source.
chronicle_gdf = gpd.GeoDataFrame(chronicle_raw_df, geometry='geometry', crs="EPSG:4326")

# 4. Reproject
# Only metadata is read from the raster here, so the file handle is released
# before the (potentially slow) reprojection starts.
with rasterio.open(ghs_urban_raster_path) as src:
    ghs_raster_crs = src.crs
    ghs_nodata_value = src.nodata
print(f"Raster CRS: {ghs_raster_crs}")

print("Reprojecting polygons...")
chronicle_projected_gdf = chronicle_gdf.to_crs(ghs_raster_crs)

# 5. Calculate Zonal Statistics (WITH PROGRESS TRACKING)
print("Calculating zonal stats in chunks...")

# -- PROGRESS LOGIC START --
total_rows = len(chronicle_projected_gdf)
# Aim for 10 progress updates. The chunk size is clamped to at least 1 so an
# empty dataset does not hand range() a zero step (which raises ValueError).
chunk_size = max(1, math.ceil(total_rows / 10))
all_stats = []

# Iterate through the GeoDataFrame in positional chunks
for i in range(0, total_rows, chunk_size):
    subset = chronicle_projected_gdf.iloc[i : i + chunk_size]

    # Only the geometries are passed: zonal_stats needs nothing else, and this
    # avoids serialising every attribute column for each feature.
    chunk_stats = zonal_stats(
        subset.geometry,
        ghs_urban_raster_path,
        stats=['sum'],
        nodata=ghs_nodata_value
    )

    # Accumulate results (one dict per polygon, in input order)
    all_stats.extend(chunk_stats)

    # Calculate and print progress
    current_row = min(i + chunk_size, total_rows)
    percent_complete = int((current_row / total_rows) * 100)
    print(f"{percent_complete}% complete... ({current_row}/{total_rows})")

# Convert accumulated list to DataFrame. `columns=` guarantees the 'sum' column
# exists even when no polygons were processed.
urban_stats_df = pd.DataFrame(all_stats, columns=['sum'])
# -- PROGRESS LOGIC END --

# The positional assignment below is only valid with exactly one result per polygon.
if len(urban_stats_df) != total_rows:
    raise ValueError(
        f"zonal_stats returned {len(urban_stats_df)} results for {total_rows} polygons"
    )

# 6. Calculate Urban Percentage
print("Calculating percentages...")
# Assign by position (.to_numpy()) rather than by index label, so the result
# stays correct even if the GeoDataFrame index is not a default 0..n-1 range.
# Polygons with no raster sum (None/NaN) are counted as 0.
chronicle_projected_gdf['urban_built_up_area_m2'] = (
    pd.to_numeric(urban_stats_df['sum'], errors='coerce').fillna(0).to_numpy()
)
# Area is expressed in squared units of the raster CRS.
# NOTE(review): the `_m2` names assume that CRS is metric and equal-area — confirm.
chronicle_projected_gdf['polygon_total_area_m2'] = chronicle_projected_gdf.area

# Zero-area polygons get 0% instead of a division by zero.
chronicle_projected_gdf['urban_percentage'] = np.where(
    chronicle_projected_gdf['polygon_total_area_m2'] > 0,
    (chronicle_projected_gdf['urban_built_up_area_m2'] / chronicle_projected_gdf['polygon_total_area_m2']) * 100,
    0
)

# 7. Final Clean DataFrame
# Drop the geometry column and return a plain pandas DataFrame.
chronicle_urban_df = pd.DataFrame(chronicle_projected_gdf.drop(columns='geometry'))

print(f"--- Done! Processed {len(chronicle_urban_df)} records. ---")

## Save as a pickle

In [9]:
# Persist the geometry-free results table as a pickle so later sessions can
# reload it without repeating the zonal-statistics run.
# NOTE(review): the destination is a hard-coded absolute path; only unpickle files from trusted sources.
chronicle_urban_pickle_path = r"D:\Development\RESEARCH\urban_flood_database\chronicle\chronicle_urban_df.pkl"
chronicle_urban_df.to_pickle(chronicle_urban_pickle_path)