In [1]:
# ============================================
# 0) IMPORTS
# ============================================
import os
import glob
import pandas as pd
import numpy as np
import shapely.wkt
import shapely.geometry
import geopandas as gpd
from affine import Affine
import rasterio.features
from tqdm import tqdm  # Progress bar
import gc  # Garbage Collector for memory management

import ee

# ============================================
# 1) CONFIGURATION
# ============================================

# --- INPUT: The consolidated PKL from step 02a ---
INPUT_PKL = r"D:\Development\RESEARCH\urban_flood_database\chronicle\hydromerit_pluvial_outputs\chronicle_df_with_pfdi_FULL.pkl"

# --- OUTPUT: Where rain data will be saved ---
# Per-batch pickles are written into OUT_DIR; the merged result goes to OUT_FINAL_PKL
OUT_DIR = r"D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs"
OUT_FINAL_PKL = os.path.join(OUT_DIR, "chronicle_urban_df_with_IMERG_FULL.pkl")

# IMERG Constants
# Data available from June 2000; events starting earlier are filtered out below
IMERG_START_DATE = pd.Timestamp("2000-06-01") 
SCALE = 0.1  # 0.1 Degree resolution (pixel size used for the mask transform)
CRS = 'EPSG:4326'  # NOTE(review): not referenced by the code visible in this notebook

# Batch Settings
# Number of events written to each batch file. All rain arrays of a batch are
# kept in memory until the batch is saved, so lower this if memory runs short.
BATCH_SIZE = 1000 
N_BATCHES_TO_RUN = 100  # Max batches per run (100 x 1000 = up to 100,000 events)

# ============================================
# 2) HELPERS
# ============================================

def ensure_out_dir(path):
    """Make sure `path` exists, creating it (and any parents) when absent."""
    if os.path.exists(path):
        # Something is already there; nothing to create.
        return
    os.makedirs(path, exist_ok=True)

def initialize_ee():
    """
    Start an Earth Engine session.

    A plain `ee.Initialize()` is attempted first. If it raises, the
    interactive `ee.Authenticate()` flow is run and initialization is
    attempted once more (a failure of that second attempt propagates).
    """
    try:
        ee.Initialize()
    except Exception:
        # First attempt failed: authenticate, then retry once.
        print("Authenticating Earth Engine...")
        ee.Authenticate()
        ee.Initialize()
        print("Earth Engine initialized after auth.")
    else:
        print("Earth Engine initialized.")

def get_next_batch_index(out_dir):
    """
    Return the number to use for the next 'imerg_batch_XXXX.pkl' file.

    The result is one more than the highest batch number found among the
    files matching 'imerg_batch_*.pkl' in `out_dir`. It is 0 when the
    directory does not exist, holds no matching file, or none of the
    matching names carries a parseable number.
    """
    if not os.path.exists(out_dir):
        return 0

    def _batch_number(path):
        # 'imerg_batch_0001.pkl' -> 1; None when the numeric part is not an int
        token = os.path.basename(path).split('_')[-1].split('.')[0]
        try:
            return int(token)
        except ValueError:
            return None

    candidates = glob.glob(os.path.join(out_dir, "imerg_batch_*.pkl"))
    numbers = [n for n in map(_batch_number, candidates) if n is not None]

    # -1 is the floor, so an empty (or all-negative) set still yields 0.
    return max([-1, *numbers]) + 1

def extract_rain_data(event_row):
    """
    Extracts IMERG rain matrix, metadata, and mask for a single event.

    Time Window Logic:
    - Start: 72 hours BEFORE event start.
    - End: 24 hours AFTER event end.

    Args:
        event_row: Mapping-like row (e.g. a pandas Series) providing
            'geometry_wkt' (a WKT string, or an object that already exposes
            `.bounds`), 'start_time' and 'end_time'. 'event_id' is read only
            to label the warning logged when extraction fails.

    Returns:
        (rain_matrix, rain_mask, rain_meta) where
        - rain_matrix is a float32 array stacked as (band, height, width),
          one band per image in the filtered collection,
        - rain_mask is a uint8 (height, width) array, 1 where the polygon
          touches a cell and 0 elsewhere,
        - rain_meta is a dict with 'origin_top_left', 'scale', 'shape' and
          'timestamps' (the sorted band keys matching the matrix layers).
        (None, None, None) is returned when the filtered collection is empty
        or when any step raises; in the latter case a warning is logged.
    """
    # Local import: `logging` is not part of the file's import block.
    import logging

    try:
        # 1. Geometry Setup
        geom_value = event_row['geometry_wkt']
        if isinstance(geom_value, str):
            poly_geom = shapely.wkt.loads(geom_value)
        else:
            # Fallback if it's already a geometry object
            poly_geom = geom_value

        bounds = poly_geom.bounds  # (minx, miny, maxx, maxy)
        roi = ee.Geometry.BBox(*bounds)

        # 2. Time Window Calculation
        start_t = event_row['start_time'] - pd.Timedelta(hours=72)
        end_t = event_row['end_time'] + pd.Timedelta(hours=24)

        # 3. GEE Request
        # NOTE(review): Earth Engine reports this asset as deprecated and
        # superseded by NASA/GPM_L3/IMERG_V07. Switching would need the band
        # name re-checked and existing batches re-processed - TODO confirm.
        imerg_coll = ee.ImageCollection("NASA/GPM_L3/IMERG_V06") \
            .select('precipitationCal') \
            .filterBounds(roi) \
            .filterDate(start_t, end_t)

        # Nothing to download when no image falls inside the window
        if imerg_coll.size().getInfo() == 0:
            return None, None, None

        # 4. Download Data (sampleRectangle)
        # Convert collection to a multi-band image (each band is a time step)
        stack = imerg_coll.toBands()

        # sampleRectangle downloads the raw pixels within the BBox
        pixel_dict = stack.sampleRectangle(region=roi).getInfo()
        properties = pixel_dict['properties']

        # 5. Parse & Stack Arrays
        # Keys are like '20000604120000_precipitationCal'; sorting them
        # fixes the layer order of the stacked matrix.
        band_keys = sorted(properties)

        # float32 to save memory; stacked into (Time, Height, Width)
        rain_matrix = np.stack(
            [np.array(properties[key], dtype=np.float32) for key in band_keys]
        )

        # 6. Create Metadata (Anchor)
        height, width = rain_matrix.shape[1], rain_matrix.shape[2]
        min_lon, _min_lat, _max_lon, max_lat = bounds

        # Transform for Rasterio (Lat/Lon), anchored at the BBox top-left.
        # NOTE(review): if sampleRectangle returns cells snapped to the
        # dataset's native grid rather than to the BBox corner, this anchor
        # (and the mask) may be offset by a fraction of a cell - confirm.
        transform = Affine(SCALE, 0, min_lon, 0, -SCALE, max_lat)

        meta = {
            'origin_top_left': (max_lat, min_lon),  # (Lat, Lon)
            'scale': SCALE,
            'shape': (height, width),
            'timestamps': band_keys  # Maps matrix layers to time
        }

        # 7. Create Binary Mask (Polygon shape on grid)
        # 1 = Inside Polygon, 0 = Outside
        mask = rasterio.features.rasterize(
            [(poly_geom, 1)],
            out_shape=(height, width),
            transform=transform,
            fill=0,
            all_touched=True,
            dtype=np.uint8
        )

        return rain_matrix, mask, meta

    except Exception as exc:
        # Best effort per event: one failure (e.g. a GEE timeout) must not
        # abort the whole batch, but it is logged so it can be diagnosed.
        getter = getattr(event_row, 'get', None)
        event_id = getter('event_id') if callable(getter) else None
        logging.getLogger(__name__).warning(
            "IMERG extraction failed for event %s: %s: %s",
            event_id, type(exc).__name__, exc,
        )
        return None, None, None

# ============================================
# 3) INITIALIZATION
# ============================================
# Bring up Earth Engine and make sure the output folder is there
initialize_ee()
ensure_out_dir(OUT_DIR)

# Read the consolidated event table produced by step 02a
print(f"Loading full dataset: {INPUT_PKL}")
if os.path.exists(INPUT_PKL):
    df = pd.read_pickle(INPUT_PKL)
else:
    raise FileNotFoundError(f"Input file not found: {INPUT_PKL}. Please run 02a first.")

# Basic cleaning: infinities become NaN, and both time columns are parsed
# with unit='s' (numeric values are read as seconds since the epoch)
df = df.replace([np.inf, -np.inf], np.nan)
for _time_col in ('start_time', 'end_time'):
    df[_time_col] = pd.to_datetime(df[_time_col], unit='s')

# Only events starting on/after IMERG_START_DATE are kept for processing
df_valid = df.loc[df['start_time'] >= IMERG_START_DATE].copy()

print(f"Total events in input: {len(df)}")
print(f"Events valid for IMERG (post-2000): {len(df_valid)}")

# ============================================
# 4) SMART BATCH PROCESSING (ID-BASED)
# ============================================

print("--- PREPARING WORK PLAN ---")

# 1. Identify what is already done
# Every event_id found in an existing batch file counts as processed, so a
# re-run resumes where the previous one stopped.
processed_ids = set()
pkl_pattern = os.path.join(OUT_DIR, "imerg_batch_*.pkl")
existing_files = glob.glob(pkl_pattern)

if existing_files:
    print(f"Found {len(existing_files)} existing batch files. Scanning for processed IDs...")
    for f in tqdm(existing_files, desc="Indexing existing data"):
        try:
            # The whole batch is loaded (a pickle cannot be read column by
            # column) and released as soon as its IDs are collected.
            df_temp = pd.read_pickle(f)
            if 'event_id' in df_temp.columns:
                processed_ids.update(df_temp['event_id'].tolist())
            del df_temp
        except Exception as e:
            print(f"Skipping corrupted file {f}: {e}")

print(f"Total events already processed: {len(processed_ids)}")

# 2. Filter the Main DataFrame
# We keep only rows whose ID is NOT in the processed set
df_todo = df_valid[~df_valid['event_id'].isin(processed_ids)].copy()

print(f"Events remaining to process: {len(df_todo)}")

if len(df_todo) == 0:
    print("All events are already processed! Nothing to do.")
else:
    # 3. Process the remaining rows in new batches
    start_batch_num = get_next_batch_index(OUT_DIR)
    n_remaining = len(df_todo)

    # Calculate stop limit based on N_BATCHES_TO_RUN
    max_rows_limit = N_BATCHES_TO_RUN * BATCH_SIZE
    stop_at_row = min(n_remaining, max_rows_limit)

    # Ceiling division: matches the number of iterations of the loop below,
    # including when the row count is an exact multiple of BATCH_SIZE.
    n_batches_planned = -(-stop_at_row // BATCH_SIZE)
    print(f"Plan: Processing {n_batches_planned} batches.")

    # Iterate in chunks up to stop_at_row
    for batch_i in range(0, stop_at_row, BATCH_SIZE):

        # Determine actual batch number for filename
        current_file_num = start_batch_num + (batch_i // BATCH_SIZE)

        # Slice the TODO dataframe
        batch_df = df_todo.iloc[batch_i : batch_i + BATCH_SIZE].copy()

        print(f"\nProcessing Batch {current_file_num} ({len(batch_df)} events)...")

        matrices = []
        masks = []
        metas = []

        # Inner loop: iterate rows in current batch
        for idx, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Batch {current_file_num}"):
            mat, msk, mt = extract_rain_data(row)
            matrices.append(mat)
            masks.append(msk)
            metas.append(mt)

        # Assign results to columns
        batch_df['imerg_matrix'] = matrices
        batch_df['imerg_mask'] = masks
        batch_df['imerg_meta'] = metas

        # Events without rain data stay in the batch file with None values
        # and are therefore treated as processed on later runs.
        n_failed = sum(m is None for m in matrices)
        if n_failed:
            print(f"Batch {current_file_num}: {n_failed} of {len(batch_df)} events returned no rain data.")

        # Save batch to disk (Pickle). Written under a temporary name first
        # and then moved into place, so an interrupted write never leaves a
        # truncated 'imerg_batch_*.pkl' behind. The '.tmp' suffix keeps the
        # partial file out of the batch glob pattern.
        out_path = os.path.join(OUT_DIR, f"imerg_batch_{current_file_num:04d}.pkl")
        tmp_path = out_path + ".tmp"
        batch_df.to_pickle(tmp_path)
        os.replace(tmp_path, out_path)
        print(f"Saved: {out_path}")

        # === MEMORY CLEANUP ===
        # Explicitly delete large objects to free RAM for next iteration
        del batch_df
        del matrices
        del masks
        del metas
        gc.collect() # Force garbage collection
        # ======================

    if stop_at_row < n_remaining:
        print("\n--- BATCH LIMIT REACHED ---")
        print(f"Stopped execution after {N_BATCHES_TO_RUN} batches as requested.")
    else:
        print("\n--- ALL REMAINING EVENTS PROCESSED ---")

# ============================================
# 5) FINAL MERGE
# ============================================
print("\n--- FINALIZING ---")
print("Merging all batch files...")

pkl_pattern = os.path.join(OUT_DIR, "imerg_batch_*.pkl")
all_pkl_files = glob.glob(pkl_pattern)

if all_pkl_files:
    # Load every readable batch; unreadable ones are reported and skipped
    df_list = []
    for batch_path in tqdm(all_pkl_files, desc="Loading Batches"):
        try:
            batch_frame = pd.read_pickle(batch_path)
        except Exception as load_err:
            print(f"Error loading {batch_path}: {load_err}")
        else:
            df_list.append(batch_frame)

    if not df_list:
        print("Failed to load any batch files.")
    else:
        df_results = pd.concat(df_list, ignore_index=True)

        print("Merging results back to main dataset...")

        # Start again from the input file as stored on disk
        df_base = pd.read_pickle(INPUT_PKL)

        # Left join on event_id: every input event is kept, and the IMERG
        # columns stay empty where no batch result exists
        imerg_columns = ['event_id', 'imerg_matrix', 'imerg_mask', 'imerg_meta']
        df_final = pd.merge(
            df_base,
            df_results[imerg_columns],
            on='event_id',
            how='left',
        )

        # Persist the merged table
        df_final.to_pickle(OUT_FINAL_PKL)

        print(f"SUCCESS! Final dataset saved to: {OUT_FINAL_PKL}")

        # Report how many events ended up with a rain matrix
        if 'imerg_matrix' in df_final.columns:
            count = df_final['imerg_matrix'].notna().sum()
            print(f"Events with valid Rain Data: {count}")
else:
    print("No output files found.")

Earth Engine initialized.
Loading full dataset: D:\Development\RESEARCH\urban_flood_database\chronicle\hydromerit_pluvial_outputs\chronicle_df_with_pfdi_FULL.pkl
Total events in input: 882957
Events valid for IMERG (post-2000): 882661
--- PREPARING WORK PLAN ---
Found 70 existing batch files. Scanning for processed IDs...


Indexing existing data: 100%|██████████████████████████████████████████████████████████| 70/70 [00:01<00:00, 60.55it/s]


Total events already processed: 29500
Events remaining to process: 853161
Plan: Processing 100 batches.

Processing Batch 70 (1000 events)...



Attention required for NASA/GPM_L3/IMERG_V06! You are using a deprecated asset.
To make sure your code keeps working, please update it.
This dataset has been superseded by NASA/GPM_L3/IMERG_V07

Learn more: https://developers.google.com/earth-engine/datasets/catalog/NASA_GPM_L3_IMERG_V06

Batch 70: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:17<00:00,  1.79it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0070.pkl

Processing Batch 71 (1000 events)...


Batch 71: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:10<00:00,  1.82it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0071.pkl

Processing Batch 72 (1000 events)...


Batch 72: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:28<00:00,  1.76it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0072.pkl

Processing Batch 73 (1000 events)...


Batch 73: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:21<00:00,  1.78it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0073.pkl

Processing Batch 74 (1000 events)...


Batch 74: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:27<00:00,  1.76it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0074.pkl

Processing Batch 75 (1000 events)...


Batch 75: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:21<00:00,  1.78it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0075.pkl

Processing Batch 76 (1000 events)...


Batch 76: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:40<00:00,  1.72it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0076.pkl

Processing Batch 77 (1000 events)...


Batch 77: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:29<00:00,  1.75it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0077.pkl

Processing Batch 78 (1000 events)...


Batch 78: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:47<00:00,  1.70it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0078.pkl

Processing Batch 79 (1000 events)...


Batch 79: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:22<00:00,  1.78it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0079.pkl

Processing Batch 80 (1000 events)...


Batch 80: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:34<00:00,  1.74it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0080.pkl

Processing Batch 81 (1000 events)...


Batch 81: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:12<00:00,  1.81it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0081.pkl

Processing Batch 82 (1000 events)...


Batch 82: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:29<00:00,  1.76it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0082.pkl

Processing Batch 83 (1000 events)...


Batch 83: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:01<00:00,  1.85it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0083.pkl

Processing Batch 84 (1000 events)...


Batch 84: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:19<00:00,  1.79it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0084.pkl

Processing Batch 85 (1000 events)...


Batch 85: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:08<00:00,  1.82it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0085.pkl

Processing Batch 86 (1000 events)...


Batch 86: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:28<00:00,  1.76it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0086.pkl

Processing Batch 87 (1000 events)...


Batch 87: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:17<00:00,  1.80it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0087.pkl

Processing Batch 88 (1000 events)...


Batch 88: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:21<00:00,  1.78it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0088.pkl

Processing Batch 89 (1000 events)...


Batch 89: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:57<00:00,  1.67it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0089.pkl

Processing Batch 90 (1000 events)...


Batch 90: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:30<00:00,  1.75it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0090.pkl

Processing Batch 91 (1000 events)...


Batch 91: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:29<00:00,  1.76it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0091.pkl

Processing Batch 92 (1000 events)...


Batch 92: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:08<00:00,  1.82it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0092.pkl

Processing Batch 93 (1000 events)...


Batch 93: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:18<00:00,  1.79it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0093.pkl

Processing Batch 94 (1000 events)...


Batch 94: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:51<00:00,  1.69it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0094.pkl

Processing Batch 95 (1000 events)...


Batch 95: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:27<00:00,  1.76it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0095.pkl

Processing Batch 96 (1000 events)...


Batch 96: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:21<00:00,  1.78it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0096.pkl

Processing Batch 97 (1000 events)...


Batch 97: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [09:26<00:00,  1.77it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0097.pkl

Processing Batch 98 (1000 events)...


Batch 98: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [10:00<00:00,  1.66it/s]


Saved: D:\Development\RESEARCH\urban_flood_database\chronicle\imerg_rain_outputs\imerg_batch_0098.pkl

Processing Batch 99 (1000 events)...


Batch 99:   1%|▉                                                                     | 13/1000 [00:08<10:07,  1.62it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\raznu\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\raznu\AppData\Local\Temp\ipykernel_21160\1193799961.py", line 266, in <module>
    mat, msk, mt = extract_rain_data(row)
  File "C:\Users\raznu\AppData\Local\Temp\ipykernel_21160\1193799961.py", line 123, in extract_rain_data
    if imerg_coll.size().getInfo() == 0:
  File "C:\Users\raznu\Anaconda3\lib\site-packages\ee\computedobject.py", line 107, in getInfo
    return data.computeValue(self)
  File "C:\Users\raznu\Anaconda3\lib\site-packages\ee\data.py", line 1064, in computeValue
    return _execute_cloud_call(
  File "C:\Users\raznu\Anaconda3\lib\site-packages\ee\data.py", line 349, in _execute_cloud_call
    return call.execute(num_retries=num_retries)
  File "C:\Users\raznu\Anaconda3\lib\site-packages\googleapiclient\_helpers.py", line 130, in positional_wrapper
    re

TypeError: object of type 'NoneType' has no len()

In [None]:
# ============================================
# 5) FINAL MERGE & CLEANUP
# ============================================
print("\n--- FINALIZING ---")
print("Merging all batch files...")

pkl_pattern = os.path.join(OUT_DIR, "imerg_batch_*.pkl")
all_pkl_files = glob.glob(pkl_pattern)

if not all_pkl_files:
    print("No output files found.")
else:
    # Concatenate all batches; unreadable files are reported and skipped
    df_list = []
    for f in tqdm(all_pkl_files, desc="Loading Batches"):
        try:
            df_list.append(pd.read_pickle(f))
        except Exception as e:
            print(f"Error loading {f}: {e}")

    if df_list:
        # This DataFrame contains ONLY the processed events with rain data
        df_results = pd.concat(df_list, ignore_index=True)

        print(f"Loaded {len(df_results)} processed rain events (raw count).")
        print("Merging results back to main dataset structure...")

        # Reload the original input again to ensure we have the clean base columns
        df_base = pd.read_pickle(INPUT_PKL)

        # Merge: 'inner' keeps only keys that appear in BOTH DataFrames,
        # i.e. only events that are present in some batch file
        df_final = df_base.merge(
            df_results[['event_id', 'imerg_matrix', 'imerg_mask', 'imerg_meta']],
            on='event_id',
            how='inner'
        )

        # --- FILTER OUT NULL RAIN DATA ---

        # 1. Ensure start_time is datetime format for readable printing
        if 'start_time' in df_final.columns:
            df_final['start_time'] = pd.to_datetime(df_final['start_time'], unit='s')

        # 2. Identify rows where 'imerg_matrix' is Null/None (Values of 0 are NOT null, so they stay)
        missing_rain_mask = df_final['imerg_matrix'].isnull()
        missing_events = df_final[missing_rain_mask]

        # 3. Print dates of dropped events
        if not missing_events.empty:
            print(f"\n[WARNING] Found {len(missing_events)} events with NULL rain data. Removing them...")
            print("--- Dropped Events Log ---")
            # Only the two printed columns are iterated, which avoids
            # building a full row object for every dropped event.
            for ev_id, ev_start in zip(missing_events['event_id'], missing_events['start_time']):
                try:
                    d_str = ev_start.strftime('%d-%m-%Y %H:%M')
                except Exception:
                    # Unformattable value (e.g. NaT): fall back to its plain
                    # string form. `Exception` rather than a bare `except`,
                    # so Ctrl-C can still interrupt the loop.
                    d_str = str(ev_start)
                print(f"Removing ID: {ev_id} | Date: {d_str}")

            # 4. Perform the drop
            df_final = df_final[~missing_rain_mask].copy()
            print(f"--- Cleaned. Remaining events: {len(df_final)} ---")
        else:
            print("No NULL rain events found. All processed events are valid.")

        # --------------------------------------

        # Save Final PKL (Cleaned subset)
        df_final.to_pickle(OUT_FINAL_PKL)

        print(f"SUCCESS! Final dataset saved to: {OUT_FINAL_PKL}")

        # Final Verification
        if 'imerg_matrix' in df_final.columns:
            count = df_final['imerg_matrix'].notnull().sum()
            print(f"Verified valid Rain Data events: {count}")
    else:
        print("Failed to load any batch files.")


--- FINALIZING ---
Merging all batch files...


Loading Batches: 100%|█████████████████████████████████████████████████████████████████| 99/99 [00:03<00:00, 25.66it/s]


Loaded 58500 processed rain events (raw count).
Merging results back to main dataset structure...

--- Dropped Events Log ---
Removing ID: 1134 | Date: 01-11-2000 00:00
Removing ID: 1546 | Date: 06-12-2000 00:00
Removing ID: 1639 | Date: 13-12-2000 00:00
Removing ID: 1668 | Date: 01-01-2001 00:00
Removing ID: 1724 | Date: 01-01-2001 00:00
Removing ID: 1788 | Date: 01-01-2001 00:00
Removing ID: 1984 | Date: 01-05-2001 00:00
Removing ID: 2697 | Date: 01-01-2002 00:00
Removing ID: 4777 | Date: 02-09-2003 00:00
Removing ID: 4778 | Date: 02-09-2003 00:00
Removing ID: 4779 | Date: 02-09-2003 00:00
Removing ID: 4780 | Date: 02-09-2003 00:00
Removing ID: 4783 | Date: 02-09-2003 00:00
Removing ID: 4784 | Date: 02-09-2003 00:00
Removing ID: 4785 | Date: 02-09-2003 00:00
Removing ID: 4786 | Date: 02-09-2003 00:00
Removing ID: 4788 | Date: 02-09-2003 00:00
Removing ID: 4789 | Date: 02-09-2003 00:00
Removing ID: 4790 | Date: 02-09-2003 00:00
Removing ID: 4791 | Date: 02-09-2003 00:00
Removing ID: 4

In [None]:
# Bare expression at the end of the cell: shows the merged DataFrame as the cell output
df_final