# In [None]:
import os
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

# --- Run configuration -------------------------------------------------------
# Worker count passed to joblib.Parallel.
n_jobs = 12
# Years to process; range() excludes its end, so this covers 1980 through 2019.
years_to_process = range(1980, 2020)
# Placeholder values: replace each with the real directory before running.
# The weather folder is expected to hold one "year=YYYY" sub-directory per year.
weather_base = r"# path to weather files with heat stress"
czda_base = r"# path to CZDA parquet files (e.g., from 2000)"

def load_dedup_czda(f):
    """Load one CZDA parquet file and keep a single row per (lat, lon).

    Args:
        f: File name (not a full path) inside ``czda_base``.

    Returns:
        ``(key, frame)`` where ``key`` is the part of ``f`` before its first
        underscore and ``frame`` holds the lat/lon and czda_min/mean/max
        columns with duplicate coordinates removed (first occurrence wins).
        ``(None, None)`` when ``f`` is not a ``.parquet`` file or when reading
        it fails; a failure is reported on stdout rather than raised.
    """
    if not f.endswith(".parquet"):
        return None, None

    lookup_key = f.partition("_")[0]
    source_path = os.path.join(czda_base, f)
    wanted_columns = ["lat", "lon", "czda_min", "czda_mean", "czda_max"]
    try:
        frame = pd.read_parquet(source_path, columns=wanted_columns)
        unique_cells = frame.drop_duplicates(subset=["lat", "lon"], keep="first")
        return lookup_key, unique_cells
    except Exception as err:
        # Best-effort load: report the failure and let the caller skip this file.
        print(f"Could not load {f}: {err}")
        return None, None

def is_may_to_september(f):
    """Return True if *f* is a ``.parquet`` file name dated May through September.

    The month is taken from the second ``-``-separated field of the name,
    e.g. ``"2000-05-01_x.parquet"`` gives month 5.

    Args:
        f: File name to test.

    Returns:
        True when the name ends with ``.parquet`` and its month field is an
        integer in the inclusive range 5..9; False otherwise, including when
        the name has no second field or that field is not an integer.
    """
    if not f.endswith(".parquet"):
        return False
    try:
        month = int(f.split("-")[1])
    except (IndexError, ValueError):
        # No month field, or it is not numeric: treat as "not a dated file".
        # Only parse errors are caught so KeyboardInterrupt/SystemExit propagate.
        return False
    return 5 <= month <= 9

# Preload every May-September CZDA table into memory once, keyed by the
# prefix that load_dedup_czda derives from the file name.
print("Preloading CZDA files...")
czda_files = list(filter(is_may_to_september, os.listdir(czda_base)))
czda_data = Parallel(n_jobs=n_jobs)(
    delayed(load_dedup_czda)(czda_name)
    for czda_name in tqdm(czda_files, desc="Loading CZDA files")
)
# Files that failed to load come back as (None, None) and are dropped here.
czda_dict = dict(pair for pair in czda_data if pair[0] is not None)

# For every requested year, merge the matching CZDA table into each daily
# weather file and rewrite the file with only the rows whose CZDA values are
# all strictly positive.
for year in years_to_process:
    print(f"\nProcessing year {year}...")
    year_dir = os.path.join(weather_base, f"year={year}")

    if not os.path.exists(year_dir):
        print(f"Skipping year {year} — directory not found.")
        continue

    def process_file(fname):
        """Merge CZDA columns into one weather file and filter it in place.

        Args:
            fname: Name of a ``.parquet`` file inside ``year_dir``.

        Returns:
            A one-line status string for the per-year summary. Any exception
            is caught and reported in that string instead of being raised, so
            one bad file does not abort the whole year.
        """
        czda_columns = ["czda_min", "czda_mean", "czda_max"]
        try:
            date_str = fname.replace(".parquet", "")
            # Everything after the first "-" (the month-day part of the name).
            month_day = date_str.partition("-")[2]
            # CZDA tables are looked up under a fixed "2000-" prefix,
            # whatever year the weather file belongs to.
            czda_key = f"2000-{month_day}"

            czda_df = czda_dict.get(czda_key)
            if czda_df is None:
                return f"{fname} — CZDA not available"

            input_path = os.path.join(year_dir, fname)
            df = pd.read_parquet(input_path)
            # The file is rewritten in place, so on a re-run it may already
            # carry CZDA columns. Drop them first; otherwise the merge yields
            # suffixed "_x"/"_y" columns and the filter below raises KeyError.
            df = df.drop(columns=czda_columns, errors="ignore")
            df_merged = df.merge(czda_df, on=["lat", "lon"], how="left")

            # Rows without a CZDA match hold NaN, which compares False here,
            # so they are filtered out together with non-positive values.
            valid_mask = (df_merged[czda_columns] > 0).all(axis=1)
            filtered_df = df_merged[valid_mask]

            if filtered_df.empty:
                # Nothing is written in this case; say so explicitly because
                # the file on disk still holds its original, unfiltered rows.
                return f"{fname} processed, rows kept: 0 (file left unchanged)"

            # Write to a sibling temp file and swap it in, so an interrupted
            # write can never leave a truncated input file behind.
            tmp_path = input_path + ".tmp"
            try:
                filtered_df.to_parquet(tmp_path, index=False)
                os.replace(tmp_path, input_path)
            finally:
                # Only present if the write or the swap failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

            return f"{fname} processed, rows kept: {len(filtered_df)}"

        except Exception as e:
            return f"{fname} → {e}"

    all_files = sorted(f for f in os.listdir(year_dir) if f.endswith(".parquet"))
    print(f"{len(all_files)} files found for year {year}")

    results = Parallel(n_jobs=n_jobs)(
        delayed(process_file)(fname) for fname in tqdm(all_files, desc=f"Merging CZDA for {year}")
    )

    print(f"\nSummary for {year}:")
    for r in results:
        print(r)