# In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from metpy.calc import heat_index
from metpy.units import units

# Run configuration: worker count, year span, and input location.

# Number of joblib workers used for each year's batch of files.
n_jobs = 12

# Years 1980 through 2019 inclusive (the range end is exclusive).
years_to_process = range(1980, 2019 + 1)

# Root folder of the daily weather files; one "year=YYYY" subfolder per year is
# expected beneath it.
# NOTE: the value below is a placeholder and must be replaced with a real path.
weather_base = r"# path to directory containing daily weather files"

def _heat_index_celsius(temp_c, rel_humidity):
    """Return the heat index in °C as a plain float array, NaN where masked.

    ``temp_c`` is converted with the Celsius-to-Fahrenheit formula before being
    passed to MetPy's ``heat_index``; ``rel_humidity`` is tagged as percent.
    """
    temp_f = temp_c * 9 / 5 + 32
    hi = heat_index(temp_f * units.degF, rel_humidity * units.percent, mask_undefined=True)
    # heat_index does not necessarily hand back a masked array (it appears to
    # mask only when something actually needs masking), so calling ``.filled``
    # on the result can raise AttributeError.  np.ma.filled accepts masked and
    # ordinary arrays alike.
    return np.ma.filled(hi.to('degC').magnitude, np.nan)


def process_file(fpath):
    """Add min/max/mean heat-index columns (°C) to one parquet file, in place.

    Reads ``fpath``, computes the heat index for every row in which all six
    required temperature / relative-humidity columns are non-null, stores the
    results in ``heat_index_min_c``, ``heat_index_max_c`` and
    ``heat_index_mean_c`` (NaN for every other row), and writes the frame back
    to the same path without its index.

    Args:
        fpath: Path of the parquet file to update.

    Returns:
        A one-line status string: ``"<name> processed"``, ``"Skipped <name> —
        ..."`` when columns or valid rows are missing, or ``"<name> failed:
        <error>"`` if anything raised.  Exceptions are never propagated so a
        single bad file cannot abort a parallel batch.
    """
    fname = os.path.basename(fpath)
    try:
        df = pd.read_parquet(fpath)
        required_cols = ['tmin', 'tmax', 'tmean', 'rh_min', 'rh_max', 'rh_mean']
        if not all(col in df.columns for col in required_cols):
            return f"Skipped {fname} — missing columns"

        valid_mask = df[required_cols].notnull().all(axis=1).to_numpy()
        if not valid_mask.any():
            return f"Skipped {fname} — no valid rows"

        valid = df.loc[valid_mask, required_cols]

        # Output column -> (temperature column, humidity column).  The minimum
        # temperature is paired with the maximum humidity and vice versa.
        # NOTE(review): presumably because daily RH peaks when temperature is
        # lowest — confirm with the data owner.
        pairings = {
            'heat_index_min_c': ('tmin', 'rh_max'),
            'heat_index_max_c': ('tmax', 'rh_min'),
            'heat_index_mean_c': ('tmean', 'rh_mean'),
        }
        for out_col, (temp_col, rh_col) in pairings.items():
            # Build the full-length column from scratch so that rows which are
            # not valid never keep stale values from an earlier run.
            column = np.full(len(df), np.nan)
            column[valid_mask] = _heat_index_celsius(
                valid[temp_col].values, valid[rh_col].values
            )
            df[out_col] = column

        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave the original file truncated.
        tmp_path = f"{fpath}.tmp"
        try:
            df.to_parquet(tmp_path, index=False)
            os.replace(tmp_path, fpath)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        return f"{fname} processed"

    except Exception as e:
        return f"{fname} failed: {e}"

# Walk the per-year "year=YYYY" directories and run process_file on every
# parquet file in each one, fanning the files out over a joblib thread pool.
for year in tqdm(years_to_process, desc="Processing years"):
    year_dir = os.path.join(weather_base, f"year={year}")

    # A missing year directory should not abort the remaining years.
    if not os.path.isdir(year_dir):
        print(f"Year {year}: directory not found, skipping")
        continue

    all_files = sorted(
        os.path.join(year_dir, f)
        for f in os.listdir(year_dir) if f.endswith(".parquet")
    )

    print(f"Year {year}: {len(all_files)} files")

    results = Parallel(n_jobs=n_jobs, backend='threading')(
        delayed(process_file)(fpath)
        for fpath in tqdm(all_files, desc="HI calculation", leave=False)
    )

    # Print every 50th status as a progress sample, plus every failure.
    for i, r in enumerate(results):
        if i % 50 == 0 or "failed" in r:
            print(r)