In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from numba import njit

# Base directory and settings
# Root folder of the processed weather data. The string below is a placeholder
# and must be replaced with a real path; the code in this file looks for
# `year=<YYYY>` subfolders containing `.parquet` files underneath it.
output_base = r"# path to processed weather files"
# Years to visit; range() excludes its stop value, so this is 1980 through 2019.
years_to_process = range(1980, 2020)
# Worker count handed to joblib.Parallel for the per-file processing.
n_jobs = 12

# Relative humidity calculations
@njit
def saturation_vapor_pressure(temp_c):
    """Return the saturation vapor pressure for a temperature in Celsius.

    Evaluates ``6.112 * exp(17.67 * T / (T + 243.5))``. The result is divided
    into a vapor pressure in hPa by ``compute_rh``, so it is treated as hPa.
    NOTE(review): the constants look like the Bolton (1980) Magnus-type
    approximation — confirm against the intended reference.

    Args:
        temp_c: Air temperature in degrees Celsius (scalar or NumPy array).

    Returns:
        Saturation vapor pressure, same shape as ``temp_c``.
    """
    exponent = (17.67 * temp_c) / (temp_c + 243.5)
    return 6.112 * np.exp(exponent)

@njit
def compute_rh(temp_c, vp_hpa):
    """Return relative humidity in percent, limited to the range 0-100.

    Args:
        temp_c: Air temperature in degrees Celsius.
        vp_hpa: Actual vapor pressure in hPa.

    Returns:
        ``100 * vp_hpa / saturation_vapor_pressure(temp_c)`` clipped to
        ``[0.0, 100.0]``.
    """
    percent = (100.0 * vp_hpa) / saturation_vapor_pressure(temp_c)
    # Values outside 0-100 % are clamped to the nearest bound.
    return np.clip(percent, 0.0, 100.0)

def process_file(year, fname):
    """Add relative-humidity columns to one parquet file and rewrite it in place.

    Reads ``<output_base>/year=<year>/<fname>``, computes ``rh_min``,
    ``rh_max`` and ``rh_mean`` for every row where ``tmin``, ``tmax``,
    ``tmean`` and ``vp`` are all present, and writes the frame back to the
    same path without its index. Rows with a missing input are left untouched
    (NaN in a newly created column).

    Args:
        year: Year used to build the ``year=<year>`` subdirectory name.
        fname: File name of the parquet file inside that subdirectory.

    Returns:
        A human-readable status string. Exceptions are caught and reported in
        the returned string rather than raised, so one bad file does not abort
        the rest of a parallel batch.
    """
    fpath = os.path.join(output_base, f"year={year}", fname)
    # The result is written next to the target and swapped in with
    # os.replace, so an interrupted write cannot truncate the original file.
    # The ".tmp" suffix keeps a stray temp file out of "*.parquet" listings.
    tmp_path = fpath + ".tmp"
    try:
        df = pd.read_parquet(fpath)
        required_cols = ['tmin', 'tmax', 'tmean', 'vp']
        if not all(col in df.columns for col in required_cols):
            return f"{year}/{fname} — missing required columns"

        # Positional boolean mask of complete rows. Selecting by index labels
        # instead would break when the index restored from the file contains
        # duplicate labels (each label would match several rows).
        valid = df[required_cols].notna().all(axis=1).to_numpy()
        if not valid.any():
            return f"{year}/{fname} — no valid rows"

        tmin = df.loc[valid, 'tmin'].to_numpy()
        tmax = df.loc[valid, 'tmax'].to_numpy()
        tmean = df.loc[valid, 'tmean'].to_numpy()
        vp = df.loc[valid, 'vp'].to_numpy() / 100  # Pa to hPa

        # Saturation pressure rises with temperature, so for a fixed vapor
        # pressure the daily maximum temperature gives the minimum RH and the
        # daily minimum temperature gives the maximum RH.
        df.loc[valid, 'rh_min'] = compute_rh(tmax, vp)
        df.loc[valid, 'rh_max'] = compute_rh(tmin, vp)
        df.loc[valid, 'rh_mean'] = compute_rh(tmean, vp)

        df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, fpath)
        return f"{year}/{fname} processed"

    except Exception as e:
        # Best-effort cleanup of a partially written temp file; the original
        # file has not been touched at this point.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return f"{year}/{fname} failed: {e}"

# Walk the configured years; for each one, hand every parquet file in its
# folder to a joblib worker and print the status string each call returns.
for year in tqdm(years_to_process, desc="Processing years"):
    year_dir = os.path.join(output_base, f"year={year}")

    if os.path.exists(year_dir):
        # Sorted so the files are submitted in a stable, name-ordered sequence.
        all_files = sorted(
            f for f in os.listdir(year_dir) if f.endswith(".parquet")
        )
        print(f"Year {year}: {len(all_files)} files found")

        runner = Parallel(n_jobs=n_jobs, backend='loky')
        # The inner tqdm advances as tasks are pulled from this generator.
        tasks = (
            delayed(process_file)(year, f)
            for f in tqdm(all_files, desc=f"{year}", leave=False)
        )
        results = runner(tasks)

        for status in results:
            print(status)
    else:
        print(f"Skipping year {year} — directory not found")