In [None]:
import os
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

# Run configuration.
# weather_base is a placeholder string and must be replaced with the real root
# directory; the script looks for one "year=<YYYY>" sub-folder per year under it.
weather_base = r"# path to weather data directory"
# range() is half-open, so this covers 1980 through 2019 inclusive.
years_to_process = range(1980, 2020)
# Passed to joblib.Parallel as the number of concurrent jobs per year.
n_jobs = 12

def process_file(fpath) -> str:
    """Add a ``tmean`` column to one parquet file and rewrite it in place.

    ``tmean`` is computed as ``(tmin + tmax) / 2``. The updated frame is first
    written to a sibling temporary file and then swapped over the original
    with ``os.replace``, so an interrupted or failed write never leaves the
    source file truncated.

    Args:
        fpath: Path to the parquet file to update.

    Returns:
        A one-line status message prefixed with the file's base name:
        "tmean added", "missing tmin or tmax", or "error: <exception>".
        Exceptions are reported through the message rather than raised so a
        single bad file does not abort the whole parallel batch.
    """
    fname = os.path.basename(fpath)
    # The ".tmp" suffix keeps the scratch file out of any "*.parquet" listing.
    tmp_path = f"{fpath}.tmp"
    try:
        df = pd.read_parquet(fpath, engine='pyarrow')
        if 'tmin' not in df.columns or 'tmax' not in df.columns:
            return f"{fname} — missing tmin or tmax"

        df['tmean'] = (df['tmin'] + df['tmax']) / 2
        df.to_parquet(tmp_path, index=False, engine='pyarrow')
        # Swap the finished file into place only after it was fully written.
        os.replace(tmp_path, fpath)
        return f"{fname} — tmean added"
    except Exception as e:
        # Best-effort cleanup of a partially written scratch file; the
        # original error is what gets reported to the caller.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return f"{fname} — error: {e}"

# Walk the yearly partitions one at a time; each year's files are fanned out
# to joblib workers and the per-file status messages are printed afterwards.
for year in tqdm(years_to_process, desc="Processing years"):
    year_dir = os.path.join(weather_base, f"year={year}")
    if not os.path.exists(year_dir):
        print(f"Skipping {year}: folder not found.")
        continue

    # All paths share the same directory prefix, so sorting the bare names
    # yields the same order as sorting the joined paths.
    all_files = [
        os.path.join(year_dir, entry)
        for entry in sorted(os.listdir(year_dir))
        if entry.endswith(".parquet")
    ]
    print(f"Year {year}: {len(all_files)} files found")

    # The inner bar advances as tasks are handed to the worker pool.
    file_progress = tqdm(all_files, desc="Adding tmean", leave=False)
    pending_tasks = (delayed(process_file)(path) for path in file_progress)
    results = Parallel(n_jobs=n_jobs)(pending_tasks)

    print(f"\nSummary for {year}:")
    for r in results:
        print(r)