# In [None]:
import os
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from scipy.spatial import cKDTree
from scipy.interpolate import RegularGridInterpolator
from tqdm import tqdm
from datetime import datetime

# Input/output locations. The values below are placeholders: replace each one
# with a real directory path before running the script.
era5_base = r"# path to ERA5 daily extracted parquet files"
daymet_base = r"# path to Daymet processed daily parquet files"
output_base = r"# path to save final output with interpolated values"

# ERA5 columns that are interpolated onto the Daymet points. Rows with a NaN in
# any of these columns are dropped before interpolation, and each column is
# written to the output as `era5_<name>`.
vars_to_interpolate = [
    "wind_speed_mean",
    "wind_speed_max",
    "wind_speed_min",
    "sp_mean",
    "sp_min",
    "sp_max",
]

def _build_era5_interpolators(df_era5_clean):
    """Build one grid interpolator per entry of ``vars_to_interpolate``.

    The ERA5 rows are pivoted into a (lat, lon) grid for each variable. Grid
    cells with no row in ``df_era5_clean`` become NaN in the pivot, and query
    points outside the grid evaluate to NaN (``bounds_error=False``).

    Args:
        df_era5_clean: ERA5 frame with 'lat', 'lon' and the variable columns,
            already stripped of rows containing NaNs in those variables.

    Returns:
        dict mapping variable name -> ``RegularGridInterpolator``.

    Raises:
        ValueError: propagated from ``DataFrame.pivot`` (duplicate lat/lon
            pairs) or from ``RegularGridInterpolator`` (degenerate grid).
    """
    era5_lats = np.sort(df_era5_clean['lat'].unique())
    era5_lons = np.sort(df_era5_clean['lon'].unique())

    interpolators = {}
    for var in vars_to_interpolate:
        pivoted = df_era5_clean.pivot(index='lat', columns='lon', values=var)
        # Re-index explicitly so the grid axes match the sorted coordinate arrays.
        grid = pivoted.loc[era5_lats, era5_lons].values
        interpolators[var] = RegularGridInterpolator(
            (era5_lats, era5_lons), grid,
            bounds_error=False, fill_value=np.nan
        )
    return interpolators


def interpolate_file(daymet_year_dir, output_year_dir, fname, era5_data_cache,
                     interpolator_cache, *, km_per_degree=111, max_dist_km=9):
    """Interpolate ERA5 variables onto the Daymet points of one daily file.

    Daymet rows whose nearest ERA5 point is farther than ``max_dist_km`` are
    dropped. For the remaining rows, every variable in ``vars_to_interpolate``
    is interpolated at the row's (lat, lon) and stored as ``era5_<var>``. The
    result, including the ``interp_dist_km`` column, is written to
    ``<output_year_dir>/<date>.parquet``.

    Args:
        daymet_year_dir: Directory containing the Daymet parquet file.
        output_year_dir: Directory the result is written to.
        fname: File name, e.g. ``2000-05-01.parquet``; also the cache key.
        era5_data_cache: Mapping of file name -> preloaded ERA5 DataFrame.
        interpolator_cache: Mapping of file name -> {variable: interpolator};
            filled on first use for a given file name.
        km_per_degree: Flat factor converting the nearest-neighbour distance
            (computed directly on lat/lon values) to kilometres. No latitude
            correction is applied to longitude differences.
        max_dist_km: Maximum allowed ``interp_dist_km`` for a row to be kept.

    Returns:
        A status string: ``"<fname> complete"`` on success, a message starting
        with ``"ERA5"`` when ERA5 data is missing or empty, or a message
        starting with ``"Failed"`` when any exception was raised.
    """
    date_str = fname.replace(".parquet", "")
    daymet_path = os.path.join(daymet_year_dir, fname)
    out_path = os.path.join(output_year_dir, f"{date_str}.parquet")

    try:
        # Check the ERA5 side first: it is already in memory, so a missing or
        # unusable day is reported without reading the Daymet parquet at all.
        df_era5 = era5_data_cache.get(fname)
        if df_era5 is None:
            return f"ERA5 file missing: {fname}"

        df_era5_clean = df_era5.dropna(subset=vars_to_interpolate)
        if df_era5_clean.empty:
            return f"ERA5 file empty after dropping NaNs: {fname}"

        df_daymet = pd.read_parquet(daymet_path, engine="pyarrow")

        # Nearest ERA5 point for every Daymet point, measured in coordinate
        # units and then scaled to approximate kilometres.
        tree = cKDTree(df_era5_clean[['lat', 'lon']].values)
        distances, _ = tree.query(df_daymet[['lat', 'lon']].values, k=1)
        df_daymet['interp_dist_km'] = distances * km_per_degree
        df_daymet_valid = df_daymet[df_daymet['interp_dist_km'] <= max_dist_km].copy()

        # Build into a local dict and publish it in one assignment, so the
        # shared cache never holds a partially populated entry (for example
        # when building one of the variables raises).
        interpolators = interpolator_cache.get(fname)
        if interpolators is None:
            interpolators = _build_era5_interpolators(df_era5_clean)
            interpolator_cache[fname] = interpolators

        valid_coords = df_daymet_valid[['lat', 'lon']].values
        for var in vars_to_interpolate:
            df_daymet_valid[f'era5_{var}'] = interpolators[var](valid_coords)

        df_daymet_valid.to_parquet(out_path, index=False, engine="pyarrow")
        return f"{fname} complete"

    except Exception as e:
        # Best-effort per-file processing: report the failure to the caller
        # instead of aborting the whole batch.
        return f"Failed on {date_str}: {e}"

def list_season_files(daymet_year_dir, months=(5, 6, 7, 8, 9)):
    """Return the sorted daily parquet file names that fall in ``months``.

    Only names of the form ``YYYY-MM-DD.parquet`` are considered. Other
    ``.parquet`` files are reported and skipped instead of aborting the run.

    Args:
        daymet_year_dir: Directory to list.
        months: Month numbers (1-12) to keep.

    Returns:
        Sorted list of matching file names (not full paths).

    Raises:
        OSError: propagated from ``os.listdir`` if the directory is unreadable.
    """
    selected = []
    for fname in os.listdir(daymet_year_dir):
        if not fname.endswith(".parquet"):
            continue
        try:
            file_date = datetime.strptime(fname.replace(".parquet", ""), "%Y-%m-%d")
        except ValueError:
            print(f"Skipping non-date parquet file: {fname}")
            continue
        if file_date.month in months:
            selected.append(fname)
    return sorted(selected)


def process_year(year):
    """Interpolate ERA5 onto Daymet for every selected day of one year.

    Reads from ``daymet_base/year=<year>`` and ``era5_base/<year>`` and writes
    to ``output_base/year=<year>``. ERA5 frames for the year are preloaded
    into memory, then the days are processed on a thread pool. Only problem
    statuses (those starting with "Failed" or "ERA5") are printed.

    Args:
        year: Calendar year to process.
    """
    daymet_year_dir = os.path.join(daymet_base, f"year={year}")
    era5_year_dir = os.path.join(era5_base, str(year))
    output_year_dir = os.path.join(output_base, f"year={year}")

    # A missing input year should not abort the remaining years, and should
    # not leave an empty output directory behind.
    if not os.path.isdir(daymet_year_dir):
        print(f"Skipping year {year}: Daymet directory not found: {daymet_year_dir}")
        return

    os.makedirs(output_year_dir, exist_ok=True)

    all_daymet_files = list_season_files(daymet_year_dir)
    print(f"Processing year {year} — {len(all_daymet_files)} days found")

    # Load every ERA5 day up front; days that are absent or unreadable are
    # simply left out of the cache and reported by interpolate_file later.
    era5_data_cache = {}
    for fname in tqdm(all_daymet_files, desc="Preloading ERA5"):
        era5_path = os.path.join(era5_year_dir, fname)
        if os.path.exists(era5_path):
            try:
                era5_data_cache[fname] = pd.read_parquet(era5_path, engine="pyarrow")
            except Exception as e:
                print(f"Could not load {fname}: {e}")
        else:
            print(f"Missing ERA5 file: {era5_path}")

    interpolator_cache = {}

    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = {
            executor.submit(
                interpolate_file,
                daymet_year_dir,
                output_year_dir,
                fname,
                era5_data_cache,
                interpolator_cache
            ): fname for fname in all_daymet_files
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Interpolating {year}"):
            result = future.result()
            if result.startswith(("Failed", "ERA5")):
                print(result)


if __name__ == "__main__":
    for year in range(1980, 2020):  # Update year range as needed
        process_year(year)
