In [None]:
# Core libs for this workflow
!pip -q install python-dotenv requests tenacity pandas numpy pyarrow fastparquet pytz

# Optional (only if you enable ERA5 later)
!pip -q install cdsapi xarray netcdf4

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, sys, time, json, math, datetime

# (Optional) bump throttling slightly if you still see 429s or stalls.
# A larger PER_SITE_SLEEP_S already present in the namespace is kept; anything
# below the 0.7 s floor is raised to it.
try:
    PER_SITE_SLEEP_S = float(globals().get("PER_SITE_SLEEP_S", 0.7))
    if PER_SITE_SLEEP_S < 0.7:
        PER_SITE_SLEEP_S = 0.7
except Exception:
    # A leftover value that cannot be converted to float falls back to the default.
    PER_SITE_SLEEP_S = 0.7

print("time imported. PER_SITE_SLEEP_S =", PER_SITE_SLEEP_S)

time imported. PER_SITE_SLEEP_S = 0.7


In [None]:
# --- Cell 1: Colab mount + .env load + global config (with throttling) ---

# 1) Mount Google Drive
# NOTE(review): google.colab is presumably only importable inside Colab, so this
# cell is expected to fail elsewhere. The .env file loaded below lives under
# this mount point, which is why the mount has to happen first.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 2) Env loading (install python-dotenv if needed)
# Fallback install for kernels where the package is not available yet; the
# import is retried right after the install.
try:
    import dotenv  # type: ignore
except ImportError:
    %pip -q install python-dotenv
    import dotenv

import os
from pathlib import Path
import datetime, sys
import pandas as pd

# Location of the .env file on the mounted Drive; it is loaded before any of
# the environment lookups below.
ENV_PATH = "/content/drive/MyDrive/Colab Notebooks/Dataset.env"
dotenv.load_dotenv(ENV_PATH)

# --- Credentials / Tokens (None when the variable is not set) ---
# NSRDB identity fields (required for NSRDB only; ERA5 does not need these).
NREL_API_KEY = os.environ.get("NREL_API_KEY")
NREL_EMAIL = os.environ.get("NREL_EMAIL")              # contact e-mail
NREL_FULL_NAME = os.environ.get("NREL_FULL_NAME")      # requester's full name
NREL_AFFILIATION = os.environ.get("NREL_AFFILIATION")  # institution
NREL_REASON = os.environ.get("NREL_REASON", "research")

# Renewables Ninja (optional; not used past 2019 in this pipeline).
# The long variable name wins; the short alias is the fallback.
RN_TOKEN = os.environ.get("RENEWABLES_NINJA_TOKEN")
if not RN_TOKEN:
    RN_TOKEN = os.environ.get("RN_TOKEN")

# Optional EIA/ENTSO-E (unused in this aggregation step).
EIA_API_KEY = os.environ.get("EIA_API_KEY")

# --- Time window (END is exclusive) ---
START = "2022-01-01T00:00:00Z"
END = "2024-01-01T00:00:00Z"   # covers 2022 & 2023

# --- Throttling (important to avoid 429/timeouts) ---
# Polite delay between sites; used by process_region().
PER_SITE_SLEEP_S = 0.7

# --- Output / Cache / Checkpoints (relative to the working directory) ---
OUT = Path("out")
CACHE = Path("cache_era5")
CKPT = Path("checkpoints")
OUT.mkdir(parents=True, exist_ok=True)
CACHE.mkdir(parents=True, exist_ok=True)
CKPT.mkdir(parents=True, exist_ok=True)

# Consistent, UTC-aware logger (avoids Deprecation warnings)
def log(msg: str):
    """Print ``msg`` prefixed with the current UTC time as ``[HH:MM:SS]``.

    ``datetime.timezone.utc`` is used instead of ``datetime.UTC`` because the
    latter only exists on Python 3.11+; both name the same timezone. The line
    is flushed immediately so progress is visible during long-running loops.
    """
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("[%H:%M:%S]")
    print(f"{ts} {msg}", flush=True)
# Calendar years touched by the half-open window [start, end)
def years_in_range(start_iso: str, end_iso: str):
    """Return the calendar years covered by the window ``[start_iso, end_iso)``.

    The end is exclusive, matching how END is defined for this notebook: a
    window ending exactly at ``2024-01-01T00:00:00Z`` covers 2022 and 2023 but
    not 2024. Both bounds are interpreted in UTC.

    Args:
        start_iso: Inclusive start as an ISO-8601 string.
        end_iso: Exclusive end as an ISO-8601 string.

    Returns:
        Ascending list of years; empty when the window is empty or reversed.
    """
    start_ts = pd.to_datetime(start_iso, utc=True)
    end_ts = pd.to_datetime(end_iso, utc=True)
    if end_ts <= start_ts:
        return []
    # Step back one tick so an end that falls exactly on a year boundary does
    # not pull in the following year.
    last_instant = end_ts - pd.Timedelta(nanoseconds=1)
    return list(range(start_ts.year, last_instant.year + 1))

# Years covered by the configured window.
ALL_YEARS = years_in_range(START, END)

# NSRDB PSM3 historical availability
NSRDB_MIN_YEAR = 1998
NSRDB_MAX_YEAR = 2020

# Only the years inside the NSRDB cap can be requested from that source.
NSRDB_YEARS = [y for y in ALL_YEARS if NSRDB_MIN_YEAR <= y <= NSRDB_MAX_YEAR]

log(f"ALL_YEARS={ALL_YEARS} | NSRDB_YEARS(clipped)={NSRDB_YEARS} (cap {NSRDB_MIN_YEAR}-{NSRDB_MAX_YEAR})")

# Sanity print: only reports whether each credential is set, never its value.
log(f"Env loaded from: {ENV_PATH}")
log(f"NREL_API_KEY set? {'yes' if NREL_API_KEY else 'no'}; "
    f"NREL_EMAIL set? {'yes' if NREL_EMAIL else 'no'}; "
    f"NREL_FULL_NAME set? {'yes' if NREL_FULL_NAME else 'no'}; "
    f"NREL_AFFILIATION set? {'yes' if NREL_AFFILIATION else 'no'}; "
    f"NREL_REASON set? {'yes' if NREL_REASON else 'no'}; "
    f"RN_TOKEN set? {'yes' if RN_TOKEN else 'no'}")
log(f"Window: {START} → {END} | PER_SITE_SLEEP_S={PER_SITE_SLEEP_S}")
log(f"Dirs: OUT={OUT.resolve()} | CACHE={CACHE.resolve()} | CKPT={CKPT.resolve()}")


Mounted at /content/drive
[00:38:45] ALL_YEARS=[2022, 2023, 2024] | NSRDB_YEARS(clipped)=[] (cap 1998-2020)
[00:38:45] Env loaded from: /content/drive/MyDrive/Colab Notebooks/Dataset.env
[00:38:45] NREL_API_KEY set? yes; NREL_EMAIL set? yes; NREL_FULL_NAME set? yes; NREL_AFFILIATION set? yes; NREL_REASON set? yesRN_TOKEN set? yes
[00:38:45] Window: 2022-01-01T00:00:00Z → 2024-01-01T00:00:00Z | PER_SITE_SLEEP_S=0.7
[00:38:45] Dirs: OUT=/content/out | CACHE=/content/cache_era5 | CKPT=/content/checkpoints


In [None]:
# ====================================================
# Cell 2 — Site Registry (Direct from Python list)
# ====================================================
import numpy as np

# Full site list (Europe, Americas, Asia-Pacific, Australia)
# Copied from your registry definition
SITES = [
    # =========================
    # EUROPE — 54 sites
    # =========================
    # --- Germany (10) — North Sea/Baltic wind hubs + solar in Bavaria/BW ---
    {"site_id":"DE_Cuxhaven",        "lat":53.8619, "lon": 8.6947,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Bremerhaven",     "lat":53.5396, "lon": 8.5809,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Emden",           "lat":53.3674, "lon": 7.2078,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Husum",           "lat":54.4858, "lon": 9.0526,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Wilhelmshaven",   "lat":53.5219, "lon": 8.1064,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Sassnitz_Ruegen", "lat":54.5151, "lon":13.6434,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Bayreuth",        "lat":49.9456, "lon":11.5713,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Ingolstadt",      "lat":48.7665, "lon":11.4258,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Freiburg",        "lat":47.9990, "lon": 7.8421,  "elev_m":np.nan, "country":"DE"},
    {"site_id":"DE_Leipzig",         "lat":51.3397, "lon":12.3731,  "elev_m":np.nan, "country":"DE"},

    # --- France (10) — Offshore wind ports + high-insolation south ---
    {"site_id":"FR_Dunkerque",       "lat":51.0344, "lon": 2.3768,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_SaintNazaire",    "lat":47.2735, "lon":-2.2137,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_Cherbourg",       "lat":49.6337, "lon":-1.6221,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_LeHavre",         "lat":49.4944, "lon": 0.1079,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_Marseille",       "lat":43.2965, "lon": 5.3698,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_Montpellier",     "lat":43.6117, "lon": 3.8777,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_Perpignan",       "lat":42.6887, "lon": 2.8948,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_Toulouse",        "lat":43.6047, "lon": 1.4442,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_Bordeaux",        "lat":44.8378, "lon":-0.5792,  "elev_m":np.nan, "country":"FR"},
    {"site_id":"FR_Narbonne",        "lat":43.1843, "lon": 3.0031,  "elev_m":np.nan, "country":"FR"},

    # --- Spain (10) — Extremadura/Andalusia solar + Ebro/Navarra wind ---
    {"site_id":"ES_Seville",         "lat":37.3891, "lon":-5.9845,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Merida",          "lat":38.9159, "lon":-6.3455,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Caceres",         "lat":39.4763, "lon":-6.3722,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Puertollano",     "lat":38.6871, "lon":-4.1073,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Talayuela",       "lat":39.9872, "lon":-5.6209,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Zaragoza",        "lat":41.6488, "lon":-0.8891,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Pamplona",        "lat":42.8125, "lon":-1.6458,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Burgos",          "lat":42.3439, "lon":-3.6969,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Almeria",         "lat":36.8340, "lon":-2.4637,  "elev_m":np.nan, "country":"ES"},
    {"site_id":"ES_Valencia",        "lat":39.4699, "lon":-0.3763,  "elev_m":np.nan, "country":"ES"},

    # --- Rest of Europe (24) — UK/NL/DK/IE/PT/IT/GR/PL/SE/NO/FI/BE/CZ/AT/HU/RO/BG/HR ---
    {"site_id":"UK_Grimsby",         "lat":53.5654, "lon":-0.0755,  "elev_m":np.nan, "country":"GB"},
    {"site_id":"UK_Hull",            "lat":53.7457, "lon":-0.3367,  "elev_m":np.nan, "country":"GB"},
    {"site_id":"UK_Aberdeen",        "lat":57.1497, "lon":-2.0943,  "elev_m":np.nan, "country":"GB"},
    {"site_id":"UK_Newcastle_Blyth", "lat":55.1272, "lon":-1.5086,  "elev_m":np.nan, "country":"GB"},
    {"site_id":"NL_Eemshaven",       "lat":53.4488, "lon": 6.8315,  "elev_m":np.nan, "country":"NL"},
    {"site_id":"NL_IJmuiden",        "lat":52.4600, "lon": 4.6129,  "elev_m":np.nan, "country":"NL"},
    {"site_id":"DK_Esbjerg",         "lat":55.4767, "lon": 8.4599,  "elev_m":np.nan, "country":"DK"},
    {"site_id":"IE_Galway",          "lat":53.2707, "lon":-9.0568,  "elev_m":np.nan, "country":"IE"},
    {"site_id":"PT_Evora",           "lat":38.5667, "lon":-7.9000,  "elev_m":np.nan, "country":"PT"},
    {"site_id":"PT_Sines",           "lat":37.9560, "lon":-8.8699,  "elev_m":np.nan, "country":"PT"},
    {"site_id":"IT_Catania",         "lat":37.5079, "lon":15.0830,  "elev_m":np.nan, "country":"IT"},
    {"site_id":"IT_MontaltoDiCastro","lat":42.3535, "lon":11.6082,  "elev_m":np.nan, "country":"IT"},
    {"site_id":"GR_Kozani",          "lat":40.3012, "lon":21.7870,  "elev_m":np.nan, "country":"GR"},
    {"site_id":"PL_Koszalin",        "lat":54.1949, "lon":16.1722,  "elev_m":np.nan, "country":"PL"},
    {"site_id":"SE_Pitea",           "lat":65.3172, "lon":21.4794,  "elev_m":np.nan, "country":"SE"},
    {"site_id":"NO_Stavanger",       "lat":58.9690, "lon": 5.7331,  "elev_m":np.nan, "country":"NO"},
    {"site_id":"FI_Oulu",            "lat":65.0121, "lon":25.4651,  "elev_m":np.nan, "country":"FI"},
    {"site_id":"BE_Zeebrugge",       "lat":51.3300, "lon": 3.2050,  "elev_m":np.nan, "country":"BE"},
    {"site_id":"RO_Constanta_Dobrogea","lat":44.1598,"lon":28.6348, "elev_m":np.nan, "country":"RO"},
    {"site_id":"BG_Kavarna",         "lat":43.4314, "lon":28.3412,  "elev_m":np.nan, "country":"BG"},
    {"site_id":"HR_Senj",            "lat":44.9890, "lon":14.9057,  "elev_m":np.nan, "country":"HR"},
    {"site_id":"CZ_Prague",          "lat":50.0755, "lon":14.4378,  "elev_m":np.nan, "country":"CZ"},
    {"site_id":"AT_Vienna",          "lat":48.2082, "lon":16.3738,  "elev_m":np.nan, "country":"AT"},
    {"site_id":"HU_Budapest",        "lat":47.4979, "lon":19.0402,  "elev_m":np.nan, "country":"HU"},

    # =========================
    # AMERICAS — 50 sites
    # =========================
    # --- United States (30) — Desert Southwest solar + Great Plains/TX/Midwest wind + offshore staging ---
    {"site_id":"US_CA_Mojave",           "lat":34.9086, "lon":-118.1737, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_CA_Tehachapi",        "lat":35.1322, "lon":-118.4486, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_CA_PalmSprings",      "lat":33.8303, "lon":-116.5453, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_CA_DesertCenter",     "lat":33.7111, "lon":-115.3300, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_CA_Blythe",           "lat":33.6103, "lon":-114.5974, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_NV_Tonopah",          "lat":38.0675, "lon":-117.2308, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_AZ_GilaBend",         "lat":32.9470, "lon":-112.7160, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_AZ_Yuma",             "lat":32.6927, "lon":-114.6277, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_NM_Albuquerque",      "lat":35.0844, "lon":-106.6504, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_TX_Roscoe",           "lat":32.4460, "lon":-100.5393, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_TX_Sweetwater",       "lat":32.4709, "lon":-100.4060, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_TX_Lubbock",          "lat":33.5779, "lon":-101.8552, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_TX_Midland",          "lat":31.9974, "lon":-102.0779, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_TX_Odessa",           "lat":31.8457, "lon":-102.3676, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_TX_CorpusChristi",    "lat":27.8006, "lon":-97.3964,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_OK_Woodward",         "lat":36.4337, "lon":-99.3904,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_KS_DodgeCity",        "lat":37.7528, "lon":-100.0171, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_IA_StormLake",        "lat":42.6447, "lon":-95.2080,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_SD_Huron",            "lat":44.3633, "lon":-98.2143,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_ND_Fargo",            "lat":46.8772, "lon":-96.7898,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_CO_Pueblo",           "lat":38.2544, "lon":-104.6091, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_WY_Rawlins",          "lat":41.7911, "lon":-107.2387, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_OR_Boardman",         "lat":45.8393, "lon":-119.7006, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_WA_Goldendale",       "lat":45.8207, "lon":-120.8215, "elev_m":np.nan, "country":"US"},
    {"site_id":"US_MN_BuffaloRidge",     "lat":44.3350, "lon":-96.0110,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_IL_Bloomington",      "lat":40.4842, "lon":-88.9937,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_NY_Buffalo",          "lat":42.8864, "lon":-78.8784,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_MA_NewBedford",       "lat":41.6362, "lon":-70.9342,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_FL_DeSoto",           "lat":27.2117, "lon":-81.8065,  "elev_m":np.nan, "country":"US"},
    {"site_id":"US_NC_ElizabethCity",    "lat":36.2946, "lon":-76.2510,  "elev_m":np.nan, "country":"US"},

    # --- Canada (5) — ON/QC wind, AB/SK wind, NS wind ---
    {"site_id":"CA_ChathamKent_ON",      "lat":42.4048, "lon":-82.1910,  "elev_m":np.nan, "country":"CA"},
    {"site_id":"CA_Gaspe_QC",            "lat":48.8351, "lon":-64.4819,  "elev_m":np.nan, "country":"CA"},
    {"site_id":"CA_PinchCreek_AB",       "lat":49.4827, "lon":-113.9496, "elev_m":np.nan, "country":"CA"},
    {"site_id":"CA_SwiftCurrent_SK",     "lat":50.2851, "lon":-107.8010, "elev_m":np.nan, "country":"CA"},
    {"site_id":"CA_Pictou_NS",           "lat":45.6780, "lon":-62.7113,  "elev_m":np.nan, "country":"CA"},

    # --- Latin America (15) — MX Oaxaca wind; BR NE wind/solar; CL Atacama solar; AR Patagonia wind; CO/PE/UY etc. ---
    {"site_id":"MX_LaVentosa_Oaxaca",    "lat":16.5850, "lon":-94.9840,  "elev_m":np.nan, "country":"MX"},
    {"site_id":"MX_PuertoPenasco_Son",   "lat":31.3267, "lon":-113.5312, "elev_m":np.nan, "country":"MX"},
    {"site_id":"BR_Caetite_Bahia",       "lat":-14.0703,"lon":-42.4815,  "elev_m":np.nan, "country":"BR"},
    {"site_id":"BR_CurraisNovos_RN",     "lat":-6.2548, "lon":-36.5147,  "elev_m":np.nan, "country":"BR"},
    {"site_id":"BR_Picos_Piaui",         "lat":-7.0773, "lon":-41.4670,  "elev_m":np.nan, "country":"BR"},
    {"site_id":"CL_Calama_Antofagasta",  "lat":-22.4540,"lon":-68.9290,  "elev_m":np.nan, "country":"CL"},
    {"site_id":"CL_Copiapó_Atacama",     "lat":-27.3668,"lon":-70.3322,  "elev_m":np.nan, "country":"CL"},
    {"site_id":"AR_ComodoroRivadavia",   "lat":-45.8641,"lon":-67.4805,  "elev_m":np.nan, "country":"AR"},
    {"site_id":"CO_Uribia_Guajira",      "lat":11.7150, "lon":-72.2650,  "elev_m":np.nan, "country":"CO"},
    {"site_id":"PE_Moquegua",            "lat":-17.1940,"lon":-70.9340,  "elev_m":np.nan, "country":"PE"},
    {"site_id":"UY_Tacuarembo",          "lat":-31.7167,"lon":-55.9833,  "elev_m":np.nan, "country":"UY"},
    {"site_id":"CR_Guanacaste",          "lat":10.2600, "lon":-85.5850,  "elev_m":np.nan, "country":"CR"},
    {"site_id":"DO_MonteCristi",         "lat":19.8488, "lon":-71.6450,  "elev_m":np.nan, "country":"DO"},
    {"site_id":"PA_Penonome",            "lat":8.5189,  "lon":-80.3563,  "elev_m":np.nan, "country":"PA"},
    {"site_id":"MX_Reynosa_Tamaulipas",  "lat":26.0500, "lon":-98.2833,  "elev_m":np.nan, "country":"MX"},

    # =========================
    # ASIA–PACIFIC — 50 sites
    # =========================
    # --- India (10) — Ultra-mega solar + TN/GJ wind belts ---
    {"site_id":"IN_Bhadla_Rajasthan",    "lat":27.5296, "lon": 71.9363,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Charanka_Gujarat",    "lat":23.9446, "lon": 71.1111,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Rewa_MadhyaPradesh",  "lat":24.5373, "lon": 81.3000,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Pavagada_Karnataka",  "lat":14.0980, "lon": 77.2591,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Kurnool_AP",          "lat":15.7016, "lon": 78.1048,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Kadapa_AP",           "lat":14.4674, "lon": 78.8241,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Kamuthi_TN",          "lat":9.3636,  "lon": 78.3864,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Jaisalmer_RJ",        "lat":26.9157, "lon": 70.9083,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Kayathar_TN",         "lat":9.0830,  "lon": 77.7740,  "elev_m":np.nan, "country":"IN"},
    {"site_id":"IN_Bhuj_Gujarat",        "lat":23.2419, "lon": 69.6669,  "elev_m":np.nan, "country":"IN"},

    # --- China (10) — NW wind/solar bases + N China wind corridors ---
    {"site_id":"CN_Hami_XJ",             "lat":42.8260, "lon": 93.5150,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Golmud_QH",           "lat":36.4167, "lon": 94.9167,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Delingha_QH",         "lat":37.3694, "lon": 97.3694,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Jiuquan_Gansu",       "lat":39.7320, "lon": 98.4939,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Zhangbei_Hebei",      "lat":41.1579, "lon":114.7150,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Dabancheng_Urumqi",   "lat":43.2000, "lon": 88.3167,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Yinchuan_Ningxia",    "lat":38.4872, "lon":106.2309,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Tongliao_IM",         "lat":43.6174, "lon":122.2650,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Cangzhou_HB",         "lat":38.3047, "lon":116.8387,  "elev_m":np.nan, "country":"CN"},
    {"site_id":"CN_Dezhou_SD",           "lat":37.4513, "lon":116.3105,  "elev_m":np.nan, "country":"CN"},

    # --- Japan (6) — wind (Hokkaido/Tohoku) + mega-solar in south ---
    {"site_id":"JP_Ishikari_Hokkaido",   "lat":43.2397, "lon":141.3539,  "elev_m":np.nan, "country":"JP"},
    {"site_id":"JP_Tsugaru_Aomori",      "lat":40.8040, "lon":140.4413,  "elev_m":np.nan, "country":"JP"},
    {"site_id":"JP_Noshiro_Akita",       "lat":40.2059, "lon":140.0230,  "elev_m":np.nan, "country":"JP"},
    {"site_id":"JP_Minamisoma_Fukushima","lat":37.6421, "lon":140.9576,  "elev_m":np.nan, "country":"JP"},
    {"site_id":"JP_Kagoshima",           "lat":31.5966, "lon":130.5571,  "elev_m":np.nan, "country":"JP"},
    {"site_id":"JP_Goto_Nagasaki",       "lat":32.6958, "lon":128.8400,  "elev_m":np.nan, "country":"JP"},

    # --- South Korea (4) — Jeolla/Jeju offshore/floating + Ulsan ---
    {"site_id":"KR_Jeju",                "lat":33.4996, "lon":126.5312,  "elev_m":np.nan, "country":"KR"},
    {"site_id":"KR_Sinan_Jeolla",        "lat":34.8262, "lon":125.9867,  "elev_m":np.nan, "country":"KR"},
    {"site_id":"KR_Saemangeum",          "lat":35.7614, "lon":126.6195,  "elev_m":np.nan, "country":"KR"},
    {"site_id":"KR_Ulsan",               "lat":35.5384, "lon":129.3114,  "elev_m":np.nan, "country":"KR"},

    # --- Australia (4) — SA wind + NSW solar + QLD wind + VIC wind ---
    {"site_id":"AU_Hornsdale_SA",        "lat":-33.1710, "lon":138.4820, "elev_m":np.nan, "country":"AU"},
    {"site_id":"AU_Nyngan_NSW",          "lat":-31.5633, "lon":147.1939, "elev_m":np.nan, "country":"AU"},
    {"site_id":"AU_CoopersGap_QLD",      "lat":-26.9330, "lon":151.3380, "elev_m":np.nan, "country":"AU"},
    {"site_id":"AU_Macarthur_VIC",       "lat":-38.0340, "lon":142.0030, "elev_m":np.nan, "country":"AU"},

    # --- ASEAN/Taiwan/NZ (16) — mix of wind/solar leaders ---
    {"site_id":"SG_Tengeh",              "lat": 1.3554, "lon":103.7036,  "elev_m":np.nan, "country":"SG"},
    {"site_id":"TH_NakhonRatchasima",    "lat":14.9799, "lon":102.0977,  "elev_m":np.nan, "country":"TH"},
    {"site_id":"VN_NinhThuan",           "lat":11.7167, "lon":108.8333,  "elev_m":np.nan, "country":"VN"},
    {"site_id":"VN_BacLieu",             "lat": 9.2861, "lon":105.7244,  "elev_m":np.nan, "country":"VN"},
    {"site_id":"PH_Ilocos_Burgos",       "lat":18.5143, "lon":120.6506,  "elev_m":np.nan, "country":"PH"},
    {"site_id":"PH_Bangui",              "lat":18.5403, "lon":120.7687,  "elev_m":np.nan, "country":"PH"},
    {"site_id":"ID_Sidrap_SulSel",       "lat":-3.9389, "lon":120.1244,  "elev_m":np.nan, "country":"ID"},
    {"site_id":"ID_Cirata_JawaBarat",    "lat":-6.7167, "lon":107.3333,  "elev_m":np.nan, "country":"ID"},
    {"site_id":"MY_Kedah_Kulim",         "lat": 5.3640, "lon":100.5610,  "elev_m":np.nan, "country":"MY"},
    {"site_id":"MY_Sabah_Kudat",         "lat": 6.8836, "lon":116.8451,  "elev_m":np.nan, "country":"MY"},
    {"site_id":"TW_Changhua",            "lat":24.0667, "lon":120.5333,  "elev_m":np.nan, "country":"TW"},
    {"site_id":"TW_Tainan",              "lat":23.0000, "lon":120.2000,  "elev_m":np.nan, "country":"TW"},
    {"site_id":"NZ_PalmerstonNorth",     "lat":-40.3523, "lon":175.6082, "elev_m":np.nan, "country":"NZ"},
    {"site_id":"NZ_Taranaki",            "lat":-39.0556, "lon":174.0742, "elev_m":np.nan, "country":"NZ"},
    {"site_id":"KH_KampongSpeu",         "lat":11.4530, "lon":104.5200,  "elev_m":np.nan, "country":"KH"},
    {"site_id":"MM_Minbu",               "lat":20.1833, "lon":94.8833,   "elev_m":np.nan, "country":"MM"},
]

# Sanity check: report how many sites were loaded and preview the first three entries.
print(f"Loaded {len(SITES)} sites into registry (from inline definition).")
print("Examples:", SITES[:3])


Loaded 154 sites into registry (from inline definition).
Examples: [{'site_id': 'DE_Cuxhaven', 'lat': 53.8619, 'lon': 8.6947, 'elev_m': nan, 'country': 'DE'}, {'site_id': 'DE_Bremerhaven', 'lat': 53.5396, 'lon': 8.5809, 'elev_m': nan, 'country': 'DE'}, {'site_id': 'DE_Emden', 'lat': 53.3674, 'lon': 7.2078, 'elev_m': nan, 'country': 'DE'}]


In [None]:
# ====================================================
# Cell 3 — Region mapping (EUROPE / AMERICAS / ASIA / AUSTRALIA)
# ====================================================

import pandas as pd  # ensure pandas is available

# Two-letter country codes grouped into reporting regions. "EU" is used loosely
# for Europe here: the set also holds non-EU members such as GB and NO.
EU_COUNTRIES = {
    "DE","FR","ES","PT","IT","GR","NL","DK","GB","IE","PL","SE","NO","FI","BE",
    "CZ","AT","HU","RO","BG","HR","LT","LV","EE","LU","SI","SK"
}
AMER_COUNTRIES = {"US","CA","MX","BR","CL","AR","CO","PE","UY","CR","DO","PA"}
AUSTRALIA_COUNTRIES = {"AU","NZ"}

def region_of_country(cc: str) -> str:
    """Map a two-letter country code to its region group.

    String input is stripped and upper-cased first, so ``"de"`` or ``" DE "``
    resolve like ``"DE"`` instead of silently falling through to the default.

    Args:
        cc: Two-letter country code. Non-string values (e.g. None/NaN from a
            DataFrame) are looked up unchanged.

    Returns:
        "EUROPE", "AMERICAS" or "AUSTRALIA" for codes in the respective sets;
        "ASIA" for everything else, including unknown codes.
    """
    code = cc.strip().upper() if isinstance(cc, str) else cc
    if code in EU_COUNTRIES:
        return "EUROPE"
    if code in AMER_COUNTRIES:
        return "AMERICAS"
    if code in AUSTRALIA_COUNTRIES:
        return "AUSTRALIA"
    # Catch-all: any code not listed above is treated as Asia.
    return "ASIA"

# Turn the Cell 2 site list into the REGISTRY frame and tag every site with
# its region group; a registry without country codes is rejected outright.
REGISTRY = pd.DataFrame(SITES).copy()
if "country" in REGISTRY.columns:
    REGISTRY["region_group"] = REGISTRY["country"].map(region_of_country)
else:
    raise RuntimeError("Each SITES entry must include 2-letter 'country'.")

# Quick look: number of sites per region, then the first few rows.
print(REGISTRY["region_group"].value_counts(dropna=False))
display(REGISTRY.head())


region_group
EUROPE       54
AMERICAS     50
ASIA         44
AUSTRALIA     6
Name: count, dtype: int64


Unnamed: 0,site_id,lat,lon,elev_m,country,region_group
0,DE_Cuxhaven,53.8619,8.6947,,DE,EUROPE
1,DE_Bremerhaven,53.5396,8.5809,,DE,EUROPE
2,DE_Emden,53.3674,7.2078,,DE,EUROPE
3,DE_Husum,54.4858,9.0526,,DE,EUROPE
4,DE_Wilhelmshaven,53.5219,8.1064,,DE,EUROPE


In [None]:
import warnings

# Silence DeprecationWarning output for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

def log(msg: str):
    """Print ``msg`` prefixed with the current UTC time as ``[HH:MM:SS]``.

    This redefines the logger from the config cell, so it keeps that
    version's behaviour of flushing stdout after every line. It uses
    ``datetime.timezone.utc`` because ``datetime.UTC`` only exists on
    Python 3.11+.
    """
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("[%H:%M:%S]")
    print(f"{ts} {msg}", flush=True)

# HTTP session with retries/backoff (handles 429, 5xx, and Retry-After)
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def _make_session():
    """Build the shared ``requests`` session used for all HTTP calls.

    The session retries GET/POST requests on 429 and 5xx responses with
    backoff, honours ``Retry-After`` headers, and sends a fixed User-Agent.
    The same adapter (16 pooled connections) serves both http and https.
    """
    retry_policy = Retry(
        total=8,
        connect=5,
        read=5,
        backoff_factor=1.7,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET", "POST"),
        raise_on_status=False,
        respect_retry_after_header=True,
    )
    shared_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=16,
        pool_maxsize=16,
    )

    session = requests.Session()
    for scheme in ("https://", "http://"):
        session.mount(scheme, shared_adapter)
    session.headers.update({"User-Agent": "capstone-renewables/1.0"})
    return session

# Module-wide session shared by the fetchers below.
HTTP = _make_session()

def _safe_json(r):
    ctype = r.headers.get("Content-Type","").lower()
    if "application/json" not in ctype:
        return None
    try:
        return r.json()
    except Exception:
        return None


In [None]:
# ====================================================
# Cell 5 — ERA5 fetcher (Open-Meteo), year chunks + CSV cache
# ====================================================
import time
import pandas as pd

# Open-Meteo archive endpoint queried by the ERA5 fetcher.
ERA5_ENDPOINT = "https://archive-api.open-meteo.com/v1/era5"

# Hourly variables requested per site; each becomes an "era5_<name>" column.
ERA5_HOURLY = [
    # near-surface state
    "temperature_2m",
    "relative_humidity_2m",
    "surface_pressure",
    "cloudcover",
    # wind at 10 m and 100 m
    "windspeed_10m",
    "winddirection_10m",
    "windspeed_100m",
    "winddirection_100m",
    # radiation components
    "shortwave_radiation",
    "direct_radiation",
    "diffuse_radiation",
    "direct_normal_irradiance",
]

def _era5_cache_path(lat, lon, y_start, y_end):
    """Return the CSV cache file under CACHE for one site and date chunk.

    The file name combines the coordinates (4 decimals) with the chunk's
    start and end; any ':' is replaced by '-' in the name.
    """
    filename = "{:.4f}_{:.4f}_{}_{}.csv".format(lat, lon, y_start, y_end)
    return CACHE / filename.replace(":", "-")

def _to_utc(s):
    return pd.to_datetime(s, utc=True)

def _hourly_index(start, end):
    return pd.date_range(start=_to_utc(start), end=_to_utc(end), freq="h", inclusive="left")

def _era5_request_hourly(params: dict, label: str, max_attempts: int = 6):
    """Request one ERA5 chunk and return its ``hourly`` mapping, or None.

    Retries with growing sleeps on request exceptions, 429, 5xx and non-JSON
    bodies. Gives up immediately on other 4xx responses and on JSON payloads
    without hourly data, since repeating the same request cannot fix those.
    """
    for attempt in range(1, max_attempts + 1):
        log(f"GET ERA5 {label} try={attempt}")
        try:
            r = HTTP.get(ERA5_ENDPOINT, params=params, timeout=90)
        except requests.RequestException as exc:
            # Timeouts / connection errors that survived the session-level retries.
            log(f"ERA5 {label}: request failed ({type(exc).__name__}) -> retrying")
            time.sleep(2.0 * attempt)
            continue

        if r.status_code == 429:
            ra = r.headers.get("Retry-After")
            sleep_s = int(ra) if (ra and str(ra).isdigit()) else int(2.0 * attempt)
            log(f"ERA5 429 -> sleeping {sleep_s}s")
            time.sleep(sleep_s)
            continue
        if r.status_code >= 500:
            time.sleep(2.0 * attempt)
            continue

        data = _safe_json(r)
        reason = data.get("reason") if isinstance(data, dict) else None

        if r.status_code >= 400:
            # Client error (e.g. bad coordinates or dates): not retryable.
            log(f"ERA5 {label}: HTTP {r.status_code} ({reason or 'no reason given'}) -> giving up")
            return None
        if data is None:
            time.sleep(1.5 * attempt)
            continue

        hourly = data.get("hourly") if isinstance(data, dict) else None
        if not isinstance(hourly, dict) or "time" not in hourly:
            log(f"ERA5 {label}: payload has no hourly data ({reason or 'unknown reason'}) -> giving up")
            return None
        return hourly

    return None


def fetch_era5(lat: float, lon: float, start: str, end: str) -> pd.DataFrame:
    """Fetch hourly ERA5 variables for one site, one calendar-year chunk at a time.

    Each chunk is cached as CSV under CACHE and reused on later calls. A chunk
    that cannot be fetched is logged and skipped rather than aborting the site.

    NOTE(review): the chunk dates sent to the API include the calendar date of
    ``end``, so the result can extend past an exclusive end timestamp; callers
    that need the half-open window ``[start, end)`` should trim or reindex.

    Args:
        lat, lon: Site coordinates in decimal degrees.
        start, end: Window bounds (ISO-8601 strings); only the dates are used.

    Returns:
        DataFrame with a UTC ``timestamp`` column plus one ``era5_<variable>``
        column per returned variable, de-duplicated and sorted by timestamp.
        Only the ``timestamp`` column (no rows) when nothing could be loaded.
    """
    start_d = pd.to_datetime(start).date()
    end_d   = pd.to_datetime(end).date()
    frames = []

    for y in range(start_d.year, end_d.year + 1):
        y_start = max(start_d, datetime.date(y, 1, 1))
        y_end   = min(end_d,   datetime.date(y, 12, 31))
        cp = _era5_cache_path(lat, lon, y_start, y_end)

        if cp.exists():
            try:
                dfx = pd.read_csv(cp, parse_dates=["timestamp"])
                dfx["timestamp"] = _to_utc(dfx["timestamp"])
            except (pd.errors.EmptyDataError, pd.errors.ParserError, ValueError) as exc:
                # A truncated or corrupt cache file must not block the run.
                log(f"ERA5 cache {cp.name} unreadable ({type(exc).__name__}) -> refetching")
            else:
                frames.append(dfx)
                continue

        params = {
            "latitude": lat, "longitude": lon,
            "start_date": str(y_start), "end_date": str(y_end),
            "hourly": ",".join(ERA5_HOURLY),
            "timezone": "UTC",
        }

        hourly = _era5_request_hourly(params, f"{y} {lat:.3f},{lon:.3f}")
        if hourly is None:
            # Nothing is appended for a failed year, so the final concat never
            # mixes typed frames with an untyped empty placeholder.
            log(f"ERA5 {y} {lat:.3f},{lon:.3f}: no usable data after retries -> skipping year")
            continue

        ts = pd.to_datetime(hourly["time"], utc=True)
        dfy = pd.DataFrame({"timestamp": ts})
        for v in ERA5_HOURLY:
            if v in hourly:
                dfy[f"era5_{v}"] = pd.to_numeric(hourly[v], errors="coerce")
        dfy = dfy.dropna(subset=["timestamp"]).drop_duplicates("timestamp").sort_values("timestamp")

        # Write via a temp file and rename, so an interrupted session cannot
        # leave a half-written cache file behind.
        tmp = cp.with_name(cp.name + ".tmp")
        dfy.to_csv(tmp, index=False)
        tmp.replace(cp)

        frames.append(dfy)
        time.sleep(0.4)  # polite gap between year chunks

    if not frames:
        return pd.DataFrame(columns=["timestamp"])
    out = pd.concat(frames, ignore_index=True)
    out = out.drop_duplicates("timestamp").sort_values("timestamp")
    return out


In [None]:
# ====================================================
# Cell 6 — Optional NSRDB fetch (US only; robust, proper params + diagnostics)
# ====================================================
import time
import pandas as pd

# Derive the NSRDB request years from the configured window, clipped to the
# supported range, instead of hard-coding years outside that range.
NSRDB_YEARS = [y for y in ALL_YEARS if NSRDB_MIN_YEAR <= y <= NSRDB_MAX_YEAR]

def _nsrdb_parse_csv(raw: str, out_cols) -> pd.DataFrame:
    """Parse one NSRDB CSV response body into a frame holding ``out_cols``.

    Raises:
        ValueError: with a short reason when the body is empty, cannot be
            parsed, or lacks the Year/Month/Day/Hour columns.
    """
    from io import StringIO

    lines = [ln for ln in (raw or "").splitlines() if ln and not ln.startswith('#')]
    if not lines:
        raise ValueError("empty body")

    # PSM3 CSV downloads put two site-metadata rows before the real column
    # header; start parsing at the row whose first field is "Year". If no such
    # row exists, fall back to treating the first row as the header.
    header_at = next(
        (i for i, ln in enumerate(lines)
         if ln.split(",", 1)[0].strip().strip('"') == "Year"),
        0,
    )
    try:
        dfi = pd.read_csv(StringIO("\n".join(lines[header_at:])))
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
        raise ValueError(f"unparseable CSV ({type(exc).__name__}) -> skipping")

    # Build timestamp (UTC)
    if {"Year", "Month", "Day", "Hour", "Minute"}.issubset(dfi.columns):
        parts = dict(year=dfi["Year"], month=dfi["Month"], day=dfi["Day"],
                     hour=dfi["Hour"], minute=dfi["Minute"])
    elif {"Year", "Month", "Day", "Hour"}.issubset(dfi.columns):
        parts = dict(year=dfi["Year"], month=dfi["Month"], day=dfi["Day"],
                     hour=dfi["Hour"])
    else:
        raise ValueError("unexpected header -> skipping")
    dfi["timestamp"] = pd.to_datetime(parts, utc=True)

    rename = {
        "GHI": "nsrdb_ghi", "DNI": "nsrdb_dni", "DHI": "nsrdb_dhi",
        "Air Temperature": "nsrdb_air_temperature",
        "Relative Humidity": "nsrdb_relative_humidity",
        "Wind Speed": "nsrdb_wind_speed",
    }
    # NOTE(review): PSM3 appears to label the air_temperature attribute
    # "Temperature" in its CSV header — accept that spelling as well.
    if "Air Temperature" not in dfi.columns:
        rename["Temperature"] = "nsrdb_air_temperature"
    dfi = dfi.rename(columns=rename)

    # reindex (rather than plain selection) so a missing attribute becomes a
    # NaN column instead of a KeyError.
    return dfi.reindex(columns=list(out_cols))


def fetch_nsrdb_range(lat: float, lon: float, years, api_key: str, email: str) -> pd.DataFrame:
    """
    Pull NSRDB PSM3 hourly data for 1–N years. Requires:
      - api_key: NREL_API_KEY
      - email:   NREL_EMAIL
    Uses global NREL_FULL_NAME / NREL_AFFILIATION / NREL_REASON if available.
    Years outside NSRDB_MIN_YEAR..NSRDB_MAX_YEAR are dropped before requesting.
    A year that fails (HTTP error, request exception, unusable CSV) is logged
    and skipped; credentials are never written to the log.
    Returns a DataFrame with timestamp (UTC) and nsrdb_* columns, or empty DF if unavailable.
    """
    out_cols = [
        "timestamp", "nsrdb_ghi", "nsrdb_dni", "nsrdb_dhi",
        "nsrdb_air_temperature", "nsrdb_relative_humidity", "nsrdb_wind_speed",
    ]
    empty = pd.DataFrame(columns=out_cols)

    if not api_key or not email:
        log("NSRDB: missing api_key or email -> skipping"); return empty

    # Clip years again here for safety (in case a caller passes future years)
    yrs = [y for y in (years or []) if NSRDB_MIN_YEAR <= int(y) <= NSRDB_MAX_YEAR]
    if not yrs:
        log(f"NSRDB: no supported years in requested window; "
            f"PSM3 supports {NSRDB_MIN_YEAR}-{NSRDB_MAX_YEAR}. Skipping.")
        return empty

    base = "https://developer.nrel.gov/api/nsrdb/v2/solar/psm3-download.csv"
    # Request fields that must never appear in notebook output.
    private_fields = ("api_key", "email", "full_name", "affiliation")
    frames = []
    for y in yrs:
        params = {
            # NOTE: NSRDB wants lon first in WKT POINT(lon lat)
            "wkt": f"POINT({lon} {lat})",
            "names": str(y),
            "interval": "60",
            "utc": "true",
            "leap_day": "true",
            "attributes": "ghi,dni,dhi,air_temperature,relative_humidity,wind_speed",
            # REQUIRED identity / purpose fields
            "api_key": api_key,
            "email": email,
            "full_name": (NREL_FULL_NAME or "Capstone User"),
            "affiliation": (NREL_AFFILIATION or "Walsh College"),
            "reason": (NREL_REASON or "research"),   # <-- use 'reason' (not reason_for_use)
            "mailing_list": "false",                 # <-- some servers require explicit flag
        }

        log(f"GET NSRDB[{y}] @ {lat:.4f},{lon:.4f}")
        try:
            r = HTTP.get(base, params=params, timeout=120)
        except requests.RequestException as exc:
            log(f"NSRDB {y} request failed ({type(exc).__name__}) -> skipping year")
            time.sleep(0.7)
            continue

        # Helpful diagnostics on failure
        if r.status_code >= 400:
            log(f"NSRDB {y} HTTP {r.status_code}")
            # r.url carries the API key and e-mail in its query string, so log
            # the endpoint plus the non-sensitive parameters instead.
            shown = {k: v for k, v in params.items() if k not in private_fields}
            log(f"NSRDB request (credentials omitted): {base} {shown}")
            body = (r.text or "")[:800]
            log(f"NSRDB body (truncated): {body}")
            time.sleep(0.7)
            continue

        try:
            dfi = _nsrdb_parse_csv(r.text, out_cols)
        except ValueError as exc:
            log(f"NSRDB {y} {exc}")
            continue

        frames.append(dfi)
        time.sleep(0.5)  # polite gap per year

    if not frames:
        log("NSRDB returned no usable data.")
        return empty

    out = (
        pd.concat(frames, ignore_index=True)
        .drop_duplicates("timestamp")
        .sort_values("timestamp")
    )
    return out


In [None]:
def season_from_month(m: int) -> str:
    """
    Translate a month number into a season label.

    Dec-Feb -> "winter", Mar-May -> "spring", Jun-Aug -> "summer",
    Sep-Nov -> "autumn". The function takes no location argument, so the same
    label is returned for every site. Anything that is not one of the twelve
    month numbers yields "unknown".
    """
    season_table = (
        ("winter", (12, 1, 2)),
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
        ("autumn", (9, 10, 11)),
    )
    for label, months in season_table:
        if m in months:
            return label
    return "unknown"

def safe_first(df: pd.DataFrame, prefer: str, fallback: str):
    """
    Pick the `prefer` column, filling its gaps from `fallback`.

    - both columns present: `prefer` with NaNs filled from `fallback`
    - only one present:     that column as-is
    - neither present:      an all-NaN Series on df's index
    """
    has_prefer = prefer in df.columns
    has_fallback = fallback in df.columns

    if not has_prefer:
        if has_fallback:
            return df[fallback]
        return pd.Series(np.nan, index=df.index)

    primary = df[prefer]
    if has_fallback:
        return primary.fillna(df[fallback])
    return primary

def col_or_nan(df: pd.DataFrame, name: str):
    """Return df[name] when the column exists, else an all-NaN Series on df's index."""
    if name not in df.columns:
        return pd.Series(np.nan, index=df.index)
    return df[name]

def pv_simple_estimate(wx: pd.DataFrame, capacity_kw_dc=1000.0,
                       noct=45.0, gamma=-0.004, derate=0.85):
    """
    Rough PV output estimate from irradiance and air temperature.

    Inputs: wx has 'timestamp', 'ghi' (W/m2), 'temp_air' (°C)
    Output columns: 'timestamp' (UTC), 'est_pv_power_kw_per_kwdc', 'target_solar_mw'

    Parameters
    ----------
    wx : DataFrame with the three input columns above.
    capacity_kw_dc : plant size; 'target_solar_mw' is the per-kW estimate
        multiplied by capacity_kw_dc / 1000.
    noct : value used in the cell-temperature term (noct - 20) / 800 * ghi
        (presumably the nominal operating cell temperature in °C).
    gamma : relative power change per degree of cell temperature above 25.
    derate : constant multiplier applied to the irradiance ratio ghi / 1000.

    The three model constants used to be hard-coded; their defaults reproduce
    the previous results exactly.

    Missing-data handling: negative GHI is clipped to 0, missing GHI is treated
    as 0 (so the estimate is 0, not NaN) and missing air temperature is
    replaced by 15.0.
    """
    out = pd.DataFrame({"timestamp": pd.to_datetime(wx["timestamp"], utc=True)})
    ghi = wx["ghi"].clip(lower=0).fillna(0.0)
    taa = wx["temp_air"].fillna(15.0)

    # Cell temperature rises linearly with irradiance above the air temperature
    t_cell = taa + (noct - 20.0) / 800.0 * ghi

    # Irradiance ratio x derate x linear temperature correction around 25 degrees
    pv_per_kw = (ghi / 1000.0) * derate * (1.0 + gamma * (t_cell - 25.0))
    # Very hot cells could push the correction negative; never report negative power
    pv_per_kw = pv_per_kw.clip(lower=0.0)

    out["est_pv_power_kw_per_kwdc"] = pv_per_kw
    out["target_solar_mw"] = pv_per_kw * (capacity_kw_dc / 1000.0)
    return out

def estimate_wind_power(df: pd.DataFrame, ws_col="met_wind_speed_100m", capacity_mw=2.0,
                        v_cut_in=3.0, v_rated=12.0, v_cut_out=25.0):
    """
    Simple piecewise power curve to estimate normalized wind power and MW output.

    Normalized power as a function of wind speed v:
      - v < v_cut_in or v >= v_cut_out : 0
      - v_cut_in <= v < v_rated        : ((v - v_cut_in) / (v_rated - v_cut_in)) ** 3
      - v_rated <= v < v_cut_out       : 1

    Parameters
    ----------
    df : input frame; it is not modified, a copy with two extra columns is returned.
    ws_col : name of the wind-speed column. Missing speeds are treated as 0.0,
        i.e. they produce zero power rather than NaN.
    capacity_mw : multiplier turning the normalized curve into 'target_wind_mw'.
    v_cut_in, v_rated, v_cut_out : curve break points; the defaults are the
        values that used to be hard-coded.

    Returns
    -------
    Copy of df with 'est_wind_power_norm' and 'target_wind_mw' added.

    Raises
    ------
    ValueError
        If the break points are not ordered v_cut_in < v_rated <= v_cut_out
        (the ramp would otherwise divide by zero or run backwards).
    """
    if not (v_cut_in < v_rated <= v_cut_out):
        raise ValueError(
            f"Expected v_cut_in < v_rated <= v_cut_out, got {v_cut_in}, {v_rated}, {v_cut_out}"
        )

    v = df[ws_col].fillna(0.0)

    # Vectorised curve: clipping the linear ramp to [0, 1] covers both the
    # below-cut-in (0) and at/above-rated (1) regions before cubing.
    ramp = ((v - v_cut_in) / (v_rated - v_cut_in)).clip(lower=0.0, upper=1.0) ** 3
    # At or beyond cut-out the turbine produces nothing.
    power_norm = ramp.where(v < v_cut_out, 0.0)

    out = df.copy()
    out["est_wind_power_norm"] = power_norm
    out["target_wind_mw"]      = capacity_mw * out["est_wind_power_norm"]
    return out


In [None]:
def hourly_index(start, end):
    return pd.date_range(start=pd.to_datetime(start, utc=True),
                         end=pd.to_datetime(end,   utc=True),
                         freq="h", inclusive="left")

def process_site(rec: dict, start=None, end=None, nrel_key=None):
    """
    Build the merged hourly dataset for one site and save it as CSV.

    Parameters
    ----------
    rec : dict
        Site record. 'site_id', 'lat' and 'lon' are required; 'country' is
        read with .get and may be absent.
    start, end :
        Range bounds passed to hourly_index and fetch_era5. Falsy values fall
        back to the module-level START / END.
    nrel_key :
        NSRDB API key. NSRDB is queried only when this is truthy and the
        record's country equals "US"; any NSRDB error is logged and skipped.

    Writes: OUT/<site_id>_merged.csv
    Returns: the DataFrame that was written.
    Raises: ValueError when the record has no truthy 'site_id'.
    """
    start = start or START
    end = end or END

    sid = rec.get("site_id")
    if not sid:
        raise ValueError(f"Missing site_id in record: {rec}")
    lat, lon = float(rec["lat"]), float(rec["lon"])
    country = rec.get("country")

    log(f"==== Processing {sid} ====")
    # Hourly skeleton: every data source below is left-joined onto it
    df = pd.DataFrame({"timestamp": hourly_index(start, end)})

    # ---- ERA5 ----
    era5_df = fetch_era5(lat, lon, start, end)
    if not era5_df.empty:
        era5_df = era5_df.drop_duplicates(subset=["timestamp"]).sort_values("timestamp")
        df = df.merge(era5_df, on="timestamp", how="left")
    log(f"[{sid}] ERA5 rows: {len(era5_df)}")

    # ---- NSRDB (optional, US only) ----
    if (country == "US") and nrel_key:
        try:
            nsrdb_df = fetch_nsrdb_range(lat, lon, NSRDB_YEARS, api_key=nrel_key, email=NREL_EMAIL)
            if nsrdb_df.empty:
                log(f"[{sid}] NSRDB: no data returned")
            else:
                nsrdb_df["timestamp"] = pd.to_datetime(nsrdb_df["timestamp"], utc=True)
                df = df.merge(nsrdb_df, on="timestamp", how="left")
                log(f"[{sid}] NSRDB rows: {len(nsrdb_df)}")
        except Exception as err:
            # Best effort: the site is still produced from ERA5 alone
            log(f"[{sid}] NSRDB skipped: {err}")

    # ---- IDs & static ----
    static_values = (("site_id", sid), ("lat", lat), ("lon", lon), ("country", country))
    for column, value in static_values:
        df[column] = value

    # ---- Time features ----
    ts = pd.DatetimeIndex(df["timestamp"])
    time_parts = (("year", "year"), ("month", "month"), ("day", "day"),
                  ("hour", "hour"), ("dow", "weekday"))
    for column, attribute in time_parts:
        df[column] = getattr(ts, attribute)
    df["season"] = pd.Categorical(list(map(season_from_month, df["month"])))

    # ---- Meteorology unify (prefer NSRDB when present) ----
    irradiance_sources = (
        ("met_ghi", "nsrdb_ghi", "era5_shortwave_radiation"),
        ("met_dni", "nsrdb_dni", "era5_direct_radiation"),
        ("met_dhi", "nsrdb_dhi", "era5_diffuse_radiation"),
    )
    for column, nsrdb_name, era5_name in irradiance_sources:
        df[column] = safe_first(df, nsrdb_name, era5_name)

    # These come from ERA5 only; a missing source column becomes all-NaN
    era5_only = (
        ("met_temp_c", "era5_temperature_2m"),
        ("met_rh_pct", "era5_relative_humidity_2m"),
        ("met_pressure_hpa", "era5_surface_pressure"),
        ("met_cloud_cover", "era5_cloudcover"),
        ("met_wind_speed_10m", "era5_windspeed_10m"),
        ("met_wind_direction_10m", "era5_winddirection_10m"),
    )
    for column, era5_name in era5_only:
        df[column] = col_or_nan(df, era5_name)

    if "era5_windspeed_100m" not in df.columns:
        # No 100 m wind available: scale the 10 m speed with a power law
        shear_exponent = 0.143
        df["met_wind_speed_100m"] = df["met_wind_speed_10m"] * (100.0/10.0) ** shear_exponent
    else:
        df["met_wind_speed_100m"] = df["era5_windspeed_100m"]
    df["met_wind_direction_100m"] = col_or_nan(df, "era5_winddirection_100m")

    # ---- Targets (wind) ----
    df = estimate_wind_power(df, ws_col="met_wind_speed_100m", capacity_mw=2.0)

    # ---- Targets (PV simple) ----
    pv_in = df[["timestamp", "met_ghi", "met_temp_c"]].rename(
        columns={"met_ghi": "ghi", "met_temp_c": "temp_air"}
    )
    try:
        pv_out = pv_simple_estimate(pv_in, capacity_kw_dc=1000.0)
        df = df.merge(pv_out, on="timestamp", how="left")
    except Exception as err:
        # Best effort: on failure the output simply lacks the PV columns
        log(f"[{sid}] PV estimate skipped: {err}")

    # ---- Finalize & write ----
    key_columns = ["site_id", "timestamp"]
    df = df.drop_duplicates(subset=key_columns)
    df = df.sort_values(key_columns)
    out_path = OUT / f"{sid}_merged.csv"
    df.to_csv(out_path, index=False)
    log(f"[{sid}] wrote -> {out_path} rows={len(df)}")
    return df


In [None]:
# ====================================================
# Cell 9 — Region runner + checkpoint + per-region merge
# ====================================================
import time
import pandas as pd
from pathlib import Path

def _ckpt_path(region: str) -> Path:
    """Return the path of the checkpoint file for `region` inside the CKPT directory."""
    filename = f"done_sites_{region}.json"
    return CKPT / filename

def load_done(region: str) -> set:
    """
    Load the set of already-processed site ids for `region`.

    Returns an empty set when no checkpoint file exists. An unreadable or
    malformed checkpoint is also treated as empty so the run can proceed, but
    a warning is printed: with an empty set every site of the region is
    processed again, which should not happen silently.
    """
    p = _ckpt_path(region)
    if not p.exists():
        return set()
    try:
        payload = json.loads(p.read_text())
        # save_done always writes a JSON list; anything else (e.g. a bare
        # string, which set() would split into characters) is corrupt.
        if not isinstance(payload, list):
            raise ValueError(f"expected a JSON list, got {type(payload).__name__}")
        return set(payload)
    except Exception as e:
        print(f"[ckpt] ignoring unreadable checkpoint {p.name}: {e}")
        return set()

def save_done(region: str, done_set: set):
    """Persist `done_set` for `region` as a sorted JSON list in its checkpoint file."""
    target = _ckpt_path(region)
    ordered_ids = sorted(done_set)
    target.write_text(json.dumps(ordered_ids))

def merge_region_outputs(region: str):
    """
    Concatenate the per-site CSVs of one region into ALL_SITES_merged_<REGION>.csv.

    Sites are taken from REGISTRY rows whose 'region_group' equals the
    upper-cased region name. A CSV in OUT belongs to the region when its stem,
    with "_merged" removed, is one of those site ids. Unreadable files are
    reported and skipped. The result is de-duplicated and sorted on
    (site_id, timestamp) and written to the current working directory.
    Returns None; prints a message instead when there is nothing to merge.
    """
    region = region.upper()
    in_region = REGISTRY["region_group"] == region
    site_ids = set(REGISTRY.loc[in_region, "site_id"].tolist())

    csv_paths = []
    for candidate in OUT.glob("*.csv"):
        if candidate.stem.replace("_merged","") in site_ids:
            csv_paths.append(candidate)
    if not csv_paths:
        print(f"[merge] No per-site CSVs found for region {region}.")
        return

    loaded = []
    for csv_path in csv_paths:
        try:
            site_frame = pd.read_csv(csv_path, parse_dates=["timestamp"])
            loaded.append(site_frame)
        except Exception as err:
            print(f"[merge] skipped {csv_path.name}: {err}")
    if not loaded:
        print(f"[merge] All files unreadable for region {region}.")
        return

    key_columns = ["site_id","timestamp"]
    merged = pd.concat(loaded, ignore_index=True)
    merged = merged.drop_duplicates(subset=key_columns)
    merged = merged.sort_values(key_columns)

    out_csv = Path(f"ALL_SITES_merged_{region}.csv")
    merged.to_csv(out_csv, index=False)
    print(f"[merge] Wrote {out_csv} shape={merged.shape}")

def process_region(region: str, start=None, end=None, nrel_key=None, per_site_sleep=PER_SITE_SLEEP_S, limit=None):
    """
    Process all sites in a region; writes per-site CSVs and ALL_SITES_merged_<REGION>.csv

    Parameters
    ----------
    region : region name; upper-cased and matched against REGISTRY['region_group'].
    start, end : forwarded to process_site (falling back to START / END).
    nrel_key : forwarded to process_site (falling back to NREL_API_KEY).
    per_site_sleep : pause in seconds after every site, successful or not.
        Falsy or non-positive values disable the pause.
    limit : if given, at most this many not-yet-done sites are processed.

    Sites recorded in the region's checkpoint are skipped. A site is added to
    the checkpoint only after process_site returns, so a failed site is tried
    again on the next call. Failures are logged and do not stop the loop.
    """
    region = region.upper()
    df_region = REGISTRY[REGISTRY["region_group"] == region].copy()
    if df_region.empty:
        print(f"[warn] No sites for region '{region}'"); return

    done = load_done(region)
    todo = [r for _, r in df_region.iterrows() if r["site_id"] not in done]
    if limit is not None:
        todo = todo[:int(limit)]

    print(f"=== REGION {region}: {len(todo)} to run, {len(done)} already done ===")

    failed = []
    for rec in todo:
        drec = dict(rec)
        sid = drec["site_id"]
        try:
            process_site(drec, start=start or START, end=end or END, nrel_key=nrel_key or NREL_API_KEY)
            done.add(sid)
            save_done(region, done)  # checkpoint after each site so a crash loses at most one
        except Exception as e:
            log(f"[{sid}] ERROR in {region}: {e}")
            failed.append(sid)
        # Throttle after failures too: an error is often a rate-limit response,
        # and firing the next request immediately would make it worse.
        if per_site_sleep and per_site_sleep > 0:
            time.sleep(per_site_sleep)

    merge_region_outputs(region)
    if failed:
        print(f"=== REGION {region}: {len(failed)} site(s) failed and will be retried on the next run: {failed} ===")
    print(f"=== REGION {region}: COMPLETE ===")


In [None]:
# Run one region at a time to avoid timeouts/rate-limits:
#
# process_region skips sites already listed in the region's checkpoint, and
# limit=5 caps each call at five of the remaining sites. Re-run this cell until
# it reports "0 to run" before moving on to the next region.

# Europe
process_region("EUROPE", per_site_sleep=PER_SITE_SLEEP_S, limit=5)

# Americas (run after Europe completes)
# process_region("AMERICAS", per_site_sleep=PER_SITE_SLEEP_S, limit=5)

# Asia (run after Americas completes)
# process_region("ASIA", per_site_sleep=PER_SITE_SLEEP_S, limit=5)

# Australia/NZ (run last)
# process_region("AUSTRALIA", per_site_sleep=PER_SITE_SLEEP_S, limit=5)


=== REGION EUROPE: 0 to run, 54 already done ===
[merge] Wrote ALL_SITES_merged_EUROPE.csv shape=(946080, 38)
=== REGION EUROPE: COMPLETE ===


In [None]:
from pathlib import Path
import pandas as pd

# Sanity check: list the per-site CSVs that the region run produced.
# NOTE(review): this rebinds the module-level OUT that process_site and
# merge_region_outputs read; presumably it matches the directory configured
# earlier in the notebook, confirm before re-running those functions.
OUT = Path("out")
site_files = sorted(OUT.glob("*_merged.csv"))
print(f"Per-site files found in {OUT.resolve()}: {len(site_files)}")
# Print at most the first 100 names to keep the cell output short
for p in site_files[:100]:
    print(" -", p.name)


Per-site files found in /content/out: 54
 - AT_Vienna_merged.csv
 - BE_Zeebrugge_merged.csv
 - BG_Kavarna_merged.csv
 - CZ_Prague_merged.csv
 - DE_Bayreuth_merged.csv
 - DE_Bremerhaven_merged.csv
 - DE_Cuxhaven_merged.csv
 - DE_Emden_merged.csv
 - DE_Freiburg_merged.csv
 - DE_Husum_merged.csv
 - DE_Ingolstadt_merged.csv
 - DE_Leipzig_merged.csv
 - DE_Sassnitz_Ruegen_merged.csv
 - DE_Wilhelmshaven_merged.csv
 - DK_Esbjerg_merged.csv
 - ES_Almeria_merged.csv
 - ES_Burgos_merged.csv
 - ES_Caceres_merged.csv
 - ES_Merida_merged.csv
 - ES_Pamplona_merged.csv
 - ES_Puertollano_merged.csv
 - ES_Seville_merged.csv
 - ES_Talayuela_merged.csv
 - ES_Valencia_merged.csv
 - ES_Zaragoza_merged.csv
 - FI_Oulu_merged.csv
 - FR_Bordeaux_merged.csv
 - FR_Cherbourg_merged.csv
 - FR_Dunkerque_merged.csv
 - FR_LeHavre_merged.csv
 - FR_Marseille_merged.csv
 - FR_Montpellier_merged.csv
 - FR_Narbonne_merged.csv
 - FR_Perpignan_merged.csv
 - FR_SaintNazaire_merged.csv
 - FR_Toulouse_merged.csv
 - GR_Kozani_me

In [None]:
from pathlib import Path
import shutil, os

# Persist results to Google Drive so they outlive the Colab runtime.
# Requires Drive to be mounted at /content/drive.
drive_dir = Path("/content/drive/MyDrive/Colab Notebooks/renewables_outputs")
drive_dir.mkdir(parents=True, exist_ok=True)

# Copy region file (path is relative to the current working directory;
# only the EUROPE merge is copied, other regions need their own line)
shutil.copy2("ALL_SITES_merged_EUROPE.csv", drive_dir / "ALL_SITES_merged_EUROPE.csv")

# Copy all per-site files (careful: 54 files)
# The pattern is "*.csv", so every CSV in out/ is copied, not only *_merged.csv;
# files with the same name already in drive_dir are overwritten.
for p in Path("out").glob("*.csv"):
    shutil.copy2(p, drive_dir / p.name)

print("Copied to:", drive_dir)


Copied to: /content/drive/MyDrive/Colab Notebooks/renewables_outputs
