<a href="https://colab.research.google.com/github/simulate111/Climatic_Data/blob/main/NOAA_Global_Forecast_System_(GFS)_Amazon_Web_Services_(AWS)_S3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install herbie-data
# You also need these for GFS GRIB2 data processing
!pip install xarray cfgrib

Collecting herbie-data
  Downloading herbie_data-2025.12.0-py3-none-any.whl.metadata (17 kB)
Collecting cfgrib>=0.9.15 (from herbie-data)
  Downloading cfgrib-0.9.15.1-py3-none-any.whl.metadata (56 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.1/56.1 kB 4.2 MB/s eta 0:00:00
Collecting eccodes>=2.37.0 (from herbie-data)
  Downloading eccodes-2.44.0-py3-none-any.whl.metadata (15 kB)
Collecting findlibs (from eccodes>=2.37.0->herbie-data)
  Downloading findlibs-0.1.2-py3-none-any.whl.metadata (4.5 kB)
Collecting eccodeslib (from eccodes>=2.37.0->herbie-data)
  Downloading eccodeslib-2.44.1.8-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Collecting eckitlib==

In [12]:
!pip install tqdm



In [None]:
import pandas as pd
import numpy as np
import xarray as xr
import warnings
import shutil
from herbie import Herbie
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # Progress bar library

# Suppress every Python warning so library chatter does not flood the cell output.
warnings.filterwarnings("ignore")

# Locations to sample, keyed by city name. Each value carries the "lat"/"lon"
# pair that fetch_hour passes to the nearest-grid-point lookup.
cities = {
    city: {"lat": lat, "lon": lon}
    for city, lat, lon in (
        ("Turku", 60.45, 22.26),
        ("Copenhagen", 55.67, 12.56),
        ("Stockholm", 59.32, 18.06),
        ("Oslo", 59.91, 10.75),
    )
}

# Size of the thread pool that runs fetch_hour in parallel.
MAX_WORKERS = 30

def _city_record(ds, name, coords, solar_var):
    """Build the output row for one city from the grid point nearest to it."""
    # Negative longitudes are shifted into the 0-360 range before the lookup.
    lon_360 = coords['lon'] if coords['lon'] >= 0 else 360 + coords['lon']
    p = ds.sel(latitude=coords['lat'], longitude=lon_360, method="nearest")

    u, v = float(p['u10'].values), float(p['v10'].values)
    return {
        "City": name,
        "Timestamp": p.valid_time.values,
        "GHI_Wm2": round(float(p[solar_var].values), 2),
        # 273.15 offset: Kelvin -> Celsius, matching the Temp_C column name.
        "Temp_C": round(float(p['t2m'].values) - 273.15, 2),
        # Wind speed is the magnitude of the (u, v) component vector.
        "Wind_ms": round(np.sqrt(u**2 + v**2), 2)
    }


def _remove_downloads(H):
    """Best-effort removal of the directory that holds ``H``'s local file.

    Never raises: a cleanup problem must not discard data that was already
    extracted by the caller.
    """
    # NOTE(review): the directory removed is the *parent* of the local file.
    # If several concurrent tasks store their files in that same directory,
    # this can delete files another worker is still reading -- confirm against
    # Herbie's save-directory layout.
    try:
        shutil.rmtree(H.get_local_path().parent, ignore_errors=True)
    except Exception:
        pass


def fetch_hour(timestamp_tuple):
    """Download one GFS forecast hour and extract point values for every city.

    Parameters
    ----------
    timestamp_tuple : tuple
        ``(date_str, fxx)``: the model-run date string and the forecast hour,
        both forwarded unchanged to ``Herbie``.

    Returns
    -------
    list of dict or None
        One record per entry of the module-level ``cities`` mapping, with the
        keys ``City``, ``Timestamp``, ``GHI_Wm2``, ``Temp_C`` and ``Wind_ms``.
        ``None`` when the download or the extraction raised. The error is
        swallowed on purpose so that a single bad hour cannot abort a whole
        batch; callers are expected to filter ``None`` results out.

    Notes
    -----
    Downloaded files are removed whether or not the extraction succeeded, and
    a failure during that cleanup no longer turns a successful extraction
    into ``None``.
    """
    date_str, fxx = timestamp_tuple
    H = None
    try:
        H = Herbie(date_str, model="gfs", product="pgrb2.0p25", fxx=fxx, verbose=False)
        # 2 m temperature, 10 m wind components and surface short-wave radiation.
        search_str = ":TMP:2 m above ground|:UGRD:10 m above ground|:VGRD:10 m above ground|:DSWRF:surface"
        ds_list = H.xarray(search_str)

        # The result may be a list of datasets; collapse it into a single one.
        ds = xr.merge(ds_list, compat='override') if isinstance(ds_list, list) else ds_list
        # The radiation variable is looked up under either of two names.
        solar_var = 'sdswrf' if 'sdswrf' in ds.data_vars else 'dswrf'

        return [
            _city_record(ds, name, coords, solar_var)
            for name, coords in cities.items()
        ]
    except Exception:
        # Deliberate best-effort: report the failed hour as None.
        return None
    finally:
        # Runs after the return value has been computed, on success and failure alike.
        if H is not None:
            _remove_downloads(H)

if __name__ == "__main__":
    tasks = []
    start_date = datetime(2024, 1, 1)
    for day in range(366):
        d_str = (start_date + timedelta(days=day)).strftime("%Y-%m-%d 00:00")
        for f in range(1, 25):
            tasks.append((d_str, f))

    print(f"üî• Starting MAX SPEED extraction (30 workers)")

    # Wrap the executor in tqdm to monitor progress
    all_extracted_data = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # We use list(tqdm(executor.map(...))) to trigger the bar
        results = list(tqdm(executor.map(fetch_hour, tasks), total=len(tasks), desc="Downloading 2024 GFS"))

    print("\n‚úÖ Download complete. Flattening and saving files...")

    flat_results = [item for sublist in results if sublist for item in sublist]
    df_main = pd.DataFrame(flat_results)

    for name in cities:
        city_df = df_main[df_main['City'] == name].sort_values('Timestamp')
        city_df.to_csv(f"{name.lower()}_gfs_2024.csv", index=False)
        print(f"üíæ Saved: {name.lower()}_gfs_2024.csv")

🔥 Starting MAX SPEED extraction (30 workers)


Downloading 2024 GFS:   2%|▏         | 194/8784 [01:30<1:02:58,  2.27it/s]