<a href="https://colab.research.google.com/github/redaxe101/MastersThesisNotebook/blob/main/FetchWeatherForecasts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Weather Forecasts Fetch

Fetches historical weather forecasts for the capital cities from open-meteo.com, using one forecast run every 12 hours.

It then builds 16-hour forecast sequences at 30-minute intervals and saves them all to a CSV cache, so the data does not have to be fetched from open-meteo again on later runs.

Mark Sinclair, 2025

In [None]:
import pandas as pd
import os
import requests
from datetime import timedelta
from google.colab import drive
# The forecast cache CSVs are read from and written to /content/drive/MyDrive/NEM,
# so Google Drive has to be mounted before the fetch function is called.
drive.mount('/content/drive')


def _fetch_hourly_forecast(forecast_time, lat, lon):
    """Request the hourly forecast for the run at `forecast_time` and return it as a DataFrame."""
    # Imported here so that cache-only calls never need the HTTP library.
    import requests

    url = "https://historical-forecast-api.open-meteo.com/v1/forecast"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": forecast_time.strftime("%Y-%m-%d"),
        "end_date": (forecast_time + timedelta(days=2)).strftime("%Y-%m-%d"),
        "hourly": "temperature_2m,cloudcover,relative_humidity_2m,windspeed_10m",
        "forecast_hour": forecast_time.strftime("%H"),
        "timezone": "Australia/Brisbane",
    }
    # A timeout keeps one stalled connection from hanging a multi-month backfill.
    response = requests.get(url, params=params, timeout=60)
    response.raise_for_status()
    return pd.DataFrame(response.json()["hourly"])


def _build_rolling_blocks(
    hourly_df,
    forecast_time,
    forecast_frequency_hours,
    output_frequency_minutes,
    forecast_horizon_hours,
):
    """Turn one hourly forecast into a list of per-sub-run frames with RUN_DATETIME/DATETIME columns."""
    step = f"{output_frequency_minutes}min"

    df = hourly_df.copy()
    df["time"] = pd.to_datetime(df["time"])
    # Keep enough data for the last sub-run of this forecast to still span a full horizon.
    cutoff = forecast_time + timedelta(hours=forecast_horizon_hours + forecast_frequency_hours)
    df = df[df["time"] < cutoff].set_index("time")
    if df.empty:
        return []

    df = df.infer_objects(copy=False)
    df = df.resample(step).interpolate("linear").ffill().bfill()

    # Rows are relabelled by position, starting at forecast_time, rather than by the
    # timestamps the API returned.
    # NOTE(review): this assumes the response's first row corresponds to forecast_time
    # (i.e. that `forecast_hour` shifts the start of the response) — TODO confirm against
    # the API, otherwise 12:00 runs are labelled with data that starts at 00:00.
    df.index = pd.date_range(start=forecast_time, periods=len(df), freq=step, name="DATETIME")

    last_sub_run = (
        forecast_time
        + timedelta(hours=forecast_frequency_hours)
        - timedelta(minutes=output_frequency_minutes)
    )
    blocks = []
    for sub_run_time in pd.date_range(start=forecast_time, end=last_sub_run, freq=step):
        block_end = sub_run_time + timedelta(hours=forecast_horizon_hours)
        # .loc slicing is end-inclusive; stepping back one minute makes the horizon half-open.
        block = df.loc[sub_run_time:block_end - timedelta(minutes=1)].reset_index()
        block["RUN_DATETIME"] = sub_run_time
        blocks.append(block)
    return blocks


def _merge_into_cache(existing_df, new_blocks, cache_path):
    """Merge freshly built blocks into the cached frame, write the CSV, and return (new_df, combined_df)."""
    new_df = pd.concat(new_blocks).set_index(["RUN_DATETIME", "DATETIME"])
    combined_df = new_df if existing_df.empty else pd.concat([existing_df, new_df])
    # Newly fetched rows win over cached rows with the same (RUN_DATETIME, DATETIME) key.
    combined_df = combined_df[~combined_df.index.duplicated(keep="last")].sort_index()
    combined_df.to_csv(cache_path)
    return new_df, combined_df


def fetch_forecast_rolling_windows_every_30m(
    start_date,
    end_date,
    region,
    lat,
    lon,
    forecast_frequency_hours=12,
    output_frequency_minutes=30,
    forecast_horizon_hours=16,
    cache_path=None,
):
    """
    Fetch historical forecasts and expand them into rolling forecast windows.

    A forecast is requested every `forecast_frequency_hours` between `start_date` and
    `end_date`. Each one is resampled to `output_frequency_minutes` and sliced into
    sub-runs, one per output step, each covering `forecast_horizon_hours`. Results are
    merged into a CSV cache; forecast runs whose RUN_DATETIME is already in the cache
    are not fetched again.

    Args:
        start_date: First forecast run time (anything `pd.to_datetime` accepts).
        end_date: End of the period; runs are scheduled up to one forecast interval past it.
        region: Region label, used to build the default cache file name.
        lat: Latitude sent to the API.
        lon: Longitude sent to the API.
        forecast_frequency_hours: Hours between fetched forecast runs.
        output_frequency_minutes: Spacing of the output rows and of the sub-runs.
        forecast_horizon_hours: Length of each sub-run's forecast window.
        cache_path: CSV cache location. Defaults to the per-region file on the mounted
            Google Drive.

    Returns:
        DataFrame with RUN_DATETIME, DATETIME and the weather variables: the merged
        cache if anything new was fetched, otherwise the existing cache.

    Raises:
        KeyboardInterrupt: Re-raised after the forecasts fetched so far have been saved.
    """
    if cache_path is None:
        cache_path = f"/content/drive/MyDrive/NEM/historical_forecast_cache_{region}.csv"
    start_dt = pd.to_datetime(start_date)
    end_dt = pd.to_datetime(end_date)

    if os.path.exists(cache_path):
        existing_df = pd.read_csv(cache_path, parse_dates=["RUN_DATETIME", "DATETIME"])
        existing_df.set_index(["RUN_DATETIME", "DATETIME"], inplace=True)
        print(f"📦 Loaded existing cache: {cache_path} ({len(existing_df)} rows)")
    else:
        existing_df = pd.DataFrame()
        print("📂 No existing cache found, starting fresh.")

    # Build the membership set once instead of re-deriving it from the index per iteration.
    cached_runs = (
        set() if existing_df.empty
        else set(existing_df.index.get_level_values("RUN_DATETIME"))
    )

    forecast_times = pd.date_range(
        start=start_dt,
        end=end_dt + timedelta(hours=forecast_frequency_hours),
        freq=f"{forecast_frequency_hours}h",
    )

    all_records = []
    try:
        for forecast_time in forecast_times:
            if forecast_time in cached_runs:
                print(f"⏩ Skipping {forecast_time} (already cached)")
                continue

            print(f"📡 Fetching forecast issued at {forecast_time}")
            try:
                hourly_df = _fetch_hourly_forecast(forecast_time, lat, lon)
            except Exception as e:
                # Best-effort: one failed run is reported and skipped, not fatal.
                print(f"⚠️ Failed to fetch forecast for {forecast_time}: {e}")
                continue

            # All sub-runs of a forecast are added together, so an interrupted run never
            # leaves a half-built forecast that the cache check would later skip.
            all_records.extend(
                _build_rolling_blocks(
                    hourly_df,
                    forecast_time,
                    forecast_frequency_hours,
                    output_frequency_minutes,
                    forecast_horizon_hours,
                )
            )
    except KeyboardInterrupt:
        # Persist what was fetched so a manual stop does not throw the work away.
        if all_records:
            new_df, _ = _merge_into_cache(existing_df, all_records, cache_path)
            print(f"💾 Interrupted: saved {len(new_df)} new rows to {region} cache.")
        raise

    if all_records:
        new_df, combined_df = _merge_into_cache(existing_df, all_records, cache_path)
        print(f"✅ Updated {region} cache with {len(new_df)} new rows. Total rows: {len(combined_df)}")
        return combined_df.reset_index()

    print("✅ No new data fetched. Returning existing cache.")
    return existing_df.reset_index()

# (latitude, longitude) of the forecast location used for each region.
REGION_COORDINATES = {
    "NSW1": (-33.8148, 151.0017),
    "QLD1": (-27.4705, 153.0251),
    "VIC1": (-37.8136, 144.9631),
    "TAS1": (-42.8829, 147.3272),
    "SA1": (-34.9285, 138.5999),
}

# Build or refresh the forecast cache for each region in turn.
for region in ["NSW1", "QLD1", "VIC1"]:
    lat, lon = REGION_COORDINATES[region]
    df_forecasts = fetch_forecast_rolling_windows_every_30m(
        start_date="2022-11-01",
        end_date="2023-09-03",
        region=region,
        lat=lat,
        lon=lon,
    )


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📂 No existing cache found, starting fresh.
📡 Fetching forecast issued at 2022-11-01 00:00:00
📡 Fetching forecast issued at 2022-11-01 12:00:00
📡 Fetching forecast issued at 2022-11-02 00:00:00
📡 Fetching forecast issued at 2022-11-02 12:00:00
📡 Fetching forecast issued at 2022-11-03 00:00:00
📡 Fetching forecast issued at 2022-11-03 12:00:00
📡 Fetching forecast issued at 2022-11-04 00:00:00
📡 Fetching forecast issued at 2022-11-04 12:00:00
📡 Fetching forecast issued at 2022-11-05 00:00:00
📡 Fetching forecast issued at 2022-11-05 12:00:00
📡 Fetching forecast issued at 2022-11-06 00:00:00
📡 Fetching forecast issued at 2022-11-06 12:00:00
📡 Fetching forecast issued at 2022-11-07 00:00:00
📡 Fetching forecast issued at 2022-11-07 12:00:00
📡 Fetching forecast issued at 2022-11-08 00:00:00
📡 Fetching forecast issued at 2022-11-08 12:00:00
📡 Fetching forecast issued a

KeyboardInterrupt: 