#Download AEMO Historical Data 

In [0]:


import requests
import pandas as pd
from io import StringIO
from datetime import datetime
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
from pyspark.sql.functions import *

spark.sql("USE energy_analytics")


base_url = "https://aemo.com.au/aemo/data/nem/priceanddemand"
region = "VIC1"
MONTHS = 36

end_date = datetime.now()
months = []
for i in range(MONTHS):
    month_date = end_date - relativedelta(months=i)
    months.append(month_date.strftime("%Y%m"))
months = sorted(list(set(months)))

all_data = []
for idx, month in enumerate(months, 1):
    try:
        url = f"{base_url}/PRICE_AND_DEMAND_{month}_{region}.csv"
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        df_csv = pd.read_csv(StringIO(response.text))
        all_data.append(df_csv)
        print(f"[{idx}/{len(months)}] {month}: {len(df_csv):,} records")
    except:
        print(f"[{idx}/{len(months)}] {month}: Failed")

combined = pd.concat(all_data, ignore_index=True)
spark.createDataFrame(combined).write.mode("overwrite").saveAsTable("bronze_aemo_extended_prices")

print(f"\n✅ {len(combined):,} records saved")
print(f"   {combined['SETTLEMENTDATE'].min()} to {combined['SETTLEMENTDATE'].max()}")




[1/36] 202303: 8,928 records
[2/36] 202304: 8,640 records
[3/36] 202305: 8,928 records
[4/36] 202306: 8,640 records
[5/36] 202307: 8,928 records
[6/36] 202308: 8,928 records
[7/36] 202309: 8,640 records
[8/36] 202310: 8,928 records
[9/36] 202311: 8,640 records
[10/36] 202312: 8,928 records
[11/36] 202401: 8,928 records
[12/36] 202402: 8,352 records
[13/36] 202403: 8,928 records
[14/36] 202404: 8,640 records
[15/36] 202405: 8,928 records
[16/36] 202406: 8,640 records
[17/36] 202407: 8,928 records
[18/36] 202408: 8,928 records
[19/36] 202409: 8,640 records
[20/36] 202410: 8,928 records
[21/36] 202411: 8,640 records
[22/36] 202412: 8,928 records
[23/36] 202501: 8,928 records
[24/36] 202502: 8,064 records
[25/36] 202503: 8,928 records
[26/36] 202504: 8,640 records
[27/36] 202505: 8,928 records
[28/36] 202506: 8,640 records
[29/36] 202507: 8,928 records
[30/36] 202508: 8,928 records
[31/36] 202509: 8,640 records
[32/36] 202510: 8,928 records
[33/36] 202511: 8,640 records
[34/36] 202512: 8,9

# Historical weather data

In [0]:

lat, lon = -37.8136, 144.9631


weather_data = []

for year_offset in range(3):
    start = (datetime.now() - relativedelta(years=year_offset+1)).strftime('%Y-%m-%d')
    end = (datetime.now() - relativedelta(years=year_offset)).strftime('%Y-%m-%d')
    
    url = f"https://archive-api.open-meteo.com/v1/archive"
    params = {
        'latitude': lat,
        'longitude': lon,
        'start_date': start,
        'end_date': end,
        'hourly': 'temperature_2m,relative_humidity_2m,precipitation,wind_speed_10m,wind_direction_10m,cloud_cover,surface_pressure,shortwave_radiation',
        'timezone': 'Australia/Melbourne'
    }
    
    response = requests.get(url, params=params, timeout=60)
    data = response.json()
    
    if 'hourly' in data:
        df_chunk = pd.DataFrame({
            'datetime': pd.to_datetime(data['hourly']['time']),
            'temperature': data['hourly']['temperature_2m'],
            'humidity': data['hourly']['relative_humidity_2m'],
            'precipitation': data['hourly']['precipitation'],
            'wind_speed': data['hourly']['wind_speed_10m'],
            'wind_direction': data['hourly']['wind_direction_10m'],
            'cloud_cover': data['hourly']['cloud_cover'],
            'pressure': data['hourly']['surface_pressure'],
            'solar_radiation': data['hourly']['shortwave_radiation']
        })
        weather_data.append(df_chunk)
        print(f"Year {year_offset+1}: {len(df_chunk):,} records")

weather_combined = pd.concat(weather_data, ignore_index=True).sort_values('datetime').reset_index(drop=True)
spark.createDataFrame(weather_combined).write.mode("overwrite").saveAsTable("bronze_weather_extended_melbourne")

print(f"\n✅ {len(weather_combined):,} records saved")
print(f"   {weather_combined['datetime'].min()} to {weather_combined['datetime'].max()}")

Year 1: 8,784 records
Year 2: 8,808 records
Year 3: 8,784 records

✅ 26,376 records saved
   2023-02-21 00:00:00 to 2026-02-21 23:00:00


#Historical Generation Data

In [0]:





API_KEY = "oe_4o71XjvYkL2qgk5fsKYCxW"
BASE_URL = "https://api.openelectricity.org.au/v4"
HEADERS = {'Authorization': f'Bearer {API_KEY}'}
TARGET_REGION = "VIC1"
NETWORK = "NEM"



def generate_chunks(start_str, end_str, chunk_days=30):
    start = datetime.fromisoformat(start_str)
    end   = datetime.fromisoformat(end_str)
    chunks = []
    current = start
    while current < end:
        candidate = current + timedelta(days=chunk_days)
        chunk_end = candidate if candidate < end else end
        chunks.append((current.isoformat(), chunk_end.isoformat()))
        current = chunk_end + timedelta(seconds=1)
    return chunks



def parse_response(data, target_region=None):

    rows = []
    for block in data.get('data', []):
        metric   = block.get('metric')
        unit     = block.get('unit')
        interval = block.get('interval')

        for result in block.get('results', []):
            columns = result.get('columns', {})
            region      = columns.get('region')
            fueltech    = columns.get('fueltech_group')

        
            if target_region and region != target_region:
                continue

            for point in result.get('data', []):
                if point and len(point) == 2:
                    rows.append({
                        'timestamp':     point[0],
                        'energy_mwh':    point[1],
                        'network_region': region,
                        'fueltech_group': fueltech,
                        'metric':        metric,
                        'unit':          unit,
                        'interval':      interval,
                    })
    return rows



full_ranges = [
    ("2023-03-01T00:00:00", "2023-12-31T23:59:59"),
    ("2024-01-01T00:00:00", "2024-12-31T23:59:59"),
    ("2025-01-01T00:00:00", "2025-12-31T23:59:59"),
    ("2026-01-01T00:00:00", "2026-02-21T23:59:59"),
]

all_chunks = []
for start, end in full_ranges:
    all_chunks.extend(generate_chunks(start, end, chunk_days=30))

print(f"Total chunks to fetch: {len(all_chunks)}")



all_rows = []
errors   = []

for i, (start, end) in enumerate(all_chunks):
    params = {
        'metrics':            'energy',
        'date_start':         start,
        'date_end':           end,
        'interval':           '1h',
        'primary_grouping':   'network_region',
        'secondary_grouping': 'fueltech_group',
    }

    print(f"[{i+1:02d}/{len(all_chunks)}] {start[:10]} → {end[:10]}...", end=" ")

    try:
        response = requests.get(
            f"{BASE_URL}/data/network/{NETWORK}",
            params=params,
            headers=HEADERS,
            timeout=60
        )

        if response.status_code == 200:
            data = response.json()
            rows = parse_response(data, target_region=TARGET_REGION)
            all_rows.extend(rows)
            print(f"✅ {len(rows):,} rows")
        else:
            msg = f"HTTP {response.status_code}: {response.text[:200]}"
            print(f"❌ {msg}")
            errors.append((start, end, msg))

    except Exception as e:
        msg = str(e)
        print(f"❌ Exception: {msg}")
        errors.append((start, end, msg))


print("\n" + "=" * 60)
print(f"✅ Total rows collected : {len(all_rows):,}")
print(f"❌ Chunks failed        : {len(errors)}")

if errors:
    print("\nFailed chunks:")
    for s, e, msg in errors:
        print(f"  {s[:10]} → {e[:10]} | {msg}")



if all_rows:
    df_gen = pd.DataFrame(all_rows)


    df_gen['timestamp'] = pd.to_datetime(df_gen['timestamp'], utc=True)
    df_gen = df_gen.sort_values(['timestamp', 'fueltech_group']).reset_index(drop=True)


    df_gen = df_gen.drop_duplicates(subset=['timestamp', 'network_region', 'fueltech_group'])

    print(f"Total rows    : {len(df_gen):,}")
    print(f"Date range    : {df_gen['timestamp'].min()} → {df_gen['timestamp'].max()}")
    print(f"Columns       : {list(df_gen.columns)}")
    print(f"Fueltechs     : {sorted(df_gen['fueltech_group'].unique().tolist())}")
    print(f"Regions       : {df_gen['network_region'].unique().tolist()}")

    display(df_gen.head(20))

   
    spark.createDataFrame(df_gen).write.mode("overwrite").saveAsTable("bronze_generation_extended")
    print(f"\n✅ Saved {len(df_gen):,} rows to bronze_generation_extended")

else:
    print("\n❌ No data collected — check errors above")

Total chunks to fetch: 39
[01/39] 2023-03-01 → 2023-03-31... ✅ 5,040 rows
[02/39] 2023-03-31 → 2023-04-30... ✅ 5,040 rows
[03/39] 2023-04-30 → 2023-05-30... ✅ 5,040 rows
[04/39] 2023-05-30 → 2023-06-29... ✅ 5,040 rows
[05/39] 2023-06-29 → 2023-07-29... ✅ 5,040 rows
[06/39] 2023-07-29 → 2023-08-28... ✅ 5,040 rows
[07/39] 2023-08-28 → 2023-09-27... ✅ 5,040 rows
[08/39] 2023-09-27 → 2023-10-27... ✅ 5,034 rows
[09/39] 2023-10-27 → 2023-11-26... ✅ 5,040 rows
[10/39] 2023-11-26 → 2023-12-26... ✅ 5,040 rows
[11/39] 2023-12-26 → 2023-12-31... ✅ 1,001 rows
[12/39] 2024-01-01 → 2024-01-31... ✅ 5,040 rows
[13/39] 2024-01-31 → 2024-03-01... ✅ 5,040 rows
[14/39] 2024-03-01 → 2024-03-31... ✅ 5,040 rows
[15/39] 2024-03-31 → 2024-04-30... ✅ 5,040 rows
[16/39] 2024-04-30 → 2024-05-30... ✅ 5,040 rows
[17/39] 2024-05-30 → 2024-06-29... ✅ 5,040 rows
[18/39] 2024-06-29 → 2024-07-29... ✅ 5,040 rows
[19/39] 2024-07-29 → 2024-08-28... ✅ 5,517 rows
[20/39] 2024-08-28 → 2024-09-27... ✅ 5,760 rows
[21/39] 2024-0

timestamp,energy_mwh,network_region,fueltech_group,metric,unit,interval
2023-02-28T14:00:00.000Z,7.1619,VIC1,battery_charging,energy,MWh,1h
2023-02-28T14:00:00.000Z,2.4402,VIC1,battery_discharging,energy,MWh,1h
2023-02-28T14:00:00.000Z,4172.2929,VIC1,coal,energy,MWh,1h
2023-02-28T14:00:00.000Z,0.0,VIC1,gas,energy,MWh,1h
2023-02-28T14:00:00.000Z,187.5423,VIC1,hydro,energy,MWh,1h
2023-02-28T14:00:00.000Z,0.0,VIC1,solar,energy,MWh,1h
2023-02-28T14:00:00.000Z,1131.938,VIC1,wind,energy,MWh,1h
2023-02-28T15:00:00.000Z,5.1137,VIC1,battery_charging,energy,MWh,1h
2023-02-28T15:00:00.000Z,3.0257,VIC1,battery_discharging,energy,MWh,1h
2023-02-28T15:00:00.000Z,4164.0027,VIC1,coal,energy,MWh,1h



✅ Saved 196,368 rows to bronze_generation_extended


#Aggregate Historical Generation to Match Live Structure

In [0]:

df_gen = spark.table("energy_analytics.bronze_generation_extended")

print("Columns:")
print(df_gen.columns)

print("\nFuel types:")
df_gen.select("fueltech_group").distinct().show()

print("\nSample data:")
df_gen.show(10)



from pyspark.sql.functions import *

df_agg = df_gen \
    .withColumn("datetime", to_timestamp(col("timestamp"))) \
    .withColumn("hour", date_trunc("hour", col("datetime"))) \
    .groupBy("hour", "network_region") \
    .agg(
        round(sum(when(col("fueltech_group").isin(["coal", "gas", "hydro"]), col("energy_mwh")).otherwise(0)), 2).alias("scheduled_generation"),
        round(sum(when(col("fueltech_group").isin(["wind", "solar"]), col("energy_mwh")).otherwise(0)), 2).alias("semischeduled_generation"),
        round(sum(when(col("fueltech_group") == "coal", col("energy_mwh")).otherwise(0)), 2).alias("coal"),
        round(sum(when(col("fueltech_group") == "gas", col("energy_mwh")).otherwise(0)), 2).alias("gas"),
        round(sum(when(col("fueltech_group") == "hydro", col("energy_mwh")).otherwise(0)), 2).alias("hydro"),
        round(sum(when(col("fueltech_group") == "wind", col("energy_mwh")).otherwise(0)), 2).alias("wind"),
        round(sum(when(col("fueltech_group") == "solar", col("energy_mwh")).otherwise(0)), 2).alias("solar")
    ) \
    .withColumn("total_generation", col("scheduled_generation") + col("semischeduled_generation")) \
    .withColumn("renewable_pct", 
                when(col("total_generation") > 0, 
                     round(col("semischeduled_generation") / col("total_generation") * 100, 2))
                .otherwise(0))

df_agg.write.mode("overwrite").saveAsTable("bronze_generation_hourly")

print(f"✅ {df_agg.count():,} hourly records")
df_agg.show(10)

Columns:
['timestamp', 'energy_mwh', 'network_region', 'fueltech_group', 'metric', 'unit', 'interval']

Fuel types:
+-------------------+
|     fueltech_group|
+-------------------+
|              hydro|
|            battery|
|               coal|
|                gas|
|              solar|
|   battery_charging|
|battery_discharging|
|               wind|
+-------------------+


Sample data:
+-------------------+----------+--------------+-------------------+------+----+--------+
|          timestamp|energy_mwh|network_region|     fueltech_group|metric|unit|interval|
+-------------------+----------+--------------+-------------------+------+----+--------+
|2024-09-27 20:00:00|   -4.5846|          VIC1|            battery|energy| MWh|      1h|
|2024-09-27 20:00:00|   13.0099|          VIC1|   battery_charging|energy| MWh|      1h|
|2024-09-27 20:00:00|    0.0337|          VIC1|battery_discharging|energy| MWh|      1h|
|2024-09-27 20:00:00|  2464.666|          VIC1|               coal|ener