# Notebook Parameters & Mode Selection


In [0]:

# Execution modes understood by this notebook; the collection cells run for
# "collect" and "all".
VALID_MODES = ["collect", "historical", "all"]

# Declare the "mode" dropdown (default: "collect").
# The widget is intentionally NOT removed and re-created here: doing so in the
# same cell resets the selection to the default on every run, discarding the
# value picked in the UI, and Databricks documents that a widget should not be
# created in the same cell that removed one.
dbutils.widgets.dropdown("mode", "collect", VALID_MODES, "Execution Mode")

mode = dbutils.widgets.get("mode")

# A value supplied from outside the dropdown (e.g. a mistyped job parameter)
# would otherwise make every later cell silently skip its work.
if mode not in VALID_MODES:
    raise ValueError(f"Unsupported mode {mode!r}; expected one of {VALID_MODES}")

print(f"🚀 Running in mode: {mode}")

🚀 Running in mode: collect


# Imports and database creation

In [0]:
import requests
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, DateType
from pyspark.sql.functions import (
    current_timestamp, 
    lit, 
    col, 
    to_date, 
    from_utc_timestamp,  
    md5, 
    concat_ws,
    min, max, avg, round as spark_round  
)
# Create the energy_analytics database on first run (a no-op when it already
# exists), then make it the current database for this Spark session.
spark.sql("CREATE DATABASE IF NOT EXISTS energy_analytics")
spark.sql("USE energy_analytics")



DataFrame[]

# AEMO Live Prices & Generation - Bronze Layer

In [0]:


def _aemo_float(record, key, default=0.0):
    """Read ``record[key]`` as a float, using ``default`` when the key is absent or null."""
    value = record.get(key)
    return float(default if value is None else value)


def _append_aemo_bronze(rows, hash_value_column, table_name):
    """Append ``rows`` to a Delta bronze table partitioned by settlement_date and region.

    Adds the derived ``settlement_datetime_aest`` and ``record_hash`` columns
    before writing. ``record_hash`` is the MD5 of region, ingestion_ts and the
    column named by ``hash_value_column``.

    Returns:
        Number of rows written. An empty ``rows`` list is skipped (returns 0)
        because Spark cannot infer a schema from an empty dataset.
    """
    if not rows:
        print(f"⚠️  No rows for {table_name}; nothing written")
        return 0

    df = (
        spark.createDataFrame(rows)
        # NOTE(review): from_utc_timestamp treats ingestion_ts as UTC, which only
        # holds when the driver clock and Spark session time zone are UTC — confirm.
        .withColumn("settlement_datetime_aest", from_utc_timestamp(col("ingestion_ts"), "Australia/Melbourne"))
        .withColumn("record_hash", md5(concat_ws("|", col("region"), col("ingestion_ts"), col(hash_value_column))))
    )
    (
        df.write.format("delta")
        .mode("append")
        .partitionBy("settlement_date", "region")
        .saveAsTable(table_name)
    )
    return len(rows)


def collect_aemo_data():
    """Fetch the current AEMO NEM summary and append it to the bronze Delta tables.

    Performs one collection per call; any recurring schedule has to be provided
    by whatever invokes the notebook. Relies on the names imported in the
    notebook's import cell (requests, datetime, col, from_utc_timestamp, md5,
    concat_ws) and on the ``spark`` session.

    Writes:
        energy_analytics.bronze_aemo_live_prices  (from ELEC_NEM_SUMMARY_PRICES)
        energy_analytics.bronze_aemo_generation   (from ELEC_NEM_SUMMARY)

    Raises:
        requests.HTTPError: the API answered with a non-2xx status.
        KeyError: the payload lacks one of the two expected sections.
    """
    ingestion_time = datetime.now()
    print(f"⚡ AEMO collection at {ingestion_time.strftime('%H:%M:%S')}")

    response = requests.get("https://visualisations.aemo.com.au/aemo/apps/api/report/ELEC_NEM_SUMMARY", timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()

    # Lineage columns shared by every row of this run.
    # NOTE(review): settlement_date is the ingestion date, not a settlement
    # date reported by AEMO — confirm this is the intended partition key.
    lineage = {'ingestion_ts': ingestion_time, 'source': 'AEMO_LIVE_API', 'settlement_date': ingestion_time.date()}

    # Both sections are parsed before anything is written, so a malformed
    # payload cannot leave prices appended without the matching generation rows.
    prices = [{'region': r.get('REGIONID'),
               'price': _aemo_float(r, 'RRP'),
               'raise_reg': _aemo_float(r, 'RAISEREGRRP'),
               'lower_reg': _aemo_float(r, 'LOWERREGRRP'),
               **lineage}
              for r in data['ELEC_NEM_SUMMARY_PRICES']]

    generation = [{'region': r.get('REGIONID'),
                   'total_demand': _aemo_float(r, 'TOTALDEMAND'),
                   'scheduled_generation': _aemo_float(r, 'SCHEDULEDGENERATION'),
                   'semischeduled_generation': _aemo_float(r, 'SEMISCHEDULEDGENERATION'),
                   'net_interchange': _aemo_float(r, 'NETINTERCHANGE'),
                   **lineage}
                  for r in data['ELEC_NEM_SUMMARY']]

    saved_prices = _append_aemo_bronze(prices, "price", "energy_analytics.bronze_aemo_live_prices")
    saved_generation = _append_aemo_bronze(generation, "total_demand", "energy_analytics.bronze_aemo_generation")

    print(f"✅ Saved {saved_prices} prices, {saved_generation} generation records")


# AEMO collection only runs in the modes that include live collection.
if mode not in ("collect", "all"):
    print("⏭️  Skipping AEMO collection (mode={})".format(mode))
else:
    collect_aemo_data()

⚡ AEMO collection at 03:27:45
✅ Saved 5 prices, 5 generation records


# BOM Weather Data - Bronze Layer

In [0]:

def collect_weather_data(stations=None):
    """Fetch the first listed BOM observation per station and append it to bronze.

    Performs one collection per call; any recurring schedule has to be provided
    by whatever invokes the notebook. A station whose request or payload fails
    is reported and skipped so the remaining stations are still collected.
    Relies on the names imported in the notebook's import cell (requests,
    datetime, col, from_utc_timestamp, md5, concat_ws) and on the ``spark``
    session.

    Args:
        stations: optional iterable of ``(station_id, product_id, station_name)``
            tuples. Defaults to the two stations this notebook has always polled.

    Writes:
        energy_analytics.bronze_bom_weather, partitioned by settlement_date and
        station_id. Nothing is written when no station returned data.
    """
    # to_timestamp is not among the functions imported in the notebook's import cell.
    from pyspark.sql.functions import to_timestamp

    if stations is None:
        # NOTE(review): verify each WMO id against BOM's station list — the
        # labels here are taken as configured and are not checked against the feed.
        stations = [("95866", "IDV60801", "Melbourne Airport"),
                    ("94866", "IDV60801", "Laverton")]

    ingestion_time = datetime.now()
    print(f"🌤️  Weather collection at {ingestion_time.strftime('%H:%M:%S')}")

    weather_data = []
    for station_id, product_id, station_name in stations:
        try:
            url = f"http://reg.bom.gov.au/fwo/{product_id}/{product_id}.{station_id}.json"
            response = requests.get(url, timeout=30)
            # Surface HTTP errors as such rather than as a JSON decoding failure.
            response.raise_for_status()
            # The first entry of observations.data is used as the current reading.
            latest = response.json()['observations']['data'][0]

            weather_data.append({
                'station_id': station_id, 'station_name': station_name,
                'air_temp': latest.get('air_temp'), 'apparent_temp': latest.get('apparent_t'),
                'relative_humidity': latest.get('rel_hum'), 'wind_speed_kmh': latest.get('wind_spd_kmh'),
                'observation_time': latest.get('aifstime_utc'),
                'ingestion_ts': ingestion_time, 'source': 'BOM_API', 'settlement_date': ingestion_time.date()
            })
        except Exception as e:
            # Best effort per station: report and continue with the next one.
            print(f"⚠️  {station_name}: {e}")

    if not weather_data:
        print("⚠️  No weather observations collected; nothing written")
        return

    # observation_datetime_aest must reflect when the reading was taken, so it
    # is derived from the feed's UTC observation time rather than from the
    # ingestion timestamp.
    # NOTE(review): assumes aifstime_utc is formatted as yyyyMMddHHmmss — confirm
    # against a live payload.
    observed_utc = to_timestamp(col("observation_time"), "yyyyMMddHHmmss")

    df = (
        spark.createDataFrame(weather_data)
        .withColumn("observation_datetime_aest", from_utc_timestamp(observed_utc, "Australia/Melbourne"))
        .withColumn("record_hash", md5(concat_ws("|", col("station_id"), col("observation_time"))))
    )

    (
        df.write.format("delta")
        .mode("append")
        .partitionBy("settlement_date", "station_id")
        .saveAsTable("energy_analytics.bronze_bom_weather")
    )

    print(f"✅ Saved {len(weather_data)} weather records")


# Weather collection only runs in the modes that include live collection.
if mode not in ("collect", "all"):
    print("⏭️  Skipping weather collection (mode={})".format(mode))
else:
    collect_weather_data()

🌤️  Weather collection at 03:27:55
✅ Saved 2 weather records
