# <b> <span style="color:white">Electricity Sector Data Streaming & Analysis</span></b>


# <b> <span style="color:white">GROUP 04</span></b>


| Name                   | SID       | Unikey   |
| ---------------------- | --------- | -------- |
| Putu Eka Udiyani Putri | 550067302 | pput0940 |
| Rengga Firmandika      | 550126632 | rfir0117 |
| Vincentius Ansel Suppa | 550206406 | vsup0468 |


## <b> <span style="color:orange">0. Configuration and Import Required Libraries</span></b>


**Quick start:**
1. Project structure:
   
   <pre>
   Assignment2_Tut07_G04/
   ├── Assignment_2.ipynb      # main notebook
   └── requirements.txt        # list of required libraries to run the notebook
   </pre>

   Ensure your working directory is writable.

2. Create venv & install exact dependencies<br/>
   `python -m venv .venv`<br/>
   Windows: `.\.venv\Scripts\activate` | macOS/Linux: `source .venv/bin/activate`<br/>
   `python -m pip install --upgrade pip`<br/>
   `pip install -r requirements.txt`

3. Copy `.env.template` to `.env` file, replace `your_api_key` with your actual API key. 

4. Run the full pipeline<br/>


Import all the required libraries first.


In [1]:
from dotenv import load_dotenv
from rapidfuzz import fuzz, process
from collections import Counter
from datetime import datetime, timedelta
from pathlib import Path

import os
import requests
import pandas as pd
import time
import json
import math
import glob
import paho.mqtt.client as mqtt
import sys
import duckdb
import importlib 
import re
import numpy as np


In [2]:
# Minimal DuckDB connection helpers
_DUCK = None

def _healthy(conn) -> bool:
    try:
        conn.execute("SELECT 1")
        return True
    except Exception:
        return False

def get_duck(db_path: str = "energy_dw.duckdb", read_only: bool = True):
    """Return a reusable DuckDB connection. Creates it if missing or broken."""
    global _DUCK
    if _DUCK is not None and _healthy(_DUCK):
        return _DUCK
    _DUCK = duckdb.connect(db_path, read_only=read_only)
    return _DUCK

def close_duck():
    """Close and reset the global connection."""
    global _DUCK
    if _DUCK is not None:
        try:
            _DUCK.close()
        except Exception:
            pass
        _DUCK = None

## <b> <span style="color:orange">1. Data Retrieval</span></b>


In this project, we integrate facility data from Assignment 1 with the time series data from OpenElectricity. First, we will load the Assignment 1 facility data. 

We will do this with this workflow:
1. Get all operating facilities in NEM region from OpenElectricity.
2. Get all operating facilities in NEM region from Assignment 1 (NGER data).
3. Match the facilities from both sources and get all the required information for the next steps. 
4. Get the per facility time series power and emissions data from OpenElectricity (using facility code we have acquired from previous steps).  
5. Get the per region time series market price and demand.
6. Consolidate all retrieved data in one csv file.

First, we define some helper functions for OpenElectricity API calls.

In [3]:
# basic configs
API_KEY = os.getenv("OPENELECTRICITY_API_KEY")
API_KEY = API_KEY.strip().strip('"').strip("'")  
BASE_URL = "https://api.openelectricity.org.au/v4/"
HEADERS = {
        "Authorization": f"Bearer {API_KEY}",
        "Accept": "application/json",
    }
MARKET_PATH = "market/network"


# function to fetch data from facility endpoint
def fetch_data_from_API(endpoint: str, query_params: dict): 
    try:
        response = requests.get(f"{BASE_URL}{endpoint}", headers=HEADERS, params=query_params)
        
        print(f"Response status: {response.status_code}")
        print(f"Response url: {response.url}")
        
        if response.status_code == 200:
            return response.json()
        else:
            print(f"API Error {response.status_code}: {response.text}")
            print(f"Response headers: {dict(response.headers)}")

            try:
                error_json = response.json()
                print(f"Error details: {error_json}")
            except:
                print("Could not parse error response as JSON")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# function to fetch data from market endpoint
def fetch_market_data(network="NEM",
                      start_utc=None,
                      end_utc=None,
                      metrics=("price", "demand_energy"),
                      interval="5m",
                      primary_grouping="network_region"):
    """
    Flatten skema OpenElectricity (observed):
    data[list] -> item -> results[list] -> {
        name: 'price_NSW1' / 'demand_energy_NSW1',
        columns: {'region': 'NSW1'},
        data: [[timestamp, value], ...]
    }
    Output: timestamp, network_region, metric, value
    """
    endpoint = f"{MARKET_PATH}/{network}"

    params = []
    for m in metrics:
        params.append(("metrics", m))
    params += [
        ("interval", interval),
        ("date_start", start_utc),
        ("date_end", end_utc),
        ("primary_grouping", primary_grouping),
        ("with_clerk", "false"),
    ]
    params = [(k, v) for (k, v) in params if v is not None]

    payload = fetch_data_from_API(endpoint=endpoint, query_params=params)
    if payload is None:
        return pd.DataFrame()

    items = payload.get("data", [])
    rows = []

    for item in items:
        metric_name = item.get("metric")
        for res in item.get("results", []):
            region = None
            cols_meta = res.get("columns", {})
            if isinstance(cols_meta, dict):
                # contoh: {'region': 'NSW1'}
                region = cols_meta.get("region", region)

            for row in res.get("data", []):
                if not row:
                    continue
                ts = row[0]
                val = row[1] if len(row) > 1 else None
                rows.append({
                    "timestamp": ts,
                    "network_region": region,
                    "metric": metric_name,
                    "value": val
                })

    df = pd.DataFrame(rows)

    # Normalisation & sort
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)

    df = df.sort_values([c for c in ["timestamp", "network_region"] if c in df.columns])
    return df.reset_index(drop=True)

# helper function to save dataset
def save_dataset(df: pd.DataFrame, out_csv_path: str):
	out_path = Path(out_csv_path)
	out_path.parent.mkdir(parents=True, exist_ok=True)
	df.to_csv(out_path, index=False)
	print(f"Saved: {out_path}")

### <b> <span style="color:pink">1.1 Get All Facilities in NEM Region from OpenElectricity API Facility Endpoint</span></b>


To get all operating facilities in NEM region, we will use the facilities endpoint from Open Electricity. This is an important step because from this endpoint, we will get the facility code from all facilities, which will be used to get the time series data from the facility data endpoint.

In [4]:
# set endpoint and params
ENDPOINT = "facilities/"
PARAMS = {
    'network_id': 'NEM',
    'status_id': 'operating'
}

# fetch facilities data
facilities = fetch_data_from_API(endpoint=ENDPOINT, query_params=PARAMS)
facilities_df = pd.json_normalize(facilities['data'])
facilities_df.head()

Response status: 200
Response url: https://api.openelectricity.org.au/v4/facilities/?network_id=NEM&status_id=operating


Unnamed: 0,code,name,network_id,network_region,description,units,updated_at,created_at,location.lat,location.lng
0,ADP,Adelaide Desalination,NEM,SA1,"<p>The Adelaide Desalination plant (ADP), form...","[{'code': 'ADPPV1', 'fueltech_id': 'solar_util...",2025-08-05T06:08:12Z,2023-10-18T04:34:30Z,-35.096948,138.484061
1,ALDGASF,Aldoga,NEM,QLD1,<p>The Aldoga Solar Farm will be approximately...,"[{'code': 'ALDGASF1', 'fueltech_id': 'solar_ut...",2025-03-25T00:52:44Z,2025-01-31T04:19:33Z,-23.839544,151.0849
2,ANGASTON,Angaston,NEM,SA1,<p>Angaston Power Station is a diesel-powered ...,"[{'code': 'ANGAST1', 'fueltech_id': 'distillat...",2025-09-07T01:53:13Z,2023-10-18T04:34:32Z,-34.503948,139.024296
3,APPIN,Appin,NEM,NSW1,"<p>In a world first, EDL developed the largest...","[{'code': 'APPIN', 'fueltech_id': 'gas_wcmg', ...",2025-09-07T01:53:15Z,2023-10-18T04:34:32Z,-34.210868,150.792711
4,ARWF,Ararat,NEM,VIC1,<p>Ararat Wind Farm is wind farm in western Vi...,"[{'code': 'ARWF1', 'fueltech_id': 'wind', 'sta...",2025-07-08T03:42:06Z,2023-10-18T04:34:32Z,-37.263393,143.082116


Some facilities have more than one units, so we need to make separate tables for easier analysis.

In [5]:
# function to separate the facilities and units rows
def build_tables(facilities: list[dict]):
    facility_rows: list[dict] = []
    unit_rows: list[dict] = []

    for f in facilities:
        f_code = f.get("code")
        facility_rows.append({
            "facility_code": f_code,
            "facility_name": f.get("name"),
            "network_id": f.get("network_id"),
            "network_region": f.get("network_region"),
            "lat": (f.get("location") or {}).get("lat"),
            "lng": (f.get("location") or {}).get("lng"),
            "created_at": f.get("created_at"),
            "updated_at": f.get("updated_at"),
        })

        for u in (f.get("units") or []):
                # unify field names we care about
                unit_rows.append({
                    "unit_code": u.get("code"),
                    "facility_code": f_code,
                    "fueltech_id": u.get("fueltech_id"),
                    "status_id": u.get("status_id"),
                    "dispatch_type": u.get("dispatch_type"),
                    "capacity_registered": u.get("capacity_registered"),
                    "capacity_maximum": u.get("capacity_maximum"),
                    "capacity_storage": u.get("capacity_storage"),
                    "data_first_seen": u.get("data_first_seen"),
                    "data_last_seen": u.get("data_last_seen"),
                    "unit_created_at": u.get("created_at"),
                    "unit_updated_at": u.get("updated_at"),
                })
    
    facilities_df = pd.DataFrame(facility_rows).drop_duplicates(subset=["facility_code"]).reset_index(drop=True)
    units_lookup_df = pd.DataFrame(unit_rows).drop_duplicates(subset=["unit_code"]).reset_index(drop=True)

    return facilities_df, units_lookup_df

In [6]:
facilities_df, units_facilities_df = build_tables(facilities['data'])

# save to csv
save_dataset(facilities_df, "DATA/EXTRACTED/electricity_facilities.csv")
save_dataset(units_facilities_df, "DATA/EXTRACTED/electricity_units_facilities.csv")

Saved: DATA\EXTRACTED\electricity_facilities.csv
Saved: DATA\EXTRACTED\electricity_units_facilities.csv


### <b> <span style="color:pink">1.2 Get All Facilities in NEM Region from NGER</span></b>


After getting all the operating facilities data in NEM region, we will start matching these data with the assignment 1 facility data we have just extracted earlier.

In [7]:
# cleanup just for once
try:
    con.close()
except Exception:
    pass

DB_PATH = "energy_dw.duckdb"
con = duckdb.connect(DB_PATH, read_only=True)

In [8]:
# Pull both sources (from DB and from API)
query = """
            SELECT DISTINCT
                f.facility_id as facility_code,
                dfa.facility_name,
                dfu.fuel_type,
                dfu.fuel_category,
                dfu.is_renewable,
                dge.latitude,
                dge.longitude,
                dge.state_code AS region,
                dge.geo_resolution
            FROM fact_nger_facility f
            JOIN dim_facility dfa ON dfa.facility_id=f.facility_id
            JOIN dim_fuel dfu ON dfu.fuel_key=f.fuel_key
            JOIN dim_geo dge ON dge.geo_id=f.geo_id
            WHERE f.grid='NEM'
            GROUP BY 1,2,3,4,5,6,7,8,9
        """

facility_master = con.execute(query).df()
print("Facility master from DB:", facility_master.shape)
facility_master.head()


Facility master from DB: (446, 9)


Unnamed: 0,facility_code,facility_name,fuel_type,fuel_category,is_renewable,latitude,longitude,region,geo_resolution
0,25,McKay Creek Hydro,Hydro,HYDRO,True,-36.59861,144.678005,VIC,state
1,570,KARADOC SOLAR FARM,Solar,SOLAR,True,-36.59861,144.678005,VIC,state
2,676,Beryl Solar Farm,Solar,SOLAR,True,-31.875984,147.286949,NSW,state
3,521,Gullen Solar Pty Ltd,Solar,SOLAR,True,-31.875984,147.286949,NSW,state
4,643,Darlington Point Solar Farm Pty Ltd,Solar,SOLAR,True,-31.875984,147.286949,NSW,state


### <b> <span style="color:pink">1.3 Match Facilities from NGER (DB) and OpenElectrivity (API)</span></b>


The following cell sets up the matching framework that standardises and prepares both datasets (database and API) for subsequent matching attempts.  
It defines helper functions to clean names, align region codes, track progress, and prevent duplicate matches across attempts.  
This framework is executed once at the start and serves as the foundation for all following matching strategy cells.

In [9]:
# Matching Framework (API-based progress)

# Helpers
REGION_MAP = {"NSW":"NSW1","QLD":"QLD1","VIC":"VIC1","SA":"SA1","TAS":"TAS1"}
ALLOWED_NEM = {"NSW1","QLD1","VIC1","SA1","TAS1"}

def normalise_name(s: str) -> str:
    if not isinstance(s, str): 
        return ""
    s = s.lower()
    s = re.sub(r"\b(pty\s*ltd|limited|power\s*station|pp|plant|facility|unit|station|co|company|sf|wf)\b", " ", s)
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s

PREVIEW_COLS = [
    "facility_code_api",     # API_facility_code
    "facility_name_api",     # API_facility_name
    "API_name",              # API_name (raw)
    "region_api",            # API_network_region
    "facility_name",         # DB_facility_name
    "fuel_type",             # DB_fuel_type
    "fuel_category",         # DB_fuel_category
    "is_renewable",          # DB_is_renewable
]

# Build API working frame
api_raw = facilities_df.copy()

api_work = api_raw.rename(columns={
    "facility_code": "facility_code_api",
    "facility_name": "facility_name_api",
    "network_region": "region_api",
    "lat": "lat_api",
    "lng": "lng_api",
}).copy()

# keep a raw name column for display (API_name)
if "facility_name_api" in api_work.columns:
    api_work["API_name"] = api_work["facility_name_api"]
elif "name" in api_raw.columns:
    api_work["API_name"] = api_raw["name"]
else:
    api_work["API_name"] = pd.NA

api_keep = ["facility_code_api","facility_name_api","API_name","region_api","lat_api","lng_api"]
api_work = api_work[[c for c in api_keep if c in api_work.columns]].drop_duplicates()

# normalised fields
api_work["facility_code_api"] = api_work["facility_code_api"].astype(str).str.upper().str.strip()
api_work["name_norm_api"]     = api_work["facility_name_api"].map(normalise_name) if "facility_name_api" in api_work.columns else ""
api_work["region_api"]        = api_work["region_api"].astype(str).str.upper().str.strip()

# Build DB working frame (filtered to NEM only)
db_work = facility_master.copy()

# standardise for joins
db_work["facility_code_str"] = db_work["facility_code"].astype(str).str.upper().str.strip()
db_work["region_nem"]        = (
    db_work["region"].astype(str).str.upper().str.strip().map(REGION_MAP).fillna(db_work["region"])
)
db_work["name_norm_db"]      = db_work["facility_name"].map(normalise_name)

# filter DB to NEM universe
db_work = db_work.loc[db_work["region_nem"].isin(ALLOWED_NEM)].copy()

# tracking columns
for col in ["matched_api_code","matched_strategy"]:
    if col not in db_work.columns:
        db_work[col] = pd.NA

# ---------- Global registry to prevent reuse of API facility codes ----------
# Build from existing matches so rerunning framework keeps state
_existing_used = set(
    db_work.loc[db_work["matched_api_code"].notna(), "matched_api_code"]
          .astype(str)
          .tolist()
)

if "used_api_codes" in globals() and isinstance(used_api_codes, set):
    used_api_codes |= _existing_used
else:
    used_api_codes = set(_existing_used)

# ---------- Utilities ----------
def _progress(label: str):
    """Print API-based progress only."""
    total_api = len(api_work)
    used_api  = len(used_api_codes)
    remaining_api = total_api - used_api
    pct_api = 0.0 if total_api == 0 else used_api / total_api * 100.0
    print(f"[{label}] API: total={total_api}, matched={used_api}, remaining={remaining_api}, progress={pct_api:.2f}%")

def _preview(label: str, joined, n=10):
    for c in PREVIEW_COLS:
        if c not in joined.columns:
            joined[c] = pd.NA
    out = joined[PREVIEW_COLS].copy()
    print(f"\n[{label}] Preview of newly matched rows ({len(out)} rows; showing up to {n}):")
    display(out.head(n))
    return out

def _apply_matches(label: str, candidates, api_code_col="facility_code_api"):
    """
    Apply dedup-safe matches back to db_work and display one compact preview table.
    - processes only DB rows still unmatched,
    - drops NaN API codes,
    - refuses reusing API codes already applied,
    - reports API-only progress and duplicate diagnostics,
    - PREVIEW ONLY rows that were actually applied.
    """
    global db_work, api_work, used_api_codes

    # only DB rows not yet matched
    candidates = candidates.loc[db_work["matched_api_code"].isna()].copy()

    # drop NaN API codes early
    before_nan = len(candidates)
    candidates = candidates[candidates[api_code_col].notna()].copy()
    dropped_nan = before_nan - len(candidates)

    # avoid reusing API codes
    before_used = len(candidates)
    candidates = candidates[~candidates[api_code_col].isin(used_api_codes)].copy()
    dropped_used = before_used - len(candidates)

    if candidates.empty:
        print(f"[{label}] No new matches. (dropped_nan={dropped_nan}, dropped_used={dropped_used})")
        _progress(label)
        return pd.DataFrame(columns=PREVIEW_COLS)

    # apply matches and remember which DB indices actually got applied
    applied_idx = []
    for idx, row in candidates.iterrows():
        api_code = row[api_code_col]
        if pd.isna(api_code) or api_code in used_api_codes:
            continue
        db_work.at[idx, "matched_api_code"] = api_code
        db_work.at[idx, "matched_strategy"] = label
        used_api_codes.add(api_code)
        applied_idx.append(idx)

    if not applied_idx:
        print(f"[{label}] All candidates were skipped after guards. (dropped_nan={dropped_nan}, dropped_used={dropped_used})")
        _progress(label)
        return pd.DataFrame(columns=PREVIEW_COLS)

    # build preview ONLY for applied indices
    applied_slice = db_work.loc[applied_idx].merge(
        api_work, left_on="matched_api_code", right_on="facility_code_api", how="left"
    )

    _progress(label)
    dup_ids = db_work.loc[db_work["matched_api_code"].notna(), "matched_api_code"].duplicated(keep=False).sum()
    print(f"[{label}] duplicate API codes after guard: {dup_ids} | applied={len(applied_idx)} "
          f"| dropped_nan={dropped_nan} | dropped_used={dropped_used}")

    return _preview(label, applied_slice)

print("Matching framework ready (API-based progress).")


Matching framework ready (API-based progress).


In [10]:
# Attempt 1 — Exact match by facility_code

assert all(k in globals() for k in ["db_work", "api_work", "_apply_matches"]), \
    "Run the Matching Framework cell first."

label = "Attempt 1: exact facility_code"

# Consider only DB rows not matched yet
unmatched = db_work.loc[db_work["matched_api_code"].isna(), ["facility_code_str"]].copy()

# Build candidates by exact code equality (DB facility_code_str == API facility_code_api)
candidates = unmatched.merge(
    api_work[["facility_code_api"]],
    left_on="facility_code_str",
    right_on="facility_code_api",
    how="left"
).set_index(unmatched.index)[["facility_code_api"]]

# Apply matches and show a single compact preview table
_ = _apply_matches(label, candidates)


[Attempt 1: exact facility_code] No new matches. (dropped_nan=445, dropped_used=0)
[Attempt 1: exact facility_code] API: total=419, matched=0, remaining=419, progress=0.00%


In [11]:
# Attempt 2 — Exact name_norm + same region

assert all(k in globals() for k in ["db_work", "api_work", "_apply_matches"]), \
    "Run the Matching Framework cell first."

label = "Attempt 2: exact name_norm + region"

# Work only on DB rows not matched yet
left = db_work.loc[db_work["matched_api_code"].isna(), ["name_norm_db", "region_nem"]].copy()
left["key"] = left["name_norm_db"].fillna("") + "|" + left["region_nem"].fillna("")
left["db_idx"] = left.index  # keep original DB index

# Prepare API side with the same composite key and make keys unique
right = api_work.loc[:, ["name_norm_api", "region_api", "facility_code_api"]].copy()
right["key"] = right["name_norm_api"].fillna("") + "|" + right["region_api"].fillna("")
right = right.drop_duplicates(subset=["key"], keep="first")  # enforce 1:1 mapping per key

# Left-join using the composite key; keep original DB index
joined = left.merge(right[["key", "facility_code_api"]], on="key", how="left")
joined = joined.set_index("db_idx")

# Build candidates and apply matches
candidates = joined[["facility_code_api"]]

_ = _apply_matches(label, candidates)

[Attempt 2: exact name_norm + region] API: total=419, matched=22, remaining=397, progress=5.25%
[Attempt 2: exact name_norm + region] duplicate API codes after guard: 0 | applied=22 | dropped_nan=422 | dropped_used=0

[Attempt 2: exact name_norm + region] Preview of newly matched rows (22 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,HORNSDPR,Hornsdale Power Reserve,Hornsdale Power Reserve,SA1,HORNSDALE POWER RESERVE PTY LTD,Battery,STORAGE,False
1,LRSF,Longreach,Longreach,QLD1,Longreach SF,Solar,SOLAR,True
2,KENNEDY,Kennedy Energy Park,Kennedy Energy Park,QLD1,Kennedy Energy Park Pty Ltd,Wind,WIND,True
3,VBB,Victorian Big Battery,Victorian Big Battery,VIC1,VICTORIAN BIG BATTERY PTY LTD,Battery,STORAGE,False
4,WRWF1,White Rock Wind Farm,White Rock Wind Farm,NSW1,White Rock Wind Farm Pty Ltd,Wind,WIND,True
5,BOLIVAR,Bolivar,Bolivar,SA1,Bolivar Power Station,Gas,GAS,False
6,MIDLDPS,Midlands,Midlands,TAS1,Midlands Power Station,Hydro,HYDRO,True
7,BARRON,Barron Gorge,Barron Gorge,QLD1,Barron Gorge Power Station,Hydro,HYDRO,True
8,WSTWYSF,West Wyalong,West Wyalong,NSW1,West Wyalong SF,Solar,SOLAR,True
9,KAREEYA,Kareeya,Kareeya,QLD1,Kareeya Power Station,Hydro,HYDRO,True


In [12]:
# Attempt 3 — Fuzzy name within same region

assert all(k in globals() for k in ["db_work", "api_work", "_apply_matches"]), \
    "Run the Matching Framework cell first."


label = "Attempt 3: fuzzy name within region"
threshold = 90

# Work only on DB rows not matched yet
unmatched = db_work.loc[db_work["matched_api_code"].isna(), ["name_norm_db", "region_nem"]].copy()

rows = []
for region, chunk in unmatched.groupby("region_nem"):
    # API candidates within same region, with non-empty normalised names
    candidates_api = api_work.query("region_api == @region and name_norm_api != ''").copy()
    if candidates_api.empty:
        continue

    # Optionally exclude API codes already used to reduce collisions
    if "used_api_codes" in globals() and len(used_api_codes) > 0:
        candidates_api = candidates_api[~candidates_api["facility_code_api"].isin(used_api_codes)]

    if candidates_api.empty:
        continue

    names_api = candidates_api["name_norm_api"].tolist()
    idx_api   = candidates_api.index.tolist()

    for i, row in chunk.iterrows():
        q = row["name_norm_db"]
        if not q:
            continue
        best = process.extractOne(q, names_api, scorer=fuzz.token_set_ratio)
        if best and best[1] >= threshold:
            api_row = candidates_api.loc[idx_api[best[2]]]
            rows.append((i, api_row["facility_code_api"]))

# Build candidates DataFrame and apply matches
if rows:
    candidates = pd.DataFrame(rows, columns=["idx","facility_code_api"]).set_index("idx")
    _ = _apply_matches(label, candidates)
else:
    print(f"[{label}] No matches found.")


[Attempt 3: fuzzy name within region] API: total=419, matched=24, remaining=395, progress=5.73%
[Attempt 3: fuzzy name within region] duplicate API codes after guard: 0 | applied=2 | dropped_nan=0 | dropped_used=0

[Attempt 3: fuzzy name within region] Preview of newly matched rows (2 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,OAKY2,Oaky Creek 2,Oaky Creek 2,QLD1,Oaky Creek I Power Station,Waste Coal Mine Gas,GAS,False
1,OAKEY1SF,Oakey Solar Farm,Oakey Solar Farm,QLD1,Oakey 1 Solar Farm,Solar,SOLAR,True


In [13]:
# Attempt 4: Subset-name (containment) match on name_norm, optional region gate

LABEL = "A4_subset_norm_unique"

# Unmatched DB
db_unm = db_work.loc[db_work["matched_api_code"].isna(), ["name_norm_db"]].copy()

# API keys
api_cols = ["facility_code_api", "name_norm_api"]
use_region = ("region_api" in api_work.columns) and ("region_nem" in db_work.columns)
if use_region:
    db_unm["region_nem"] = db_work.loc[db_unm.index, "region_nem"]
    api_cols.append("region_api")

api_k = api_work[api_cols].drop_duplicates().copy()

def _candidates_for(db_row):
    """Return candidate API rows (as DataFrame) for one DB row by subset-name rule."""
    s_db = db_row["name_norm_db"]
    if not isinstance(s_db, str) or not s_db:
        return api_k.iloc[0:0]

    # Region gate if available
    pool = api_k
    if use_region:
        pool = pool[pool["region_api"] == db_row["region_nem"]]

    # Containment both directions, require min length >= 6 to avoid trivial hits
    mask = (
        (pool["name_norm_api"].str.contains(s_db, na=False)) |
        (pool["name_norm_api"].apply(lambda s: isinstance(s, str) and s_db.find(s) != -1))
    ) & (pool["name_norm_api"].str.len().fillna(0).clip(upper=10**9).where(lambda L: L >= 6, False) |
         (pd.Series([len(s_db) >= 6]*len(pool), index=pool.index)))

    return pool[mask][["facility_code_api"]]

# Build matches: only accept unique
rows = []
for idx, r in db_unm.iterrows():
    cand = _candidates_for(r)
    if len(cand) == 1:
        rows.append((idx, cand["facility_code_api"].iloc[0]))

if rows:
    candidates = pd.DataFrame(rows, columns=["_idx", "facility_code_api"]).set_index("_idx")
    # ensure API code unique (one DB per API)
    vc = candidates["facility_code_api"].value_counts()
    candidates = candidates[candidates["facility_code_api"].isin(vc[vc == 1].index)]
    _apply_matches(LABEL, candidates[["facility_code_api"]])
else:
    _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))


[A4_subset_norm_unique] API: total=419, matched=148, remaining=271, progress=35.32%
[A4_subset_norm_unique] duplicate API codes after guard: 0 | applied=124 | dropped_nan=0 | dropped_used=4

[A4_subset_norm_unique] Preview of newly matched rows (124 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,KARSF,Karadoc,Karadoc,VIC1,KARADOC SOLAR FARM,Solar,SOLAR,True
1,BERYLSF,Beryl,Beryl,NSW1,Beryl Solar Farm,Solar,SOLAR,True
2,RUGBYR,Rugby Run,Rugby Run,QLD1,Adani Rugby Run Solar Farm,Solar,SOLAR,True
3,BAPS,Banimboola,Banimboola,VIC1,Banimboola Hydro,Hydro,HYDRO,True
4,HD1WF,Hawkesdale,Hawkesdale,VIC1,Hawkesdale Windfarm,Wind,WIND,True
5,SALTCRK,Salt Creek,Salt Creek,VIC1,Salt Creek Wind Farm,Wind,WIND,True
6,SAPHWF1,Sapphire,Sapphire,NSW1,Sapphire Wind Farm,Wind,WIND,True
7,COLONGRA,Colongra,Colongra,NSW1,Colongra PS,Gas,GAS,False
8,HAYMSF,Hayman,Hayman,QLD1,The Hayman Solar Farm Trust,Solar,SOLAR,True
9,WINTSF1,Winton,Winton,VIC1,Winton Solar Farm,Solar,SOLAR,True


In [14]:
# Attempt 5: One rare-token overlap (>=1 uncommon word), optional region gate, 1:1, dedup-safe

LABEL = "A5_one_rare_token"

STOP = {
    "wind","solar","gas","bio","biomass","diesel","hydro","coal","battery",
    "farm","power","station","plant","unit","pty","ltd","limited","company",
    "energy","project","park","river","creek","hill","range","ps","pp","sf","wf","lfg"
}
RARE_MAX = 3   # a token is rare if it appears <= 3 times globally
MINLEN   = 4   # ignore very short tokens

def toks(s: str) -> set:
    if not isinstance(s, str) or not s:
        return set()
    return {t for t in re.findall(r"[a-z0-9]+", s.lower()) if len(t) >= MINLEN and t not in STOP}

# Unmatched DB slice (keep DB index)
db_unm = db_work.loc[db_work["matched_api_code"].isna(), ["facility_name"]].copy()
db_unm["tok_db"] = db_unm["facility_name"].map(toks)

# API tokens
api_k = api_work[["facility_code_api","API_name"]].drop_duplicates().copy()
api_k["tok_api"] = api_k["API_name"].map(toks)

# Global token frequencies to identify "rare" tokens
all_tokens = [t for s in db_unm["tok_db"] for t in s] + [t for s in api_k["tok_api"] for t in s]
freq = Counter(all_tokens)

def rare_anchor_token(token_set: set) -> str:
    rare = [t for t in token_set if freq.get(t, 0) <= RARE_MAX]
    if not rare:
        return ""
    rare.sort(key=lambda t: (freq[t], -len(t), t))  # rarest, then longer, then lexicographic
    return rare[0]

db_unm["anchor"] = db_unm["tok_db"].map(rare_anchor_token)
api_k = api_k.assign(anchor=api_k["tok_api"].map(rare_anchor_token))

# Optional region gate
use_region = ("region_api" in api_work.columns) and ("region_nem" in db_work.columns)
if use_region:
    db_unm["region_nem"] = db_work.loc[db_unm.index, "region_nem"]
    api_k = api_k.merge(
        api_work[["facility_code_api","region_api"]].drop_duplicates(),
        on="facility_code_api", how="left"
    )

# Drop rows without an anchor (no rare token found)
db_unm = db_unm[db_unm["anchor"] != ""].copy()
api_k  = api_k[api_k["anchor"]  != ""].copy()

# Blocked join on the rare anchor (and region if available), preserving DB index
if use_region:
    cand = db_unm.join(
        api_k.set_index(["anchor","region_api"])[["facility_code_api","tok_api"]],
        on=["anchor","region_nem"], how="left"
    )
else:
    cand = db_unm.join(
        api_k.set_index("anchor")[["facility_code_api","tok_api"]],
        on="anchor", how="left"
    )

# Keep only rows with an API candidate
cand = cand.dropna(subset=["facility_code_api"]).copy()

# Ensure both token columns are sets (avoid NaN -> float)
cand["tok_db"]  = cand["tok_db"].apply(lambda s: s if isinstance(s, set) else set())
cand["tok_api"] = cand["tok_api"].apply(lambda s: s if isinstance(s, set) else set())

if cand.empty:
    _ = _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))
else:
    # Compute overlap size and jaccard for tie-break
    overlap = []
    jaccard = []
    for s_db, s_api in zip(cand["tok_db"], cand["tok_api"]):
        inter = len(s_db & s_api)
        uni   = len(s_db | s_api) if (s_db or s_api) else 1
        overlap.append(inter)
        jaccard.append(inter/uni)
    cand["overlap"] = overlap
    cand["jaccard"] = jaccard

    # Require at least 1 shared rare/informative token
    cand = cand[cand["overlap"] >= 1].copy()

    if cand.empty:
        _ = _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))
    else:
        # Pick single best per DB row (by overlap then jaccard)
        cand["rank_key"] = list(zip(cand["overlap"], cand["jaccard"]))
        best_idx = cand.groupby(cand.index)["rank_key"].idxmax()
        best = cand.loc[best_idx].copy()

        # Enforce 1:1 on API side (keep the best globally per API code)
        best = best.sort_values(["overlap","jaccard"], ascending=[False, False])
        best = best.drop_duplicates(subset=["facility_code_api"], keep="first")

        _ = _apply_matches(LABEL, best[["facility_code_api"]])


[A5_one_rare_token] API: total=419, matched=206, remaining=213, progress=49.16%
[A5_one_rare_token] duplicate API codes after guard: 0 | applied=58 | dropped_nan=0 | dropped_used=15

[A5_one_rare_token] Preview of newly matched rows (58 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,DPNTB,Darlington Point,Darlington Point,NSW1,Darlington Point Solar Farm Pty Ltd,Solar,SOLAR,True
1,DPNTB,Darlington Point,Darlington Point,NSW1,Darlington Point Solar Farm Pty Ltd,Solar,SOLAR,True
2,LK_ECHO,Lake Echo,Lake Echo,TAS1,Hydro Power Station - Lake Echo,Hydro,HYDRO,True
3,LUCAS2S2,Lucas Heights 2,Lucas Heights 2,NSW1,Lucas Heights 2 LFG Power Station,Landfill Gas,BIO,True
4,BOCOROCK,Boco Rock,Boco Rock,NSW1,Boco Rock Wind Farm,Wind,WIND,True
5,MORTLK,Mortlake,Mortlake,VIC1,Mortlake South Wind Farm,Wind,WIND,True
6,JBUTTERS,John Butters,John Butters,TAS1,Hydro Power Station - John Butters,Hydro,HYDRO,True
7,MBAHNTH,Moranbah North,Moranbah North,QLD1,Moranbah North CMM Power Station,Waste Coal Mine Gas,GAS,False
8,CHYTWF,Cherry Tree,Cherry Tree,VIC1,Cherry Tree Wind Farm,Wind,WIND,True
9,GRANGEAV,Grange Avenue,Grange Avenue,NSW1,Grange Avenue LFG Power Station,Landfill Gas,BIO,True


In [15]:
# Attempt 6: Relaxed fuzzy names

LABEL = "A6_token_jaccard_070_blk_long"

# Rebuild the same candidates quickly (uses existing db_work/api_work)
db_unm = db_work.loc[db_work["matched_api_code"].isna(), ["facility_name","region_nem"]].copy()
api_k  = api_work[["facility_code_api","API_name","facility_name_api","region_api"]].drop_duplicates().copy()

# Token helpers
STOP = {"wind","solar","gas","bio","biomass","diesel","hydro","coal","battery","farm","power","station","plant","unit","pty","ltd","limited","company","energy","project","park","river","creek","hill","range","ps","pp","sf","wf","lfg","the","and"}
tok = lambda s: {t for t in re.findall(r"[a-z0-9]+", str(s).lower()) if len(t) >= 3 and t not in STOP}
longest = lambda ss: (max(ss, key=len) if ss else "")

db_unm["tok_db"] = db_unm["facility_name"].map(tok)
api_k["tok_api"] = api_k["API_name"].map(tok)
db_unm["anchor"] = db_unm["tok_db"].map(longest)
api_k["anchor"]  = api_k["tok_api"].map(longest)

db_unm = db_unm[db_unm["anchor"] != ""].copy()
api_k  = api_k[api_k["anchor"]  != ""].copy()

cand = db_unm.join(
    api_k.set_index(["region_api","anchor"])[["facility_code_api","API_name","facility_name_api","tok_api"]],
    on=["region_nem","anchor"], how="left"
).dropna(subset=["facility_code_api"]).copy()

if cand.empty:
    _ = _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))
else:
    # Jaccard + overlap
    jacc, ov = [], []
    for s_db, s_api in zip(cand["tok_db"], cand["tok_api"]):
        inter = len(s_db & s_api)
        union = len(s_db | s_api) if (s_db or s_api) else 1
        ov.append(inter); jacc.append(inter/union)
    cand["overlap"] = ov; cand["jaccard"] = jacc

    cand = cand[(cand["overlap"] >= 2) & (cand["jaccard"] >= 0.70)].copy()
    if cand.empty:
        _ = _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))
    else:
        # Best per DB row
        cand["rank_key"] = list(zip(cand["jaccard"], cand["overlap"]))
        best_idx = cand.groupby(cand.index)["rank_key"].idxmax()
        best = cand.loc[best_idx].copy()

        # 1) enforce unique API code
        best = best.sort_values(["jaccard","overlap"], ascending=[False, False])
        best = best.drop_duplicates(subset=["facility_code_api"], keep="first")

        # 2) enforce unique API *name* (this removes duplicate facility_name_api in preview)
        if "facility_name_api" in best.columns:
            best = best.drop_duplicates(subset=["facility_name_api"], keep="first")

        _ = _apply_matches(LABEL, best[["facility_code_api"]])


[A6_token_jaccard_070_blk_long] API: total=419, matched=209, remaining=210, progress=49.88%
[A6_token_jaccard_070_blk_long] duplicate API codes after guard: 0 | applied=3 | dropped_nan=0 | dropped_used=6

[A6_token_jaccard_070_blk_long] Preview of newly matched rows (3 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,DDPS1,Darling Downs,Darling Downs,QLD1,Darling Downs Solar Farm,Wind,WIND,True
1,MUWAWF,Murra Warra,Murra Warra,VIC1,Murra Warra II Wind Farm,Wind,WIND,True
2,SNOWSTH,Snowtown South,Snowtown South,SA1,Snowtown South Wind Farm Pty Ltd,Wind,WIND,True


In [16]:
# Attempt 7: Relaxed fuzzy within same region (threshold/partial >= 85) + containment fallback,

LABEL = "A7_relaxed_fuzzy_region_85"

# Unmatched DB + required region cols
db_unm = db_work.loc[db_work["matched_api_code"].isna(), ["name_norm_db", "region_nem"]].copy()
api_k  = api_work[["facility_code_api", "name_norm_api", "facility_name_api", "region_api"]].drop_duplicates().copy()

if db_unm.empty or api_k.empty or ("region_nem" not in db_unm.columns) or ("region_api" not in api_k.columns):
    _ = _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))
else:
    # light blocking by 4-char prefix to cut pairings
    db_unm["blk4"] = db_unm["name_norm_db"].astype(str).str[:4]
    api_k["blk4"]  = api_k["name_norm_api"].astype(str).str[:4]

    cand = db_unm.join(
        api_k.set_index(["region_api","blk4"])[["facility_code_api","name_norm_api","facility_name_api"]],
        on=["region_nem","blk4"], how="left"
    ).dropna(subset=["facility_code_api"]).copy()

    if cand.empty:
        _ = _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))
    else:
        # elementwise arrays (ensure pure strings)
        dn = cand["name_norm_db"].astype(str).tolist()
        an = cand["name_norm_api"].astype(str).tolist()

        # relaxed fuzzy: max(token_set_ratio, partial_ratio)
        base = np.array([max(fuzz.token_set_ratio(d, a), fuzz.partial_ratio(d, a)) for d, a in zip(dn, an)], dtype=float)

        # containment fallback computed elementwise (no np.char)
        contain = np.array([(a.find(d) != -1) or (d.find(a) != -1) for d, a in zip(dn, an)], dtype=bool)
        min_len = np.array([min(len(d), len(a)) for d, a in zip(dn, an)], dtype=int)

        score = np.where((base < 85) & contain & (min_len >= 5), 85.0, base)
        cand["score"] = score

        # pick single best per DB row
        best = cand.loc[cand.groupby(cand.index)["score"].idxmax()].copy()
        best = best[best["score"] >= 85].copy()

        if best.empty:
            _ = _apply_matches(LABEL, pd.DataFrame(columns=["facility_code_api"]))
        else:
            # enforce uniqueness on API code then API name
            best = best.sort_values(["score"], ascending=False).drop_duplicates(subset=["facility_code_api"], keep="first")
            if "facility_name_api" in best.columns:
                best = best.drop_duplicates(subset=["facility_name_api"], keep="first")

            _ = _apply_matches(LABEL, best[["facility_code_api"]])


[A7_relaxed_fuzzy_region_85] API: total=419, matched=240, remaining=179, progress=57.28%
[A7_relaxed_fuzzy_region_85] duplicate API codes after guard: 0 | applied=31 | dropped_nan=0 | dropped_used=48

[A7_relaxed_fuzzy_region_85] Preview of newly matched rows (31 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,WOOLNTH1,Woolnorth,Woolnorth,TAS1,Woolnorth Bluff Point Wind Farm Pty Ltd,Wind,WIND,True
1,SHOALHAV,Shoalhaven,Shoalhaven,NSW1,Shoalhaven Landfill Generation,Landfill Gas,BIO,True
2,EILDONPD,Eildon,Eildon,VIC1,Eildon Hydro,Hydro,HYDRO,True
3,SNOWTOWN,Snowtown,Snowtown,SA1,Snowtown Wind Farm,Wind,WIND,True
4,AGLHAL,Hallett,Hallett,SA1,Hallett (Stage 2) Wind Farm,Wind,WIND,True
5,MOREESF,Moree,Moree,NSW1,Moree Solar Farm,Solar,SOLAR,True
6,MANSLR,Manildra,Manildra,NSW1,Manildra Prop Pty ltd as trustee for Manildra ...,Solar,SOLAR,True
7,SITHE,Smithfield,Smithfield,NSW1,Smithfield Power Partnership,Gas,GAS,False
8,TARONG,Tarong,Tarong,QLD1,Tarong Power Stations,Black Coal,COAL,False
9,OAKEY2SF,Oakey 2,Oakey 2,QLD1,Oakey 2 Solar Farm,Solar,SOLAR,True


In [17]:
# State sync (static results)
used_api_codes = set(
    db_work.loc[db_work["matched_api_code"].notna(), "matched_api_code"]
          .astype(str)
          .tolist()
)

In [18]:
# Attempt 8: two-token overlap + region

assert all(k in globals() for k in ["db_work", "api_work", "_apply_matches"]), \
    "Run the Matching Framework cell first."

label = "Attempt 8: two-token overlap + region)"

# deterministic behaviour across runs
np.random.seed(8)

# Lightweight tokenizer (same across attempts)
_STOP = {
    "power","station","plant","facility","unit","units","energy","generator","gen",
    "pty","ltd","company","co","project","site","phase","block","stage"
}
_TOKEN = re.compile(r"[a-z0-9]+")
def _tok(s):
    if not isinstance(s, str): 
        return set()
    toks = _TOKEN.findall(s.lower())
    return {t for t in toks if t not in _STOP and (t.isdigit() or len(t) > 1)}

# DB side: only unmatched rows
left = db_work.loc[db_work["matched_api_code"].isna(), ["facility_name","region_nem"]].copy()
left["__tok"] = left["facility_name"].map(_tok)
left["__reg"] = left["region_nem"].astype(str)
left = left[left["__tok"].map(bool)]
left = left.reset_index().rename(columns={"index":"db_idx"})
left_ex = left[["db_idx","__reg"]].join(left["__tok"].explode().rename("token"))

# API side
right = api_work.loc[:, ["API_name","region_api","facility_code_api"]].copy()
right["__tok"] = right["API_name"].map(_tok)
right["__reg"] = right["region_api"].astype(str)
right = right[right["__tok"].map(bool)]
right_ex = right[["facility_code_api","__reg"]].join(right["__tok"].explode().rename("token"))

# Candidate pairs via token + same region
cand = left_ex.merge(right_ex, on=["token","__reg"], how="inner")

# Token rarity weighting for tie-breaks
tok_freq = Counter(right_ex["token"].tolist())
cand["w"] = cand["token"].map(lambda t: 1.0 / (1 + tok_freq.get(t, 0)))

# Aggregate per (DB row, API facility)
agg = (
    cand.groupby(["db_idx","facility_code_api"], as_index=False)
         .agg(overlap=("token","nunique"), w_overlap=("w","sum"))
)

# Keep overlap ≥ 2
agg = agg[agg["overlap"] >= 2].copy()

# Add deterministic tie-breakers
api_len = (
    right.drop_duplicates(subset=["facility_code_api"])
         .set_index("facility_code_api")["__tok"]
         .map(len)
         .to_dict()
)
db_len = left.set_index("db_idx")["__tok"].map(len).to_dict()

agg["db_len"]  = agg["db_idx"].map(db_len)
agg["api_len"] = agg["facility_code_api"].map(api_len).fillna(0).astype(int)
agg["len_gap"] = (agg["db_len"] - agg["api_len"]).abs()

# Deterministic sort: prefer higher overlap, then w_overlap, smaller len_gap, alphabetical API code
agg = agg.sort_values(
    ["overlap", "w_overlap", "len_gap", "facility_code_api"],
    ascending=[False, False, True, True],
    kind="mergesort"
)

# Drop duplicates deterministically
best = agg.drop_duplicates(subset=["db_idx"], keep="first").set_index("db_idx")

# Apply matches (framework ensures dedup + progress)
candidates = best[["facility_code_api"]]
_ = _apply_matches(label, candidates)


[Attempt 8: two-token overlap + region)] API: total=419, matched=232, remaining=187, progress=55.37%
[Attempt 8: two-token overlap + region)] duplicate API codes after guard: 0 | applied=2 | dropped_nan=0 | dropped_used=26

[Attempt 8: two-token overlap + region)] Preview of newly matched rows (2 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,RPCG,Rocky Point Cogen,Rocky Point Cogen,QLD1,Rocky Point Sugar Mill Cogeneration,Bagasse,BIO,True
1,BHB,Broken Hill Battery,Broken Hill Battery,NSW1,Essential Energy Generation Facility - Broken ...,Diesel,LIQUID,False


In [19]:
# Attempt 9: first-word match + region

np.random.seed(9)

assert all(k in globals() for k in ["db_work", "api_work", "_apply_matches"]), \
    "Run the Matching Framework cell first."

label = "Attempt 9: first-word match + region"

def _first_word(s):
    if not isinstance(s, str):
        return ""
    s = s.lower().strip()
    s = re.sub(r"[^a-z0-9\s]+", " ", s)
    parts = s.split()
    return parts[0] if parts else ""

# DB side (only unmatched)
left = db_work.loc[db_work["matched_api_code"].isna(), ["facility_name", "region_nem"]].copy()
left["first_word"] = left["facility_name"].map(_first_word)
left["region_nem"] = left["region_nem"].astype(str).str.strip().str.upper()
left = left.reset_index().rename(columns={"index": "db_idx"})

# API side
right = api_work.loc[:, ["API_name", "region_api", "facility_code_api"]].copy()
right["first_word"] = right["API_name"].map(_first_word)
right["region_api"] = right["region_api"].astype(str).str.strip().str.upper()

# Make (first_word, region) unique deterministically:
# sort stable, then keep the first facility_code_api per pair
right_unique = (
    right.sort_values(
        ["first_word", "region_api", "facility_code_api"],
        ascending=[True, True, True],
        kind="mergesort"
    )
    .drop_duplicates(subset=["first_word", "region_api"], keep="first")
)

# Deterministic merge: now right side is unique per key -> m:1 is valid
joined = (
    left.merge(
        right_unique[["first_word", "region_api", "facility_code_api"]],
        left_on=["first_word", "region_nem"],
        right_on=["first_word", "region_api"],
        how="left",
        sort=False,
        validate="m:1"
    )
    .sort_values(["db_idx"], kind="mergesort")
    .set_index("db_idx")
)

# Candidates aligned to db_work index
candidates = joined[["facility_code_api"]]

# Apply (framework handles dedup + progress/preview)
_ = _apply_matches(label, candidates)

[Attempt 9: first-word match + region] API: total=419, matched=235, remaining=184, progress=56.09%
[Attempt 9: first-word match + region] duplicate API codes after guard: 0 | applied=3 | dropped_nan=146 | dropped_used=62

[Attempt 9: first-word match + region] Preview of newly matched rows (3 rows; showing up to 10):


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,facility_name,fuel_type,fuel_category,is_renewable
0,EILDON,Eildon,Eildon,VIC1,Eildon Pondage Hydro,Hydro,HYDRO,True
1,SNOWNTH,Snowtown North,Snowtown North,SA1,SNOWTOWN WIND FARM STAGE 2,Wind,WIND,True
2,SHOAL,Shoalhaven Starches,Shoalhaven Starches,NSW1,Shoalhaven Scheme,Hydro,HYDRO,True


In [60]:
# Use API frame from the framework
api_all = api_work.copy()
api_all["facility_code_api"] = api_all["facility_code_api"].astype(str).str.upper().str.strip()

# Prepare DB slice
db_cols_to_attach = [
    "matched_api_code",
    "facility_name",
    "fuel_type",
    "fuel_category",
    "is_renewable",
    "region_nem",
    "matched_strategy",
]
db_slice = db_work.loc[:, db_cols_to_attach].copy()
db_slice["matched_api_code"] = db_slice["matched_api_code"].astype(str).str.upper().str.strip()

# Deduplicate DB keys
dup_cnt = db_slice["matched_api_code"].duplicated(keep=False).sum()
if dup_cnt > 0:
    db_slice = db_slice.drop_duplicates(subset=["matched_api_code"], keep="first")

# Left-join: All API rows preserved, DB columns attached where matched
final_df = api_all.merge(
    db_slice,
    left_on="facility_code_api",
    right_on="matched_api_code",
    how="left",
    validate="m:1"
)

# Convenience flag & ordering
final_df["matched_flag"] = final_df["matched_api_code"].notna()
final_df = final_df.sort_values(["matched_flag", "facility_code_api"], ascending=[False, True])

# Save to CSV
out_dir = "DATA/OUTPUT"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "api_db_join_attempt.csv")
final_df.to_csv(out_path, index=False)

# Summary (API-based)
total_api = len(api_all)
matched_api = int(final_df["matched_flag"].sum())
progress_api = 0.0 if total_api == 0 else matched_api / total_api * 100.0

print(f"CSV saved → {out_path}")
print(f"API rows (expected 515): {total_api}")
print(f"Matched (API-based):    {matched_api} ({progress_api:.2f}%)")

display(final_df.head(10))


CSV saved → DATA/OUTPUT\api_db_join_attempt.csv
API rows (expected 515): 419
Matched (API-based):    235 (56.09%)


Unnamed: 0,facility_code_api,facility_name_api,API_name,region_api,lat_api,lng_api,name_norm_api,matched_api_code,facility_name,fuel_type,fuel_category,is_renewable,region_nem,matched_strategy,matched_flag
147,AGLHAL,Hallett,Hallett,SA1,-33.34931,138.752633,hallett,AGLHAL,Hallett (Stage 2) Wind Farm,Wind,WIND,True,SA1,A7_relaxed_fuzzy_region_85,True
184,AGLSITA,Kemps Creek,Kemps Creek,NSW1,-33.874591,150.761858,kempscreek,AGLSITA,Kemps Creek Landfill,Landfill Gas,BIO,True,NSW1,A5_one_rare_token,True
3,APPIN,Appin,Appin,NSW1,-34.210868,150.792711,appin,APPIN,Appin CSM Power Station,Coal Seam Methane,GAS,False,NSW1,A5_one_rare_token,True
5,AVLSF,Avonlie,Avonlie,NSW1,-34.919115,146.60954,avonlie,AVLSF,Avonlie Solar Farm,Solar,SOLAR,True,NSW1,A4_subset_norm_unique,True
7,BAKING,Baking Board,Baking Board,QLD1,-26.721166,150.55417,bakingboard,BAKING,Baking Board Solar Farm,Solar,SOLAR,True,QLD1,A4_subset_norm_unique,True
9,BALBESS,Ballarat,Ballarat,VIC1,-37.567452,143.852741,ballarat,BALBESS,Ballarat Solar Park,Solar,SOLAR,True,VIC1,A5_one_rare_token,True
10,BANGOWF,Bango,Bango,NSW1,-34.767208,148.921499,bango,BANGOWF,Bango Wind Farm,Wind,WIND,True,NSW1,A4_subset_norm_unique,True
12,BANNSP,Bannerton,Bannerton,VIC1,-34.67263,142.750362,bannerton,BANNSP,Bannerton Solar Farm,Solar,SOLAR,True,VIC1,A4_subset_norm_unique,True
11,BAPS,Banimboola,Banimboola,VIC1,-36.534524,147.459788,banimboola,BAPS,Banimboola Hydro,Hydro,HYDRO,True,VIC1,A4_subset_norm_unique,True
13,BARCALDN,Barcaldine,Barcaldine,QLD1,-23.552171,145.314851,barcaldine,BARCALDN,Barcaldine Power Station Facility,Gas,GAS,False,QLD1,Attempt 2: exact name_norm + region,True


In [61]:
final_df = final_df.rename(columns={
    'facility_code_api': 'facility_code',
    'region_api': 'network_region',
    'lat_api': 'lat',
    'lng_api': 'lng'
})

final_df['facility_name'] = final_df.apply(
    lambda row: row['facility_name'] if row['matched_flag'] else row['facility_name_api'],
    axis=1
)

final_df = final_df[['facility_code', 'facility_name', 'network_region', 'lat', 'lng', 'fuel_type', 'matched_flag']]



For the non-matched rows, we will keep them but fill the missing columns (fuel_type) with OpenElectricity data.

In [62]:
save_dataset(final_df, "./DATA/EXTRACTED/electricity_facilities_matched.csv")

Saved: DATA\EXTRACTED\electricity_facilities_matched.csv


In [63]:
units_df = pd.read_csv("DATA/EXTRACTED/electricity_units_facilities.csv")
facilities_df = pd.read_csv("DATA/EXTRACTED/electricity_facilities_matched.csv")

units_df['capacity_registered'] = pd.to_numeric(units_df['capacity_registered'], errors='coerce')

units_max= (units_df
             .sort_values('capacity_registered', ascending=False)
             .groupby('facility_code')
             .first()
             .reset_index())

fueltech_to_fuel_type = {
    'battery_discharging': 'Battery',
    'battery_charging': 'Battery',
    'battery': 'Battery',
    'wind': 'Wind',
    'solar_utility': 'Solar',
    'gas_ocgt': 'Gas',
    'bioenergy_biogas': 'Bioenergy',
    'distillate': 'Distillate',
    'gas_wcmg': 'Gas',
    'hydro': 'Hydro',
    'gas_ccgt': 'Gas',
    'gas_recip': 'Gas',
    'coal_black': 'Black Coal',
    'bioenergy_biomass': 'Bioenergy',
    'coal_brown': 'BrownCoal',
    'gas_steam': 'Gas',
    'pumps': 'Pumps',
}

units_max['fuel_type_mapped'] = units_max['fueltech_id'].map(fueltech_to_fuel_type)

facilities_df = facilities_df.merge(
    units_max[['facility_code', 'fuel_type_mapped']], 
    on='facility_code', 
    how='left'
)

facilities_df['fuel_type'] = facilities_df['fuel_type'].fillna(facilities_df['fuel_type_mapped'])
facilities_df = facilities_df.drop(columns=['fuel_type_mapped'])

In [64]:
facilities_df

Unnamed: 0,facility_code,facility_name,network_region,lat,lng,fuel_type,matched_flag
0,AGLHAL,Hallett (Stage 2) Wind Farm,SA1,-33.349310,138.752633,Wind,True
1,AGLSITA,Kemps Creek Landfill,NSW1,-33.874591,150.761858,Landfill Gas,True
2,APPIN,Appin CSM Power Station,NSW1,-34.210868,150.792711,Coal Seam Methane,True
3,AVLSF,Avonlie Solar Farm,NSW1,-34.919115,146.609540,Solar,True
4,BAKING,Baking Board Solar Farm,QLD1,-26.721166,150.554170,Solar,True
...,...,...,...,...,...,...,...
414,YABULU,Townsville,QLD1,-19.200374,146.618265,Gas,False
415,YALLOURN,Yallourn W,VIC1,-38.177596,146.347508,BrownCoal,False
416,YARWUN,Yarwun,QLD1,-23.830200,151.149692,Gas,False
417,YAWWF,Yawong,VIC1,-36.471022,143.361722,Wind,False


Save the cleaned dataset into a csv file.

In [65]:
save_dataset(facilities_df, "./DATA/EXTRACTED/electricity_facilities_matched.csv")

Saved: DATA\EXTRACTED\electricity_facilities_matched.csv


### <b> <span style="color:pink">1.4 Get All Power Generated and CO2 Emissions per Facility</span></b>


Facility data will return total data instead of facility specific data if we do not specify the facility code in tha API call. Hence, we need to pass the facility code we get from previous API call to this endpoint.

In [55]:
# get all facility code
FACILITY_LIST = facilities_df["facility_code"].tolist()
print(f"Total facilities: {len(FACILITY_LIST)}")

Total facilities: 419


However, since the parameter only accept 30 max characters and we have more than 500 facilities, passing all facility code at once will result in error. To get around that, we will use the batching strategy.

In [56]:
OUT_DIR = "./DATA/EXTRACTED"
os.makedirs(OUT_DIR, exist_ok=True)
ENDPOINT = "data/facilities/NEM"
batch_size = 5

# function chunk the facility code list into batches
def chunk_list(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n], i, min(i+n, len(lst))  

# batch retrieval
def batch_retrieval(batch_size:int):
    for batch, start, end in chunk_list(FACILITY_LIST, batch_size):
        batch_id = math.ceil(end/batch_size)
        cache_path = os.path.join(OUT_DIR, f"batch_{start+1:04d}_{end:04d}.json")

        if os.path.exists(cache_path):
            print(f"Batch {batch_id}: {start+1}-{end} already cached.")
            continue

        print(f"Fetching batch {batch_id}: facilities {start+1}–{end} ({batch})")

        # params
        params = {
            'network_code': 'NEM',
            'metrics': {'power', 'emissions'},
            'interval': '5m',
            "date_start": "2025-10-01",
            "date_end": "2025-10-08",
            "facility_code": {f for f in batch},
        }

        try:
            r = requests.get(f"{BASE_URL}{ENDPOINT}", headers=HEADERS, params=params, timeout=90)
            if r.status_code == 200:
                payload = r.json()
                with open(cache_path, "w") as f:
                    json.dump(payload, f, indent=2)
                print(f"Saved {cache_path}")
            else:
                print(f"HTTP {r.status_code}: {r.text[:150]}")
        except Exception as e:
            print(f"Batch {batch_id}: {e}")

        time.sleep(0.3)

# function to flatten the payload
def flatten(payload):
    rows = []
    for block in payload.get("data", []):
        metric   = block.get("metric")
        unit     = block.get("unit")
        interval = block.get("interval")
        for res in block.get("results", []):
            unit_code = (res.get("columns") or {}).get("unit_code")
            for ts, val in res.get("data", []):
                rows.append({"timestamp": ts, "unit_code": unit_code,
                            "metric": metric, "interval": interval, "unit": unit, "value": val})
    return rows


In [57]:
# Retrieve all facility data in batch of 5 per API call
batch_retrieval(5)

Fetching batch 1: facilities 1–5 (['AGLHAL', 'AGLSITA', 'APPIN', 'AVLSF', 'BAKING'])
Saved ./DATA/EXTRACTED\batch_0001_0005.json
Fetching batch 2: facilities 6–10 (['BALBESS', 'BANGOWF', 'BANNSP', 'BAPS', 'BARCALDN'])
Saved ./DATA/EXTRACTED\batch_0006_0010.json
Fetching batch 3: facilities 11–15 (['BARRON', 'BASTYAN', 'BBP31', 'BDONGHYD', 'BERWICK'])
Saved ./DATA/EXTRACTED\batch_0011_0015.json
Fetching batch 4: facilities 16–20 (['BERYLSF', 'BHB', 'BHWF', 'BIALAWF', 'BLAYNEY'])
Saved ./DATA/EXTRACTED\batch_0016_0020.json
Fetching batch 5: facilities 21–25 (['BNGSF1', 'BNGSF2', 'BOCOROCK', 'BODWF', 'BOLIVAR'])
Saved ./DATA/EXTRACTED\batch_0021_0025.json
Fetching batch 6: facilities 26–30 (['BOMENSF', 'BPLANDF', 'BROOKLYN', 'BRYB1WF1', 'BURRIN'])
Saved ./DATA/EXTRACTED\batch_0026_0030.json
Fetching batch 7: facilities 31–35 (['BUTLERSG', 'BWTR1', 'CALLIDEC1', 'CALL_B', 'CANUNDA1'])
Saved ./DATA/EXTRACTED\batch_0031_0035.json
Fetching batch 8: facilities 36–40 (['CAPBES', 'CATHROCK', 'CES

### <b> <span style="color:pink">1.5 Get Market Price and Demand Data</span></b>


- Market data was retrieved from the OpenElectricity API, focusing on two key metrics which are price and demand_energy at five-minute intervals for the NEM network.
- The data was then flattened and cleaned into a structured format containing timestamp, network region, metric, and value.
- The dataset was stored in wide formats to support integration and analysis.


In [89]:
# UTC window covers 1–7 Oct in Australia/Sydney (UTC+10) with respecting the API's 7-day limit for 5m interval

START_UTC = "2025-10-01"
END_UTC   = "2025-10-08"

market_df = fetch_market_data(
    network="NEM",
    start_utc=START_UTC,
    end_utc=END_UTC,
    metrics=("price", "demand_energy"),
    interval="5m",
    primary_grouping="network_region"
)

print("Raw market rows:", len(market_df))
print("Columns:", list(market_df.columns))
display(market_df.head())

# Cache raw pull
save_dataset(market_df, "./DATA/EXTRACTED/market_data_5m.csv")

Response status: 200
Response url: https://api.openelectricity.org.au/v4/market/network/NEM?metrics=price&metrics=demand_energy&interval=5m&date_start=2025-10-01&date_end=2025-10-08&primary_grouping=network_region&with_clerk=false
Raw market rows: 20160
Columns: ['timestamp', 'network_region', 'metric', 'value']


Unnamed: 0,timestamp,network_region,metric,value
0,2025-09-30 14:00:00+00:00,NSW1,price,56.98
1,2025-09-30 14:00:00+00:00,NSW1,demand_energy,0.5931
2,2025-09-30 14:00:00+00:00,QLD1,price,54.82
3,2025-09-30 14:00:00+00:00,QLD1,demand_energy,0.5015
4,2025-09-30 14:00:00+00:00,SA1,price,8.11


Saved: DATA\EXTRACTED\market_data_5m.csv


## <b> <span style="color:orange">2. Data Integration and Caching</span></b>


For power and emission data per facility, we need to perform some pre-processing to store them into a cached csv file. Specifically, for this process we need to:
1. Combine all cached .json data of into one dataframe.
2. Sum the facilitiy data to get total power and emissions per facility (some facilities have more than one units).
3. Append additional information to each facility (e.g. lat, lon, facility_name, etc).

In [95]:
# combine all json cache into one dataframe
records = []
for path in glob.glob("./DATA/EXTRACTED/*.json"):
    payload = json.load(open(path))
    
    records.extend(flatten(payload))

series_df = pd.DataFrame(records)

lookup = pd.read_csv("./DATA/EXTRACTED/electricity_units_facilities.csv")[["unit_code","facility_code"]]
series_df = series_df.merge(lookup, on="unit_code", how="left")
facility_df = (series_df.groupby(["timestamp","facility_code","metric"], as_index=False)["value"].sum())

facilities_df = pd.read_csv("./DATA/EXTRACTED/electricity_facilities_matched.csv")[
    ["facility_code", "facility_name", "network_region", "lat", "lng", "fuel_type"]
]
facility_df = facility_df.merge(facilities_df, on="facility_code", how="left")

# reorder columns for clarity
facility_df = facility_df[
    [
        "timestamp", "facility_code", "facility_name", "network_region", "fuel_type",
        "lat", "lng", "metric", "value"
    ]
]


In [101]:
pivot_df = (
    facility_df
    .pivot_table(
        index=[
            "timestamp", "facility_code", "facility_name",
            "network_region", "fuel_type", "lat", "lng"
        ],
        columns="metric",
        values="value"
    )
    .reset_index()
)

pivot_df.columns.name = None  # remove 'metric' label
pivot_df = pivot_df.rename_axis(None, axis=1)

In [102]:
pivot_df["timestamp"] = pd.to_datetime(pivot_df["timestamp"])
pivot_df.dtypes

timestamp         datetime64[ns, UTC+10:00]
facility_code                        object
facility_name                        object
network_region                       object
fuel_type                            object
lat                                 float64
lng                                 float64
emissions                           float64
power                               float64
dtype: object

Next, we append the matket price and demand to each row.

In [103]:
# Market Price and Demand by Facility 
market_df = pd.read_csv("./DATA/EXTRACTED/market_data_5m.csv")

# Standardise & make local timestamp
mkt = market_df.copy()
mkt["timestamp"] = pd.to_datetime(mkt["timestamp"], utc=True, errors="coerce")
mkt["timestamp"] = mkt["timestamp"].dt.tz_convert("Australia/Sydney")

# Pivot by timestamp_utc, region to price and demand_energy
market_wide = (
    mkt.pivot_table(index=["timestamp", "network_region"],
                    columns="metric",
                    values="value",
                    aggfunc="first")
       .reset_index()
)

market_wide

metric,timestamp,network_region,demand_energy,price
0,2025-10-01 00:00:00+10:00,NSW1,0.5931,56.98
1,2025-10-01 00:00:00+10:00,QLD1,0.5015,54.82
2,2025-10-01 00:00:00+10:00,SA1,0.1309,8.11
3,2025-10-01 00:00:00+10:00,TAS1,0.0749,0.12
4,2025-10-01 00:00:00+10:00,VIC1,0.4095,8.95
...,...,...,...,...
10055,2025-10-08 00:55:00+11:00,NSW1,0.5814,94.22
10056,2025-10-08 00:55:00+11:00,QLD1,0.5143,65.83
10057,2025-10-08 00:55:00+11:00,SA1,0.1144,220.66
10058,2025-10-08 00:55:00+11:00,TAS1,0.0771,0.06


In [104]:
df_merged = pivot_df.merge(
    market_wide,
    left_on=['network_region','timestamp'],
    right_on=['network_region','timestamp'],
    how='left'
)

In [105]:
df_merged

Unnamed: 0,timestamp,facility_code,facility_name,network_region,fuel_type,lat,lng,emissions,power,demand_energy,price
0,2025-10-01 00:00:00+10:00,0MREH,Melbourne A1,VIC1,Battery,-37.661274,144.726302,0.0000,0.00,0.4095,8.95
1,2025-10-01 00:00:00+10:00,0MREHA2,Melbourne A2,VIC1,Battery,-37.663934,144.726927,0.0000,0.00,0.4095,8.95
2,2025-10-01 00:00:00+10:00,0TARONGBESS,Tarong,QLD1,Battery,-26.780051,151.912068,0.0000,0.00,0.5015,54.82
3,2025-10-01 00:00:00+10:00,0WAMBOWF,Wambo,QLD1,Wind,-26.603045,151.246876,0.0000,65.23,0.5015,54.82
4,2025-10-01 00:00:00+10:00,ADP,Adelaide Desalination,SA1,Solar,-35.096948,138.484061,0.0000,0.00,0.1309,8.11
...,...,...,...,...,...,...,...,...,...,...,...
668348,2025-10-07 23:55:00+10:00,YARANSF,Yarranlea Solar Farm,QLD1,Solar,-27.708939,151.532696,0.0000,0.00,0.5143,65.83
668349,2025-10-07 23:55:00+10:00,YARWUN,Yarwun,QLD1,Gas,-23.830200,151.149692,7.1024,138.99,0.5143,65.83
668350,2025-10-07 23:55:00+10:00,YATSF1,YATPOOL SOLAR FARM,VIC1,Solar,-34.380730,142.205340,0.0000,0.00,0.3830,82.83
668351,2025-10-07 23:55:00+10:00,YENDONWF,Yendon,VIC1,Wind,-37.630952,144.022463,0.0000,89.87,0.3830,82.83


In [106]:
# save to csv
save_dataset(df_merged, "DATA/EXTRACTED/consolidated_facilities_cleaned.csv")

Saved: DATA\EXTRACTED\consolidated_facilities_cleaned.csv


## <b> <span style="color:orange">3. Data Publishing via MTQQ</span></b>


In [107]:
import mtqq_publisher as publisher
importlib.reload(publisher)

<module 'mtqq_publisher' from 'd:\\Rengga\\venv\\DE\\Assignment 2_Tut07_G04\\mtqq_publisher.py'>

In [108]:
df = pd.read_csv("DATA/EXTRACTED/consolidated_facilities_cleaned.csv")

publisher.publish_via_mqtt_broker(df)

Starting continuous data stream publisher...
Connected with result code Success
Ready to publish to topic: COMP5339/T07G04/facilities

Publishing 2025-10-01 data...

[PUBLISHED] 0MREH @ 2025-10-01 00:00:00+10:00
[PUBLISHED] PORTLCN @ 2025-10-01 00:00:00+10:00
[PUBLISHED] PIONEER @ 2025-10-01 00:00:00+10:00
[PUBLISHED] PIBESS @ 2025-10-01 00:00:00+10:00
[PUBLISHED] PAREPW @ 2025-10-01 00:00:00+10:00
[PUBLISHED] PALOONA @ 2025-10-01 00:00:00+10:00
[PUBLISHED] OSBORNE @ 2025-10-01 00:00:00+10:00
[PUBLISHED] OAKLAND @ 2025-10-01 00:00:00+10:00
[PUBLISHED] PORTWF @ 2025-10-01 00:00:00+10:00
[PUBLISHED] OAKEY2SF @ 2025-10-01 00:00:00+10:00
[PUBLISHED] OAKEY @ 2025-10-01 00:00:00+10:00
[PUBLISHED] NYNGAN @ 2025-10-01 00:00:00+10:00
[PUBLISHED] NUMURKSF @ 2025-10-01 00:00:00+10:00
[PUBLISHED] NPPPPS @ 2025-10-01 00:00:00+10:00
[PUBLISHED] NEWPORT @ 2025-10-01 00:00:00+10:00
[PUBLISHED] NEWENSF @ 2025-10-01 00:00:00+10:00
[PUBLISHED] NEVERSF @ 2025-10-01 00:00:00+10:00
[PUBLISHED] OAKEY1SF @ 20