# <b> <span style="color:white">Electricity Sector Data Integration & Augmentation</span></b>


# <b> <span style="color:white">GROUP 04</span></b>


| Name                   | SID       | Unikey   |
| ---------------------- | --------- | -------- |
| Putu Eka Udiyani Putri | 550067302 | pput0940 |
| Rengga Firmandika      | 550126632 | rfir0117 |
| Vincentius Ansel Suppa | 550206406 | vsup0468 |


## <b> <span style="color:orange">1. Data Acquisition</span></b>


### <b> <span style="color:pink">a. National Greenhouse and Energy Reporting (NGER)</span></b>


In [15]:
import re
import requests
from pathlib import Path


# Root of the CER public data-hub API; endpoint paths are appended to it.
BASE = "https://api.cer.gov.au/datahub-public/v1"
# Scheme identifier used in the API paths below.
SCHEME = "NGER"
# Headers sent with every request to the API.
HEADERS = {
    "Accept": "application/json",
    "User-Agent": "Mozilla/5.0",
}

# Folder that receives the downloaded files; created up front when missing.
OUT = Path("DATA1_ELECTRICITY")
OUT.mkdir(parents=True, exist_ok=True)


In [16]:
# helper
def get_json(path: str):
    """Request ``BASE + path`` and return the decoded JSON body.

    The request carries ``HEADERS`` and a 60-second timeout. A 4xx/5xx
    response is turned into an exception by ``raise_for_status``.
    """
    response = requests.get(BASE + path, headers=HEADERS, timeout=60)
    response.raise_for_status()
    return response.json()

In [17]:
# Confirm the scheme is published before querying its catalogue
schemes = get_json("/api/Schemes")
nger = None
for scheme in schemes:
    if (scheme.get("id") or "").upper() == SCHEME:
        nger = scheme
        break
if not nger:
    raise SystemExit("Scheme NGER not found in /api/Schemes")

# Fetch every DatasetCatalogItem registered under the scheme
items = get_json(f"/api/Schemes/{SCHEME}/DatasetCatalogItems")
print(f"Total catalog items under {SCHEME}: {len(items)}")

# Keep only the electricity sector datasets for financial years 2014-15 to 2023-24
PHRASE = "greenhouse and energy information by designated generation facility"
YEARS = [f"{start}-{(start + 1) % 100:02d}" for start in range(2014, 2024)]
YEARS_SET = set(YEARS)

targets = []
for entry in items:
    catalog_id = str(entry.get("id") or "")
    label = str(entry.get("displayName") or "").lower()
    if PHRASE not in label:
        continue
    # Titles may use an en dash or a hyphen between the two year parts
    found = re.search(r"(20\d{2}[–-]\d{2})", label)
    if found is None:
        continue
    year_label = found.group(1).replace("–", "-")
    if year_label in YEARS_SET:
        targets.append((year_label, catalog_id))

# Sort chronologically
def fy_key(fy: str):
    """Convert a financial-year label such as ``"2014-15"`` into ``(2014, 15)``.

    The integer pair is used as a sort key so labels order chronologically.
    """
    start_year, end_suffix = map(int, fy.split("-"))
    return (start_year, end_suffix)
# Order the (financial year, catalog id) pairs by financial year, in place
targets.sort(key=lambda target: fy_key(target[0]))

print("\nTargets discovered:")
for fin_year, catalog_id in targets:
    print(f"  {fin_year}  {catalog_id}")

Total catalog items under NGER: 112

Targets discovered:
  2014-15  ID0075
  2015-16  ID0076
  2016-17  ID0077
  2017-18  ID0078
  2018-19  ID0079
  2019-20  ID0080
  2020-21  ID0081
  2021-22  ID0082
  2022-23  ID0083
  2023-24  ID0243


In [18]:
# Download raw files: for each target try CSV first, then fall back to XLSX
downloaded = 0
for fy, cid in targets:
    for ext in ("csv", "xlsx"):
        url = f"{BASE}/api/Dataset/{SCHEME}/dataset/{cid}.{ext}"
        try:
            resp = requests.get(url, headers=HEADERS, timeout=120)
            if not resp.ok or len(resp.content) <= 500:
                # Error status or a body too small to pass the sanity check
                print(f"[TRY] {fy} no {ext.upper()} (status {resp.status_code})")
                continue
            out = OUT / f"nger_{fy}.{ext}"
            out.write_bytes(resp.content)
            print(f"[OK] {fy} -> {out.name} ({len(resp.content)/1024:.1f} KB)")
            downloaded += 1
            break
        except Exception as e:
            print(f"[ERR] {fy} {ext.upper()}: {e}")
    else:
        # Reached only when no extension ended the inner loop with a saved file
        print(f"[WARN] {fy} ({cid}) no usable file")

print(f"\nDone. Downloaded {downloaded}/{len(targets)} files into {OUT.resolve()}")

[OK] 2014-15 -> nger_2014-15.csv (48.0 KB)
[OK] 2015-16 -> nger_2015-16.csv (48.5 KB)
[OK] 2016-17 -> nger_2016-17.csv (52.0 KB)
[OK] 2017-18 -> nger_2017-18.csv (55.2 KB)
[OK] 2018-19 -> nger_2018-19.csv (63.1 KB)
[OK] 2019-20 -> nger_2019-20.csv (66.6 KB)
[OK] 2020-21 -> nger_2020-21.csv (69.9 KB)
[OK] 2021-22 -> nger_2021-22.csv (73.2 KB)
[OK] 2022-23 -> nger_2022-23.csv (73.7 KB)
[OK] 2023-24 -> nger_2023-24.csv (81.4 KB)

Done. Downloaded 10/10 files into /Users/ekaudiyani/Documents/KULIAH/SEMESTER 2/2. Data Engineering/Assignment_1/DATA1_ELECTRICITY


### <b> <span style="color:pink">b. Clean Energy Regulator (CER)</span></b>


In [None]:
# import time
# import re
# from pathlib import Path
# import pandas as pd

# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.action_chains import ActionChains
# from selenium.webdriver.support.ui import WebDriverWait, Select
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
# from webdriver_manager.chrome import ChromeDriverManager

# URL = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"

# # Output folder
# OUT_DIR = Path("/Users/ekaudiyani/Documents/KULIAH/SEMESTER 2/2. Data Engineering/Assignment_1/DATA2_LRET")
# OUT_DIR.mkdir(parents=True, exist_ok=True)

# # -------------------------
# # Selenium setup (Headless)
# # -------------------------
# def make_driver():
#     from selenium.webdriver.chrome.options import Options
#     options = Options()
#     options.add_argument("--headless=new")
#     options.add_argument("--no-sandbox")
#     options.add_argument("--disable-gpu")
#     options.add_argument("--window-size=1600,2400")
#     options.add_argument("--disable-dev-shm-usage")
#     options.add_argument("--lang=en-US")
#     options.add_argument(
#         "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
#         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
#     )
#     service = Service(ChromeDriverManager().install())
#     return webdriver.Chrome(service=service, options=options)

# # -------------------------
# # Helpers
# # -------------------------
# def norm(s: str) -> str:
#     if s is None:
#         return ""
#     s = s.replace("\xa0", " ")
#     return re.sub(r"\s+", " ", s).strip()

# def scroll_into_view(driver, el):
#     try:
#         driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", el)
#         time.sleep(0.2)
#     except Exception:
#         pass

# def find_table_after_heading(driver, heading_text: str):
#     xpath_heading = (
#         "//*[self::h1 or self::h2 or self::h3 or self::h4]"
#         f"[contains(translate(normalize-space(.),"
#         " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),"
#         f" '{heading_text.lower()}')]"
#     )
#     heading = WebDriverWait(driver, 25).until(
#         EC.presence_of_element_located((By.XPATH, xpath_heading))
#     )
#     scroll_into_view(driver, heading)
#     table = heading.find_element(By.XPATH, "following::table[1]")
#     scroll_into_view(driver, table)
#     return table

# def extract_headers_from_table(table_el):
#     ths = table_el.find_elements(By.CSS_SELECTOR, "thead th")
#     headers = [norm(th.text) for th in ths]
#     if not headers:
#         first_row_tds = table_el.find_elements(By.CSS_SELECTOR, "tbody tr:first-child td")
#         if first_row_tds:
#             headers = [f"col_{i+1}" for i in range(len(first_row_tds))]
#     return headers

# def try_set_length_near_table(table_el):
#     candidates = []
#     try:
#         candidates += table_el.find_elements(By.XPATH, "following::select[contains(@name,'length')][1]")
#         candidates += table_el.find_elements(By.XPATH, "following::div[contains(@class,'length')]//select[1]")
#     except Exception:
#         pass
#     if not candidates:
#         try:
#             candidates = table_el.find_elements(By.XPATH, "ancestor::div[1]//select")
#         except Exception:
#             pass
#     for sel in candidates:
#         try:
#             for want in ["All", "100", "200", "250", "500"]:
#                 for opt in sel.find_elements(By.TAG_NAME, "option"):
#                     if want.lower() in norm(opt.text).lower():
#                         Select(sel).select_by_visible_text(opt.text)
#                         time.sleep(1)
#                         return True
#         except Exception:
#             continue
#     return False

# def collect_all_rows_with_pagination(driver, table_el, max_pages=50):
#     headers = extract_headers_from_table(table_el)

#     def read_page_rows():
#         rows = []
#         for r in table_el.find_elements(By.CSS_SELECTOR, "tbody tr"):
#             tds = r.find_elements(By.CSS_SELECTOR, "td")
#             if not tds:
#                 continue
#             rows.append([norm(td.text) for td in tds])
#         return rows

#     # Step 1: try All/large page length
#     changed = try_set_length_near_table(table_el)
#     rows = read_page_rows()
#     if changed:
#         return headers, rows

#     # Step 2: paginate with Next
#     all_rows = []
#     all_rows.extend(rows)

#     def first_cell_text():
#         try:
#             return table_el.find_element(By.CSS_SELECTOR, "tbody tr td").text
#         except Exception:
#             return ""

#     for _ in range(max_pages):
#         before = norm(first_cell_text())
#         next_btn = None
#         for xpath in [
#             ".//following::a[contains(.,'Next')][1]",
#             ".//following::button[contains(.,'Next')][1]",
#             ".//following::li[contains(@class,'next')]/a[1]",
#             ".//following::a[@aria-label='Next'][1]",
#         ]:
#             try:
#                 candidate = table_el.find_element(By.XPATH, xpath)
#                 if candidate.is_displayed():
#                     next_btn = candidate
#                     break
#             except Exception:
#                 continue
#         if not next_btn:
#             break
#         if "disabled" in (next_btn.get_attribute("class") or "").lower():
#             break

#         scroll_into_view(driver, next_btn)
#         try:
#             ActionChains(driver).move_to_element(next_btn).click().perform()
#         except:
#             next_btn.click()

#         try:
#             WebDriverWait(driver, 10).until(
#                 lambda d: norm(first_cell_text()) != before
#             )
#         except TimeoutException:
#             break

#         all_rows.extend(read_page_rows())

#     return headers, all_rows

# def to_dataframe(headers, rows):
#     max_cols = max(len(headers), max((len(r) for r in rows), default=0))
#     if len(headers) < max_cols:
#         headers = headers + [f"col_{i}" for i in range(len(headers) + 1, max_cols + 1)]
#     rows = [r + [""] * (max_cols - len(r)) for r in rows]
#     df = pd.DataFrame(rows, columns=headers)
#     return df.dropna(how="all")

# def smart_rename(df):
#     mapping = {}
#     for c in df.columns:
#         cl = norm(c).lower()
#         if "project" in cl and "name" in cl:
#             mapping[c] = "Project Name"
#         elif cl == "state" or "state" in cl:
#             mapping[c] = "State"
#         elif ("mw" in cl and "capacity" in cl) or cl == "mw":
#             mapping[c] = "MW Capacity"
#         elif "fuel" in cl:
#             mapping[c] = "Fuel Source"
#         elif "committed" in cl and ("date" in cl or "month" in cl or "year" in cl):
#             mapping[c] = "Committed Date (Month/Year)"
#     return df.rename(columns=mapping)

# def scrape_tables():
#     driver = make_driver()
#     try:
#         driver.get(URL)
#         WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
#         time.sleep(1)

#         # Committed
#         committed_table = find_table_after_heading(driver, "Committed power stations")
#         headers_c, rows_c = collect_all_rows_with_pagination(driver, committed_table)
#         df_committed = to_dataframe(headers_c, rows_c)
#         df_committed = smart_rename(df_committed)

#         # Probable
#         probable_table = find_table_after_heading(driver, "Probable power stations")
#         headers_p, rows_p = collect_all_rows_with_pagination(driver, probable_table)
#         df_probable = to_dataframe(headers_p, rows_p)
#         df_probable = smart_rename(df_probable)

#         return df_committed, df_probable
#     finally:
#         driver.quit()

# def main():
#     df_committed, df_probable = scrape_tables()

#     print("Committed shape:", df_committed.shape)
#     print("Probable  shape:", df_probable.shape)

#     # Save only CSVs in target folder
#     committed_path = OUT_DIR / "committed_power_stations.csv"
#     probable_path = OUT_DIR / "probable_power_stations.csv"

#     df_committed.to_csv(committed_path, index=False)
#     df_probable.to_csv(probable_path, index=False)

#     print("Saved:", committed_path)
#     print("Saved:", probable_path)

# if __name__ == "__main__":
#     main()


Committed shape: (25, 5)
Probable  shape: (49, 4)
Saved: /Users/ekaudiyani/Documents/KULIAH/SEMESTER 2/2. Data Engineering/Assignment_1/DATA2_LRET/committed_power_stations.csv
Saved: /Users/ekaudiyani/Documents/KULIAH/SEMESTER 2/2. Data Engineering/Assignment_1/DATA2_LRET/probable_power_stations.csv


In [None]:
import time
from pathlib import Path
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

# Function to scrape the data
def scrape_CER_data(table_id: str):
    """Scrape one DataTables table from the CER large-scale renewable energy page.

    Opens the page in headless Chrome, switches the table to 100 rows per
    page, then reads every page by following the table's "next" button until
    that button is marked disabled.

    Args:
        table_id: DOM id of the table element (e.g. ``"DataTables_Table_1"``).

    Returns:
        A ``pandas.DataFrame`` with one row per scraped table row. Column
        names come from the table's ``thead th`` cells when any are found.

    Raises:
        selenium.common.exceptions.TimeoutException: if the table or its
            page-length selector is not present within 25 seconds.
        selenium.common.exceptions.NoSuchElementException: if the "next"
            button cannot be found inside the table wrapper.
    """
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.chrome.options import Options

    url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"

    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--window-size=2000,1200")
    opts.add_argument("user-agent=Mozilla/5.0")

    driver = webdriver.Chrome(options=opts)
    wait = WebDriverWait(driver, 25)

    try:
        driver.get(url)

        # Wait until the table is rendered
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, f"#{table_id}")))

        # Fewer page turns are needed when each page shows the maximum number of rows
        length_sel = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, f"select[name='{table_id}_length']"))
        )
        Select(length_sel).select_by_visible_text("100")

        # Wait for the redraw
        time.sleep(2)
        try:
            wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".dataTables_processing")))
        except TimeoutException:
            # Best effort only: carry on even if the processing indicator is still visible.
            # Only the wait timeout is ignored, so Ctrl-C and real errors still surface.
            pass

        # Extract headers and rows
        headers = [
            th.get_attribute("textContent").strip()
            for th in driver.find_elements(By.CSS_SELECTOR, f"#{table_id} thead th")
        ]
        rows = []

        # Handle pagination in case one page does not show all the data
        while True:
            for tr in driver.find_elements(By.CSS_SELECTOR, f"#{table_id} tbody tr"):
                tds = [td.get_attribute("textContent").strip() for td in tr.find_elements(By.CSS_SELECTOR, "td")]
                if tds:
                    rows.append(tds)

            next_button = driver.find_element(By.CSS_SELECTOR, f"#{table_id}_wrapper button[data-dt-idx='next']")

            # get_attribute returns None when the button has no class attribute
            if "disabled" in (next_button.get_attribute("class") or ""):
                break

            # Scroll to the next button first to ensure that it is clickable
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            time.sleep(1)

            # Click through JavaScript, then give the table time to redraw
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)

        return pd.DataFrame(rows, columns=headers if headers else None)
    finally:
        # Always close the browser, including when scraping fails part-way
        driver.quit()

# Function to save the scraped data
def save_CER_table(table_id: str, out_csv_path: str):
    """Scrape the table with DOM id ``table_id`` and write it to ``out_csv_path`` as CSV.

    Missing parent folders of the output path are created before scraping,
    and the saved path is printed once the file has been written.
    """
    destination = Path(out_csv_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    scrape_CER_data(table_id).to_csv(destination, index=False)
    print(f"Saved: {destination}")

In [19]:
# Scrape and save the three tables, in the same order as before
for cer_table_id, cer_csv_path in (
    ("DataTables_Table_0", "DATA2/approved_power_stations.csv"),
    ("DataTables_Table_1", "DATA2/committed_power_stations.csv"),
    ("DataTables_Table_2", "DATA2/probable_power_stations.csv"),
):
    save_CER_table(cer_table_id, cer_csv_path)

Saved: DATA2\approved_power_stations.csv
Saved: DATA2\committed_power_stations.csv
Saved: DATA2\probable_power_stations.csv


### <b> <span style="color:pink">c. Australian Bureau of Statistics (ABS)</span></b>


In [29]:
import requests
from pathlib import Path

# Target folder
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
OUT_DIR = Path("/Users/ekaudiyani/Documents/KULIAH/SEMESTER 2/2. Data Engineering/Assignment_1/DATA3_ABS")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ABS file links (output file name -> download URL)
files = {
    "Population_and_people.xlsx": "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0001_2011-24.xlsx",
    "Economy_and_industry.xlsx": "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0003_2011-24.xlsx",
}

# Download loop
for fname, url in files.items():
    out_path = OUT_DIR / fname
    print(f"Downloading {fname} ...")
    # Without a timeout a stalled connection would block this cell indefinitely
    r = requests.get(url, timeout=120)
    r.raise_for_status()  # stop if error
    out_path.write_bytes(r.content)
    print(f"Saved to {out_path}")

print("All files downloaded successfully.")

Downloading Population_and_people.xlsx ...
Saved to /Users/ekaudiyani/Documents/KULIAH/SEMESTER 2/2. Data Engineering/Assignment_1/DATA3_ABS/Population_and_people.xlsx
Downloading Economy_and_industry.xlsx ...
Saved to /Users/ekaudiyani/Documents/KULIAH/SEMESTER 2/2. Data Engineering/Assignment_1/DATA3_ABS/Economy_and_industry.xlsx
All files downloaded successfully.


## <b> <span style="color:orange">2. Data Cleaning & Integration</span></b>


Note:  
Combine the retrieved data into a single, consolidated database.  
During this process, you may need to clean and pre-process the data to ensure consistency and reliability.  
Tasks may include handling missing values, converting data types, and filtering out irrelevant or inconsistent data.


### <b> <span style="color:pink">2.1 Data Cleaning</span></b>


#### <b> <span style="color:white">a. NGER Data</span></b>


#### <b> <span style="color:white">b. CER Data</span></b>


#### <b> <span style="color:white">c. ABS Data</span></b>


### <b> <span style="color:pink">2.2 Data Integration</span></b>


## <b> <span style="color:orange">3. Data Augmentation</span></b>


Note:  
Augment your integrated dataset about large-scale power stations with their geo-location by programmatically querying the geographic coordinates  
using a public geocoding API (such as Google Maps or OpenStreetMap/Nominatim) for all the energy facilities present.  
Document methods and API usage.


## <b> <span style="color:orange">4. Data Transformation and Storage</span></b>


Transform the processed and augmented data into a structured format suitable for analysis and visualization.  
Specifically, you should:  
• design a suitable database schema for storage in a database, and  
• implement this schema and store your data in either DuckDB or a PostgreSQL database.

Whichever system you choose to install, make sure you include the spatial extensions so that we can run some spatial queries in Assignment 2.  
This should be straightforward for DuckDB, but if you choose PostgreSQL,  
make sure PostGIS is included in the chosen install package.

Important Note: Clearly justify your database design decisions (e.g., normalized or denormalized schema) in your project report.  
If your group encounters significant difficulties working with a database, you may alternatively store your data in separate CSV files;  
however, choosing CSV storage will result in a mark penalty.


In [None]:
import os
import time
from pathlib import Path
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

# Page that the scraper in this cell opens to read the CER tables.
CER_URL = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"


def _make_driver():
    """Create a headless Chrome WebDriver configured with the scraper's flags."""
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-gpu",
        "--window-size=1600,1200",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0",
    ):
        chrome_options.add_argument(flag)
    # Selenium Manager will fetch a matching driver automatically (Selenium >= 4.6)
    return webdriver.Chrome(options=chrome_options)


def scrap_CER_data(table_id: str) -> pd.DataFrame:
    """Scrape a CER DataTables table by its DOM id (e.g., 'DataTables_Table_1').

    Opens ``CER_URL`` in a driver from ``_make_driver``, tries to raise the
    table's page length ("All" when offered, otherwise the largest numeric
    option), then reads the rows currently rendered in the table.

    NOTE(review): only the rendered page is read and no "next" button is
    followed. If the chosen page length is smaller than the table, the
    remaining rows are not collected. Confirm this is acceptable for the
    tables scraped here.

    Args:
        table_id: DOM id of the table element.

    Returns:
        DataFrame with one row per rendered table row. Column names come
        from the ``thead th`` cells, or default to integers when none exist.

    Raises:
        selenium.common.exceptions.TimeoutException: if the table element is
            not present within 25 seconds.
    """
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    driver = _make_driver()
    wait = WebDriverWait(driver, 25)
    try:
        driver.get(CER_URL)
        # Wait for the table
        table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, f"#{table_id}")))
        # Try to switch to 'All' if available; else fall back to the largest option
        try:
            length_sel = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, f"select[name='{table_id}_length']"))
            )
            try:
                Select(length_sel).select_by_visible_text("All")
            except NoSuchElementException:
                # No 'All' entry: choose the largest numeric option. Only the integers
                # are compared, so equal values never fall through to comparing elements.
                sizes = []
                for option in length_sel.find_elements(By.TAG_NAME, "option"):
                    label = option.text.strip()
                    if label.isdigit():
                        sizes.append(int(label))
                if sizes:
                    Select(length_sel).select_by_visible_text(str(max(sizes)))
            time.sleep(2)
        except WebDriverException:
            # Best effort: if the length selector is missing or unusable, keep the
            # default page length. Non-Selenium errors are no longer hidden here.
            pass

        # Headers
        headers = [th.text.strip() for th in table.find_elements(By.CSS_SELECTOR, "thead th")] or None
        # Rows
        rows = []
        for tr in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
            tds = [td.text.strip() for td in tr.find_elements(By.TAG_NAME, "td")]
            if tds:
                rows.append(tds)
        return pd.DataFrame(rows, columns=headers)
    finally:
        # Always close the browser, including when scraping fails part-way
        driver.quit()


def save_cer_table(table_id: str, out_csv_path: str) -> pd.DataFrame:
    """Scrape the table with the given DOM id, write it to CSV, and return it.

    Missing parent folders of ``out_csv_path`` are created before scraping,
    and the saved path is printed after the file has been written.
    """
    destination = Path(out_csv_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    frame = scrap_CER_data(table_id)
    frame.to_csv(destination, index=False)
    print(f"Saved: {destination}")
    return frame

# Save the requested tables
for _table_id, _csv_path in (
    ("DataTables_Table_1", "DATA2/committed_power_stations.csv"),
    ("DataTables_Table_2", "DATA2/probable_power_stations.csv"),
):
    # The returned DataFrame is not needed here; only the CSV side effect is
    _ = save_cer_table(_table_id, _csv_path)
