In [100]:
import requests
import pandas as pd
from tqdm import tqdm 
from bs4 import BeautifulSoup
import os
import mimetypes
import numpy as np

# plot
import plotly.express as px
import plotly.graph_objects as go

# selenium method
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pyperclip
import time

# analysis
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


# duckdb
import duckdb

##### Class Download File

In [126]:
# ------------------------------------- Function List ----------------------------------------


class download_file:
    def status_bar_api(self, api_url, csv_filename = None, status = 'api'):
        # Stream the download
        if status == 'api':
            with requests.get(api_url, stream=True) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))
                chunk_size = 1024 * 1024  # 1 MB chunks
                
                chunks = []
                with tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading') as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:
                            chunks.append(chunk)
                            pbar.update(len(chunk))
                
                # Combine chunks into a single bytes object
                content = b''.join(chunks)
            
            # Save CSV if filename is provided
            if csv_filename:
                with open(csv_filename, 'wb') as f:
                    f.write(content)
        else:
            response = requests.get(api_url, stream=True)
            response.raise_for_status()

            # Determine filename
            if "Content-Disposition" in response.headers:
                content_disposition = response.headers["Content-Disposition"]
                filename = content_disposition.split("filename=")[-1].strip('"')
            else:
                filename = os.path.basename(api_url)

            # If filename has no extension, try to guess from Content-Type
            if "." not in filename:
                content_type = response.headers.get("Content-Type", "")
                extension = mimetypes.guess_extension(content_type.split(";")[0].strip())
                if extension:
                    filename += extension

            # Get total file size for progress bar (in bytes)
            total_size = int(response.headers.get("content-length", 0))
            chunk_size = 8192  # 8 KB per chunk

            # Download with progress bar
            with open(filename, "wb") as f, tqdm(
                total=total_size, unit='B', unit_scale=True, desc=filename
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
                

    # download from cer 
    def download_from_cer(self, url, csv_filename=None):
        """Extract API URL from CER datasets using Selenium and download CSV."""
        driver = webdriver.Chrome()
        driver.get(url)
        wait = WebDriverWait(driver, 15)

        # Find and click "Copy API URL" button
        button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[.//span[contains(text(), 'Copy API URL')]]")
            )
        )
        button.click()
        time.sleep(1)  # wait for clipboard update

        api_url = pyperclip.paste()
        driver.quit()

        cer_code = url.split("/")[-1]
        api_url = f"https://api.cer.gov.au/datahub-public/v1/api/Dataset/NGER/dataset/{cer_code}.csv"
        print("Downloading from:", api_url)

        csv_filename = f"{cer_code}.csv"
        return self.status_bar_api(api_url=api_url, csv_filename=csv_filename)

    def download_cer_markets(self, url):
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
            # Step 2: Find the button/link with the XLSX file
        div_tags = soup.find_all("div", class_="cer-accordion__body__item")
        for div in div_tags:
            a_tag = div.find("a", href=True)
            if a_tag:
                text = a_tag.get_text(strip=True).lower()  # normalize text
                if "csv" in text:
                    file_href = a_tag["href"]
                    full_url = f"https://www.{url_header}{file_href}"
                    #print(full_url)
                    self.status_bar_api(api_url=full_url,csv_filename=None, status='file')
    
    def download_abs(self, url):
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find the link for "Population and people"
        target_div = None
        for div in soup.find_all("div", class_="file-description-link-formatter"):
            h4 = div.find("h4")
            if h4 and "Economy and industry" in h4.text:
                target_div = div
                break
        if target_div:
            a_tag = target_div.find("a", href=True)
            relative_url = a_tag['href']
            download_url = f"https://{url_header}" + relative_url
            print("Found download URL:", download_url)
            self.status_bar_api(api_url=download_url,csv_filename=None, status='file')


<h1>Retrieve Data Set </h1>

In [127]:

download = download_file()

# first dataset
print("Processing Download First Dataset ....")
cer_url = "https://data.cer.gov.au/datasets/NGER/ID0243"
download.download_from_cer(url=cer_url)
# # second dataset
print("Processing Download Second Dataset ....")
cer_markets_url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"
download.download_cer_markets(url=cer_markets_url)
# third dataset
print("Processing Download Third Dataset ....")
abs_url = "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24#data-downloads"
download.download_abs(url=abs_url)


Processing Download First Dataset ....
Downloading from: https://api.cer.gov.au/datahub-public/v1/api/Dataset/NGER/dataset/ID0243.csv


Downloading: 100%|██████████| 83.3k/83.3k [00:00<00:00, 12.3MB/s]

Processing Download Second Dataset ....



total-lgcs-and-capacity-accredited-power-stations-2025.csv: 100%|██████████| 421/421 [00:00<00:00, 2.98MB/s]
power-stations-and-projects-accredited.csv: 100%|██████████| 25.8k/25.8k [00:00<00:00, 25.8MB/s]
power-stations-and-projects-committed.csv: 100%|██████████| 1.89k/1.89k [00:00<00:00, 7.78MB/s]
power-stations-and-projects-probable.csv: 100%|██████████| 2.22k/2.22k [00:00<00:00, 8.34MB/s]
total-lgcs-rec-registry-0.csv: 100%|██████████| 45.4k/45.4k [00:00<00:00, 9.14MB/s]


Processing Download Third Dataset ....
Found download URL: https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0003_2011-24.xlsx


14100DO0003_2011-24.xlsx: 100%|██████████| 19.7M/19.7M [00:03<00:00, 5.82MB/s]


<h1>Data Integration and Cleaning Data </h1>

In [None]:
class cleaningData:
    def __init__(self):
        pass

    def show_null_data(self, data):
        null_count = data.isna().sum()
        null_percent = data.isna().sum() / len(data)
        null_percent = null_percent.apply(lambda x: f"{x:.1%}")
        print("Total Duplicated Count: ", data.duplicated().sum())
        results = pd.concat([null_count, null_percent], axis=1)
        results.columns = ['Null Total Count', 'Null Percentage']
        return results
    
    def split_categorical_numerical(self, data):
        numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
        categorical_cols = data.select_dtypes(include=['object', 'category']).columns
        
        return numerical_cols, categorical_cols

class connectDB:
    def __init__(self):
        self.connection =  duckdb.connect('my_database.duckdb') 
    
    def create_new_table(self):
        


In [121]:

# first url file
df_nger = pd.read_csv("ID0243.csv")

# second url file
df_cer_1_approved = pd.read_csv("power-stations-and-projects-accredited.csv")
df_cer_1_commited = pd.read_csv("power-stations-and-projects-committed.csv")
df_cer_1_probable = pd.read_csv("power-stations-and-projects-probable.csv")


# third url file
df_abs = pd.ExcelFile("14100DO0003_2011-24.xlsx")
statistical_area = pd.read_excel(df_abs, 'Table 1', header=[5,6])
lga_area = pd.read_excel(df_abs, 'Table 2', header=[5,6])





#### Cleaning the NGER.CSV dataset and put it into DB

In [None]:
cleaning_data = cleaningData()

df_nger_numerical, df_nger_categorical = cleaning_data.split_categorical_numerical(df_nger) 
df_nger.replace('-', np.nan, inplace=True)

df_nger_cleaning = df_nger.drop(columns=['Electricity production GJ', 'Total emissions t CO2 e',''])
df_nger_cleaning.fillna("Unknown", inplace=True)

cleaning_data.show_null_data(df_nger_cleaning)


# Automatically read CSV and create a table

con.execute("""
CREATE TABLE my_table AS
SELECT
    row_number() OVER () AS id,
    *
FROM read_csv_auto('ID0243.csv')
""")

con.close()

Total Duplicated Count:  0


  df_nger_cleaning.fillna("Unknown", inplace=True)


In [125]:

print("----------")
df_cer_1_approved.replace('-', np.nan, inplace=True)

con.execute("""
CREATE TABLE power_stations AS
SELECT
    row_number() OVER () AS id,
    *
FROM read_csv_auto('power-stations-and-projects-accredited.csv')
""")

con.close()


----------


ConnectionException: Connection Error: Connection already closed!

In [None]:
print("----------")
#show_null_data(df_cer_1_commited)

----------


In [None]:
print("----------")
#show_null_data(df_cer_1_probable)



----------


In [None]:
print("----------")
#show_null_data(df_cer_2)

----------


In [None]:
#show_null_data(df_cer_3)

In [None]:

print("----------")
#show_null_data(statistical_area)


----------


In [None]:
print("----------")
#show_null_data(lga_area)

----------
