In [2]:
import requests
import pandas as pd
from tqdm import tqdm 
from bs4 import BeautifulSoup
import os
import mimetypes
import numpy as np
import re

# plot
import plotly.express as px
import plotly.graph_objects as go

# selenium method
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pyperclip
import time

# analysis
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer


# duckdb
import duckdb
from collections import defaultdict


##### Class Download File

In [None]:
class download_file:
    def status_bar_api(self, api_url:str, csv_filename:str = None, status:str = 'api'):
        # Stream the download
        if status == 'api':
            with requests.get(api_url, stream=True) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))
                chunk_size = 1024 * 1024  # 1 MB chunks
                
                chunks = []
                with tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading') as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:
                            chunks.append(chunk)
                            pbar.update(len(chunk))
                
                # Combine chunks into a single bytes object
                content = b''.join(chunks)
            
            # Save CSV if filename is provided
            if csv_filename:
                with open(csv_filename, 'wb') as f:
                    f.write(content)
        else:
            response = requests.get(api_url, stream=True)
            response.raise_for_status()

            # Determine filename
            if "Content-Disposition" in response.headers:
                content_disposition = response.headers["Content-Disposition"]
                filename = content_disposition.split("filename=")[-1].strip('"')
            else:
                filename = os.path.basename(api_url)

            # If filename has no extension, try to guess from Content-Type
            if "." not in filename:
                content_type = response.headers.get("Content-Type", "")
                extension = mimetypes.guess_extension(content_type.split(";")[0].strip())
                if extension:
                    filename += extension

            # Get total file size for progress bar (in bytes)
            total_size = int(response.headers.get("content-length", 0))
            chunk_size = 8192  # 8 KB per chunk

            # Download with progress bar
            with open(filename, "wb") as f, tqdm(
                total=total_size, unit='B', unit_scale=True, desc=filename
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
                

    # download from cer 
    def download_from_cer(self, url:str, csv_filename:str=None):
        """Extract API URL from CER datasets using Selenium and download CSV."""
        driver = webdriver.Chrome()
        driver.get(url)
        wait = WebDriverWait(driver, 15)

        # Find and click "Copy API URL" button
        button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[.//span[contains(text(), 'Copy API URL')]]")
            )
        )
        button.click()
        time.sleep(1)  # wait for clipboard update

        api_url = pyperclip.paste()
        driver.quit()

        cer_code = url.split("/")[-1]
        api_url = (api_url.replace("ODataDataset", "Dataset")).split("/")[:-1]
        api_url = "/".join(api_url)+f"/{cer_code}.csv"
        print("Downloading from:", api_url)

        csv_filename = f"{cer_code}.csv"
        return self.status_bar_api(api_url=api_url, csv_filename=csv_filename)

    def download_cer_markets(self, url:str):
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
            # Step 2: Find the button/link with the XLSX file
        div_tags = soup.find_all("div", class_="cer-accordion__body__item")
        for div in div_tags:
            a_tag = div.find("a", href=True)
            if a_tag:
                text = a_tag.get_text(strip=True).lower()  # normalize text
                if "csv" in text and ("power stations" in text and "projects" in text):
                    file_href = a_tag["href"]
                    full_url = f"https://www.{url_header}{file_href}"
                    self.status_bar_api(api_url=full_url,csv_filename=None, status='file')
    
    def download_abs(self, url:str):
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find the link for "Population and people"
        target_div = None
        for div in soup.find_all("div", class_="file-description-link-formatter"):
            h4 = div.find("h4")
            if h4 and "Economy and industry" in h4.text:
                target_div = div
                break
        if target_div:
            a_tag = target_div.find("a", href=True)
            relative_url = a_tag['href']
            download_url = f"https://{url_header}" + relative_url
            print("Found download URL:", download_url)
            self.status_bar_api(api_url=download_url,csv_filename=None, status='file')


<h1>Retrieve Data Set </h1>

In [45]:

download = download_file()

# first dataset
print("Processing Download First Dataset ....")
cer_url = "https://data.cer.gov.au/datasets/NGER/ID0243"
download.download_from_cer(url=cer_url)
# second dataset
print("Processing Download Second Dataset ....")
cer_markets_url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"
download.download_cer_markets(url=cer_markets_url)
# third dataset
print("Processing Download Third Dataset ....")
abs_url = "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24#data-downloads"
download.download_abs(url=abs_url)


Processing Download First Dataset ....
Downloading from: https://api.cer.gov.au/datahub-public/v1/api/Dataset/NGER/dataset/ID0243.csv


Downloading: 100%|██████████| 83.3k/83.3k [00:00<00:00, 681kB/s]


Processing Download Second Dataset ....


total-lgcs-and-capacity-accredited-power-stations-2025.csv: 100%|██████████| 421/421 [00:00<00:00, 2.83MB/s]
power-stations-and-projects-accredited.csv: 100%|██████████| 25.8k/25.8k [00:00<00:00, 5.83MB/s]
power-stations-and-projects-committed.csv: 100%|██████████| 1.89k/1.89k [00:00<00:00, 1.73MB/s]
power-stations-and-projects-probable.csv: 100%|██████████| 2.22k/2.22k [00:00<00:00, 6.30MB/s]


Processing Download Third Dataset ....
Found download URL: https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0003_2011-24.xlsx


14100DO0003_2011-24.xlsx: 100%|██████████| 19.7M/19.7M [00:06<00:00, 3.14MB/s]


<h1>Data Integration and Cleaning Data </h1>

In [5]:
class cleaningData:
    def __init__(self):
        pass

    def show_null_data(self, data:pd.DataFrame):
        '''
            Showing Null Data for each column in a single dataframe

            Args:
                data: Dataframe to analyze

        '''
        null_count = data.isna().sum()
        null_percent = data.isna().sum() / len(data)
        null_percent = null_percent.apply(lambda x: f"{x:.1%}")
        print("Total Duplicated Count: ", data.duplicated().sum())
        results = pd.concat([null_count, null_percent], axis=1)
        results.columns = ['Null Total Count', 'Null Percentage']
        return results
    
    def split_categorical_numerical(self, data:pd.DataFrame):
        '''
            Spliting Categorical and Numerical Columns

            Args:
                data: Dataframe
        '''
        numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
        categorical_cols = data.select_dtypes(include=['object', 'category']).columns
        
        return numerical_cols, categorical_cols
    
    def replacing_value(self,df:pd.DataFrame, existing_value, new_value):
        df.replace(existing_value, new_value, inplace=True)
        return None

    def filling_null_value(self, df:pd.DataFrame, replacing_value):
        cat_imputer = SimpleImputer(strategy='constant', fill_value="Unknown")
        df = cat_imputer.fit_transform(df)

        return None

class connectDB:
    def __init__(self):
        self.connection =  duckdb.connect('my_database.duckdb') 
    
    def create_new_table(self, df:pd.DataFrame, table_name:str):
        '''
            Creating New Table Based on Dataframe Columns


            Args:
                1. df: Dataframe or that we want to convert into database
                2. table_name: the name of the table that we want to create in duck db
        '''
        try:
            con = self.connection
            sql_types = {
                'object': 'VARCHAR',
                'int64': 'INTEGER',
                'float64': 'DOUBLE',
                'bool': 'BOOLEAN',
                'datetime64[ns]': 'TIMESTAMP'
            }

            columns_sql = []
            for col_name, dtype in df.dtypes.items():
                sql_type = sql_types.get(str(dtype), 'VARCHAR')  # Default to VARCHAR
                col_sql = f'"{col_name}" {sql_type}'
                columns_sql.append(col_sql)

            columns_str = ",\n    ".join(columns_sql)
            create_table_sql = f"CREATE TABLE {table_name} (\n    {columns_str}\n);"
            con.execute(create_table_sql)
            con.close()
        except Exception as err:
            con.close()
            print(err)
        
        return None
    
    def insert_data(self, df:pd.DataFrame, table_name: str, column_map: dict):
        '''
        Inserting database different column name with the csv

        Args:
            df: Dataframe to insert
            table_name: Name of the existing DuckDB table
            column_map: Mapping from dataframe columns to table columns
        
        column_map = {'df_col1': 'table_colA', 'df_col2': 'table_colB'}
        '''
        try:
            con = self.connection

            # Prepare SELECT clause using mapping
            select_clause = ", ".join([f'"{df_col}" AS "{table_col}"' for df_col, table_col in column_map.items()])
            
            # Register DataFrame as a relation
            con.register("df_temp", df)
            
            # Build INSERT SQL
            sql = f"""
            INSERT INTO {table_name} ({', '.join(column_map.values())})
            SELECT {select_clause} FROM df_temp
            """
            con.execute(sql)
            con.close()
        except Exception as err:
            con.close()
            print(err)
        


        


In [6]:

# first url file
df_nger = pd.read_csv("ID0243.csv")

# second url file
df_cer_1_approved = pd.read_csv("power-stations-and-projects-accredited.csv")
df_cer_1_commited = pd.read_csv("power-stations-and-projects-committed.csv")
df_cer_1_probable = pd.read_csv("power-stations-and-projects-probable.csv")


# third url file
df_abs = pd.ExcelFile("14100DO0003_2011-24.xlsx")
statistical_area = pd.read_excel(df_abs, 'Table 1', header=[5,6])
lga_area = pd.read_excel(df_abs, 'Table 2', header=[5,6])





#### Cleaning the NGER.CSV dataset and put it into DB

In [7]:
# module for Cleaning Data
cleaning_data = cleaningData()


df_nger_numerical, df_nger_categorical = cleaning_data.split_categorical_numerical(df_nger) 
cleaning_data.replacing_value(df_nger, '-', np.nan)

df_nger_cleaning = df_nger.drop(columns=['Electricity production GJ', 'Total emissions t CO2 e'])
df_nger_clean = cleaning_data.filling_null_value(df_nger_cleaning[df_nger_categorical], "Unknown")

df_nger_cleaning.describe()



Unnamed: 0,Electricity production MWh,Total scope 1 emissions t CO2 e,Total scope 2 emissions t CO2 e,Emission intensity t CO2 e MWh
count,775.0,775.0,775.0,624.0
mean,565988.1,360211.1,5419.553548,0.330769
std,2308429.0,2104691.0,43243.487455,0.50401
min,0.0,0.0,0.0,0.0
25%,4636.0,24.0,0.0,0.0
50%,72328.0,308.0,29.0,0.07
75%,290367.0,3538.5,614.5,0.66
max,37372270.0,32840740.0,691029.0,8.57


In [8]:
cleaning_data.replacing_value(df_cer_1_approved, '-', np.nan)

df_cer_1_approved['power_station_name_clean'] = df_cer_1_approved['Power station name'].str.split('-').str[0]
cleaning_data.show_null_data(df_cer_1_approved)


Total Duplicated Count:  0


Unnamed: 0,Null Total Count,Null Percentage
Accreditation code,0,0.0%
Power station name,0,0.0%
State,0,0.0%
Postcode,0,0.0%
Installed capacity (MW),0,0.0%
Fuel Source (s),0,0.0%
Accreditation start date,0,0.0%
Approval date,0,0.0%
power_station_name_clean,0,0.0%


In [9]:
cleaning_data.replacing_value(df_cer_1_commited, '-', np.nan)

cleaning_data.show_null_data(df_cer_1_commited)

Total Duplicated Count:  0


Unnamed: 0,Null Total Count,Null Percentage
Project Name,0,0.0%
State,0,0.0%
MW Capacity,0,0.0%
Fuel Source,0,0.0%
Committed Date (Month/Year),0,0.0%


In [10]:
cleaning_data.replacing_value(df_cer_1_probable, '-', np.nan)
cleaning_data.show_null_data(df_cer_1_probable)

Total Duplicated Count:  0


Unnamed: 0,Null Total Count,Null Percentage
Project Name,0,0.0%
State,0,0.0%
MW Capacity,0,0.0%
Fuel Source,0,0.0%


In [11]:
category_question = defaultdict(list)
for cat, q in statistical_area.columns:
    category_question[cat].append(q)
category_question = dict(category_question)

In [12]:
# only get the important rows
statistical_area = statistical_area.iloc[:-7]
lga_area = lga_area.iloc[:-7]

In [None]:
cleaning_data.replacing_value(statistical_area, '-', np.nan)

statistical_area[("Unnamed: 2_level_0", "Year")] = pd.to_numeric(statistical_area[("Unnamed: 2_level_0", "Year")], errors="coerce").astype("Int64") 
statistical_area.head()

  df.replace(existing_value, new_value, inplace=True)


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Business entries - year ended 30 June,Business entries - year ended 30 June,...,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter
Unnamed: 0_level_1,Code,Label,Year,Number of non-employing businesses,Number of employing businesses: 1-4 employees,Number of employing businesses: 5-19 employees,Number of employing businesses: 20 or more employees,Total number of businesses,Number of non-employing business entries,Number of employing business entries: 1-4 employees,...,Houses - total (no.),Townhouses - additions (no.),Townhouses - removals (no.),Townhouses - total (no.),Apartments - additions (no.),Apartments - removals (no.),Apartments - total (no.),Total dwelling additions (no.),Total dwelling removals (no.),Total dwellings (no.)
0,AUS,Australia,2011,,,,,,,,...,,,,,,,,,,
1,AUS,Australia,2016,,,,,,,,...,,,,,,,,,,
2,AUS,Australia,2017,,,,,,,,...,7279135.0,31729.0,480.0,1261457.0,73838.0,564.0,1464325.0,218884.0,19586.0,10022775.0
3,AUS,Australia,2018,,,,,,,,...,7371408.0,33346.0,919.0,1293884.0,66330.0,605.0,1530050.0,213652.0,23181.0,10213246.0
4,AUS,Australia,2019,,,,,,,,...,7472853.0,32571.0,712.0,1325743.0,60705.0,985.0,1589770.0,215556.0,22394.0,10406408.0


In [22]:
cleaning_data.replacing_value(lga_area, '-', np.nan)

lga_area[("Unnamed: 2_level_0", "Year")] = pd.to_numeric(lga_area[("Unnamed: 2_level_0", "Year")], errors="coerce").astype("Int64") 
lga_area.head()

  df.replace(existing_value, new_value, inplace=True)


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Business entries - year ended 30 June,Business entries - year ended 30 June,...,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census
Unnamed: 0_level_1,Code,Label,Year,Number of non-employing businesses,Number of employing businesses: 1-4 employees,Number of employing businesses: 5-19 employees,Number of employing businesses: 20 or more employees,Total number of businesses,Number of non-employing business entries,Number of employing business entries: 1-4 employees,...,"Rental, hiring and real estate services (%)","Professional, scientific and technical services (%)",Administrative and support services (%),Public administration and safety (%),Education and training (%),Health care and social assistance (%),Arts and recreation services (%),Other services (%),Industry of employment inadequately described or not stated (%),Total persons employed aged 15 years and over (no.)
0,10050,Albury,2011,,,,,,,,...,1.3,4.5,2.8,7.5,8.7,13.7,1.0,4.0,2.1,22434.0
1,10050,Albury,2016,,,,,,,,...,1.3,4.3,3.0,7.3,9.6,15.2,0.9,4.1,3.6,22901.0
2,10050,Albury,2017,,,,,,,,...,,,,,,,,,,
3,10050,Albury,2018,,,,,,,,...,,,,,,,,,,
4,10050,Albury,2019,,,,,,,,...,,,,,,,,,,


### Data Augmentation

In [38]:
class geoAPI:
    def __init__(self):
        pass

    def get(self, query):
        url = "https://nominatim.openstreetmap.org/search"
        params = {
            "q": query,
            "format": "json",
            "limit": 1
        }
        headers = {"User-Agent": "powerstation-geocoder"}  # Required by Nominatim
        response = requests.get(url, params=params, headers=headers)
        
        if response.status_code == 200 and len(response.json()) > 0:
            result = response.json()[0]
            return float(result["lat"]), float(result["lon"])
        else:
            return None, None
        # return response.json()
        # if response.status_code == 200 and len(response.json()) > 0:
        #     result = response.json()[0]
        #     return float(result["lat"]), float(result["lon"])
        # else:
        #     return None, None

In [39]:
api = geoAPI()

api.get("Laura Johnson Home, Townview")

(None, None)

In [23]:
url = "http://wiki.openstreetmap.org/wiki/API"
headers = {
    "Content-Type": "application/json"
}

response = requests.get(url)
data = response.json()
print(data)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

#### Process Creating Schema in Duck DB

In [14]:
# module related for database
database = connectDB()

column_name_facility = ['id','reporting_entity','facility_name','type','state',
                        'electricity_production_mwh','scope_1_emission', 
                        'scope_2_emission','emission_intesity_mwh','grid_connected',
                        'grid','primary_fuel', 'important_notes']

column_names_power_stations = ['id', 'accreditation_code', 'power_station_name','state','postcode',
                               'capacity_mw','source', 'accreditation_start_date','approval_date','commited_date','status']

# Create an empty DataFrame with the specified columns

In [15]:

print("----------")
df_cer_1_approved.replace('-', np.nan, inplace=True)

----------


In [16]:
print("----------")
#show_null_data(df_cer_1_commited)

----------


In [17]:
print("----------")
#show_null_data(df_cer_1_probable)



----------


In [18]:
print("----------")
#show_null_data(df_cer_2)

----------


In [19]:
#show_null_data(df_cer_3)

In [20]:

print("----------")
#show_null_data(statistical_area)


----------


In [21]:
print("----------")
#show_null_data(lga_area)

----------
