In [116]:
import requests
import pandas as pd
from tqdm import tqdm 
from bs4 import BeautifulSoup
import os
import mimetypes


# selenium method
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pyperclip
import time




In [117]:
# ------------------------------------- Function List ----------------------------------------

list_file_name = set()
class download_file:
    def status_bar_api(self, api_url, csv_filename = None, status = 'api'):
        # Stream the download
        if status == 'api':
            with requests.get(api_url, stream=True) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))
                chunk_size = 1024 * 1024  # 1 MB chunks
                
                chunks = []
                with tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading') as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:
                            chunks.append(chunk)
                            pbar.update(len(chunk))
                
                # Combine chunks into a single bytes object
                content = b''.join(chunks)
            
            # Save CSV if filename is provided
            if csv_filename:
                with open(csv_filename, 'wb') as f:
                    f.write(content)
                    list_file_name.add(csv_filename)
        else:
            response = requests.get(api_url, stream=True)
            response.raise_for_status()

            # Determine filename
            if "Content-Disposition" in response.headers:
                content_disposition = response.headers["Content-Disposition"]
                filename = content_disposition.split("filename=")[-1].strip('"')
            else:
                filename = os.path.basename(api_url)

            # If filename has no extension, try to guess from Content-Type
            if "." not in filename:
                content_type = response.headers.get("Content-Type", "")
                extension = mimetypes.guess_extension(content_type.split(";")[0].strip())
                if extension:
                    filename += extension

            # Get total file size for progress bar (in bytes)
            total_size = int(response.headers.get("content-length", 0))
            chunk_size = 8192  # 8 KB per chunk

            # Download with progress bar
            with open(filename, "wb") as f, tqdm(
                total=total_size, unit='B', unit_scale=True, desc=filename
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
                list_file_name.add(filename)
        
    # Download from cer datasets
    def download_from_cer(self, url, csv_filename=None):
        driver = webdriver.Chrome()
        driver.get(url)

        wait = WebDriverWait(driver, 15)

        # Find the button by inner span text
        button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[.//span[contains(text(), 'Copy API URL')]]")
            )
        )

        button.click()

        time.sleep(1)  # wait for clipboard update

        api_url = pyperclip.paste()
        print("Copied API URL:", api_url)
        driver.quit()


        cer_code = url.split('/')[-1]
        api_url = api_url.rsplit("/", 1)[0]
        api_url = f"https://api.cer.gov.au/datahub-public/v1/api/Dataset/NGER/dataset/{cer_code}.csv"
        print("downloading from:" + api_url)
        csv_filename = f"{cer_code}.csv"
        # Stream the download
        self.status_bar_api(api_url=api_url, csv_filename=csv_filename, status="api")

    def download_cer_markets(self, url):
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
            # Step 2: Find the button/link with the XLSX file
        div_tags = soup.find_all("div", class_="cer-accordion__body__item")
        for div in div_tags:
            a_tag = div.find("a", href=True)
            if "XLSX" in a_tag.get_text(strip=True): 
                download_href = a_tag["href"]
                download_url = f"https://www.{url_header}" + download_href
                self.status_bar_api(api_url=download_url,csv_filename=None, status='file')
    
    def download_abs(self, url):
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find the link for "Population and people"
        target_div = None
        for div in soup.find_all("div", class_="file-description-link-formatter"):
            h4 = div.find("h4")
            if h4 and "Economy and industry" in h4.text:
                target_div = div
                break
        if target_div:
            a_tag = target_div.find("a", href=True)
            relative_url = a_tag['href']
            download_url = f"https://{url_header}" + relative_url
            print("Found download URL:", download_url)
            self.status_bar_api(api_url=download_url,csv_filename=None, status='file')



In [118]:

download = download_file()

# first dataset
print("Processing Download First Dataset ....")
cer_url = "https://data.cer.gov.au/datasets/NGER/ID0243"
download.download_from_cer(url=cer_url)
# # second dataset
print("Processing Download Second Dataset ....")
cer_markets_url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"
download.download_cer_markets(url=cer_markets_url)
# third dataset
print("Processing Download Third Dataset ....")
abs_url = "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24#data-downloads"
download.download_abs(url=abs_url)


Processing Download First Dataset ....
Copied API URL: https://api.cer.gov.au/datahub-public/v1/api/ODataDataset/NGER/dataset/ID0243?select%3D%2A
downloading from:https://api.cer.gov.au/datahub-public/v1/api/Dataset/NGER/dataset/ID0243.csv


Downloading: 100%|██████████| 83.3k/83.3k [00:00<00:00, 1.91MB/s]


Processing Download Second Dataset ....


total-lgcs-and-capacity-accredited-power-stations-2025-0.xlsx: 100%|██████████| 12.4k/12.4k [00:00<00:00, 13.8MB/s]
power-stations-and-projects-status.xlsx: 100%|██████████| 43.3k/43.3k [00:00<00:00, 3.76MB/s]
power-stations-and-projects-status.xlsx: 100%|██████████| 43.3k/43.3k [00:00<00:00, 934kB/s]
power-stations-and-projects-status.xlsx: 100%|██████████| 43.3k/43.3k [00:00<00:00, 2.68MB/s]
total-lgcs-rec-registry.xlsx: 100%|██████████| 50.7k/50.7k [00:00<00:00, 2.36MB/s]


Processing Download Third Dataset ....
Found download URL: https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0003_2011-24.xlsx


14100DO0003_2011-24.xlsx: 100%|██████████| 19.7M/19.7M [00:05<00:00, 3.48MB/s]


In [119]:
list_file_name

{'14100DO0003_2011-24.xlsx',
 'ID0243.csv',
 'power-stations-and-projects-status.xlsx',
 'total-lgcs-and-capacity-accredited-power-stations-2025-0.xlsx',
 'total-lgcs-rec-registry.xlsx'}

<h1>Data Integration and Cleaning Data </h1>

In [None]:
class clean_data:
    def __init__(self, df):
        self.df = df

In [122]:

# first url file
df_cer = pd.read_csv("ID0243.csv")

# second url file
df_cer_1 = pd.ExcelFile("power-stations-and-projects-status.xlsx")
df_cer_1_approved = pd.read_excel(df_cer_1, 'Approved', header=3)
df_cer_1_commited = pd.read_excel(df_cer_1, 'Committed', header=3)
df_cer_1_probable = pd.read_excel(df_cer_1, 'Probable', header=3)


df_cer_2 = pd.read_excel("total-lgcs-and-capacity-accredited-power-stations-2025-0.xlsx", header=2)
df_cer_3 = pd.read_excel("total-lgcs-rec-registry.xlsx", header=2)

# third url file
df_abs = pd.ExcelFile("14100DO0003_2011-24.xlsx")
statistical_area = pd.read_excel(df_abs, 'Table 1', header=[5,6])
lga_area = pd.read_excel(df_abs, 'Table 2', header=[5,6])


df_cer.head()




Unnamed: 0,Reporting entity,Facility name,Type,State,Electricity production GJ,Electricity production MWh,Total scope 1 emissions t CO2 e,Total scope 2 emissions t CO2 e,Total emissions t CO2 e,Emission intensity t CO2 e MWh,Grid connected,Grid,Primary fuel,Important notes
0,ACCIONA ENERGY OCEANIA PTY LTD,Cathedral Rocks Wind Farm,F,SA,481948,133874,57,127.0,184,0.0,On,NEM,Wind,-
1,ACCIONA ENERGY OCEANIA PTY LTD,Gunning Wind Farm,F,NSW,491409,136502,50,218.0,268,0.0,On,NEM,Wind,-
2,ACCIONA ENERGY OCEANIA PTY LTD,Mortlake South Wind Farm,F,VIC,1019352,283153,202,1128.0,1330,0.0,On,NEM,Wind,-
3,ACCIONA ENERGY OCEANIA PTY LTD,Mt Gellibrand Wind Farm,F,VIC,1025451,284847,99,1273.0,1372,0.0,On,NEM,Wind,-
4,ACCIONA ENERGY OCEANIA PTY LTD,Waubra Wind Farm,F,VIC,1954964,543046,186,1114.0,1300,0.0,On,NEM,Wind,-


In [123]:

print("----------")
df_cer_1_approved.head()


----------


Unnamed: 0,Accreditation code,Power station name,State,Postcode,Installed capacity (MW),Fuel Source (s),Accreditation start date,Approval date
0,SRPXQLE8,"Laura Johnson Home, Townview - Solar w SGU - QLD",QLD,4825.0,0.2265,Solar,2024-10-15,2025-01-13
1,SRPYNS39,Leppington - Solar - NSW,NSW,2179.0,0.732,Solar,2024-11-22,2025-01-13
2,SRPYNS58,Quakers Hillside Care Community - Solar w SGU ...,NSW,2763.0,0.1996,Solar,2024-12-19,2025-01-13
3,SRPXVCN4,Rest Nominees - Solar wSGU - VIC,VIC,3008.0,0.1188,Solar,2024-09-20,2025-01-13
4,SRPXQLF9,Retail First Mt Ommaney-Solar-QLD,QLD,4074.0,1.0004,Solar,2024-10-29,2025-01-13


In [124]:
print("----------")
df_cer_1_commited.head()

----------


Unnamed: 0,Project Name,State,MW Capacity,Fuel Source,Committed Date (Month/Year)
0,East Rockingham Resource Recovery Facility,WA,29.0,Biomass,2019-12-23
1,Mangalore Renewable Energy Project,VIC,5.0,Solar,2021-09-02
2,Orange Community Renewable Energy Park,NSW,5.0,Solar,2022-07-24
3,Moorebank Logistics Park,NSW,60.0,Solar,2022-09-21
4,Wangaratta Solar Farm,VIC,40.0,Solar,2023-07-04


In [125]:
df_cer_1_probable.head()

Unnamed: 0,Project Name,State,MW Capacity,Fuel Source
0,Barnawartha Solar Farm,VIC,64.0,Solar
1,Barwon solar farm,VIC,250.0,Solar
2,Boddington Giga Energy,WA,400.0,Solar
3,Bulli Creek Solar project Stage 1,QLD,775.0,Solar
4,Bullyard Solar Farm,QLD,100.0,Solar


In [126]:
print("----------")
df_cer_2.head()


----------


Unnamed: 0,Total LGCs in the REC Registry,MW of approved power stations (since 1 Jan 2025),Approved power stations (since 1 Jan 2025),As at
0,49704295,52.686,23,2025-01-31
1,22950064,91.0868,56,2025-02-28
2,26467605,298.9236,87,2025-03-31
3,31063147,852.0156,127,2025-04-30
4,34495174,1727.3873,163,2025-05-31


In [127]:
print("----------")
df_cer_3.head()

----------


Unnamed: 0,Full name of account,Total registered LGC holdings
0,Johnson and Johnson Medical Pty Ltd,250
1,Solar Juice Pty Ltd,947
2,GCFC Ltd,758
3,Bruce Rouse,27
4,Helen Brewer,220


In [128]:

print("----------")
statistical_area.head()


----------


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Business entries - year ended 30 June,Business entries - year ended 30 June,...,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter
Unnamed: 0_level_1,Code,Label,Year,Number of non-employing businesses,Number of employing businesses: 1-4 employees,Number of employing businesses: 5-19 employees,Number of employing businesses: 20 or more employees,Total number of businesses,Number of non-employing business entries,Number of employing business entries: 1-4 employees,...,Houses - total (no.),Townhouses - additions (no.),Townhouses - removals (no.),Townhouses - total (no.),Apartments - additions (no.),Apartments - removals (no.),Apartments - total (no.),Total dwelling additions (no.),Total dwelling removals (no.),Total dwellings (no.)
0,AUS,Australia,2011.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,AUS,Australia,2016.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,AUS,Australia,2017.0,-,-,-,-,-,-,-,...,7279135,31729,480,1261457,73838,564,1464325,218884,19586,10022775
3,AUS,Australia,2018.0,-,-,-,-,-,-,-,...,7371408,33346,919,1293884,66330,605,1530050,213652,23181,10213246
4,AUS,Australia,2019.0,-,-,-,-,-,-,-,...,7472853,32571,712,1325743,60705,985,1589770,215556,22394,10406408


In [129]:
print("----------")
lga_area.head()

----------


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Business entries - year ended 30 June,Business entries - year ended 30 June,...,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census
Unnamed: 0_level_1,Code,Label,Year,Number of non-employing businesses,Number of employing businesses: 1-4 employees,Number of employing businesses: 5-19 employees,Number of employing businesses: 20 or more employees,Total number of businesses,Number of non-employing business entries,Number of employing business entries: 1-4 employees,...,"Rental, hiring and real estate services (%)","Professional, scientific and technical services (%)",Administrative and support services (%),Public administration and safety (%),Education and training (%),Health care and social assistance (%),Arts and recreation services (%),Other services (%),Industry of employment inadequately described or not stated (%),Total persons employed aged 15 years and over (no.)
0,10050,Albury,2011.0,-,-,-,-,-,-,-,...,1.3,4.5,2.8,7.5,8.7,13.7,1,4,2.1,22434
1,10050,Albury,2016.0,-,-,-,-,-,-,-,...,1.3,4.3,3,7.3,9.6,15.2,0.9,4.1,3.6,22901
2,10050,Albury,2017.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,10050,Albury,2018.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,10050,Albury,2019.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
