In [185]:
import requests
import pandas as pd
from tqdm import tqdm  # progress bar
from bs4 import BeautifulSoup
import os
import mimetypes

In [188]:
# ------------------------------------- Function List ----------------------------------------
# 1 Status Bar to know the download progress bar
def status_bar_api(api_url, csv_filename):
    # Stream the download
    with requests.get(api_url, stream=True) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        chunk_size = 1024 * 1024  # 1 MB chunks
        
        chunks = []
        with tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading') as pbar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    chunks.append(chunk)
                    pbar.update(len(chunk))
        
        # Combine chunks into a single bytes object
        content = b''.join(chunks)
    
    # Save CSV if filename is provided
    if csv_filename:
        with open(csv_filename, 'wb') as f:
            f.write(content)

def download_file(url):
    response = requests.get(url, stream=True)
    response.raise_for_status()

    # Determine filename
    if "Content-Disposition" in response.headers:
        content_disposition = response.headers["Content-Disposition"]
        filename = content_disposition.split("filename=")[-1].strip('"')
    else:
        filename = os.path.basename(url)

    # If filename has no extension, try to guess from Content-Type
    if "." not in filename:
        content_type = response.headers.get("Content-Type", "")
        extension = mimetypes.guess_extension(content_type.split(";")[0].strip())
        if extension:
            filename += extension

    # Get total file size for progress bar (in bytes)
    total_size = int(response.headers.get("content-length", 0))
    chunk_size = 8192  # 8 KB per chunk

    # Download with progress bar
    with open(filename, "wb") as f, tqdm(
        total=total_size, unit='B', unit_scale=True, desc=filename
    ) as progress_bar:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                f.write(chunk)
                progress_bar.update(len(chunk))
    
# Download from cer datasets
def download_from_cer(url, csv_filename=None):
    cer_code = url.split('/')[-1]
    api_url = f"https://api.cer.gov.au/datahub-public/v1/api/Dataset/NGER/dataset/{cer_code}.csv"
    print("downloading from:" + api_url)
    csv_filename = f"{cer_code}.csv"
    # Stream the download
    status_bar_api(api_url=api_url, csv_filename=csv_filename)

def download_cer_markets(url):
    url_header = url.split('/')[2]
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
        # Step 2: Find the button/link with the XLSX file
    div_tags = soup.find_all("div", class_="cer-accordion__body__item")
    for div in div_tags:
        a_tag = div.find("a", href=True)
        if "XLSX" in a_tag.get_text(strip=True): 
            download_href = a_tag["href"]
            download_url = f"https://www.{url_header}" + download_href
            download_file(download_url)



In [190]:
# first dataset
cer_url = "https://data.cer.gov.au/datasets/NGER/ID0243"
download_from_cer(cer_url)
# second dataset
cer_markets_url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"
download_cer_markets(url=cer_markets_url)

downloading from:https://api.cer.gov.au/datahub-public/v1/api/Dataset/NGER/dataset/ID0243.csv


Downloading: 100%|██████████| 83.3k/83.3k [00:00<00:00, 1.24MB/s]
total-lgcs-and-capacity-accredited-power-stations-2025-0.xlsx: 100%|██████████| 12.3k/12.3k [00:00<00:00, 20.6MB/s]
power-stations-and-projects-status.xlsx: 100%|██████████| 41.3k/41.3k [00:00<00:00, 3.40MB/s]
power-stations-and-projects-status.xlsx: 100%|██████████| 41.3k/41.3k [00:00<00:00, 3.16MB/s]
power-stations-and-projects-status.xlsx: 100%|██████████| 41.3k/41.3k [00:00<00:00, 3.12MB/s]
total-lgcs-rec-registry.xlsx: 100%|██████████| 50.7k/50.7k [00:00<00:00, 3.51MB/s]
