In [None]:
import requests
import pandas as pd
from tqdm import tqdm 
from bs4 import BeautifulSoup
import os
import mimetypes
import numpy as np
import os
import re

# selenium method
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options


import time

# analysis
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer


# duckdb
import duckdb
from collections import defaultdict


In [216]:
class DownloadFile:
    """Utility class for downloading files with progress and CER/ABS support."""

    def status_bar_api(self, api_url: str, csv_filename: str = None):
        """
        Download a file from API URL with progress bar.

        Args:
            api_url: Direct URL to download.
            csv_filename: Optional filename; if None, determined from URL or headers.
        """
        response = requests.get(api_url, stream=True)
        response.raise_for_status()

        # Determine filename
        filename = csv_filename or os.path.basename(api_url)
        if "Content-Disposition" in response.headers:
            content_disposition = response.headers["Content-Disposition"]
            filename = content_disposition.split("filename=")[-1].strip('"')

        # Guess extension if missing
        if "." not in filename:
            content_type = response.headers.get("Content-Type", "")
            ext = mimetypes.guess_extension(content_type.split(";")[0].strip())
            if ext:
                filename += ext

        total_size = int(response.headers.get("content-length", 0))
        chunk_size = 8192  # 8 KB

        with open(filename, "wb") as f, tqdm(
            total=total_size, unit="B", unit_scale=True, desc=filename
        ) as progress_bar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    progress_bar.update(len(chunk))

        print(f"✅ Download completed: {filename}")
        return filename

    def wait_for_download(self, download_dir: str, timeout: int = 60):
        """
        Wait until a new file appears in download_dir (ignores .crdownload files).

        Args:
            download_dir: Directory to watch for downloaded files.
            timeout: Max wait time in seconds.

        Returns:
            Full path to the most recently downloaded file.
        """
        seconds = 0
        while seconds < timeout:
            files = os.listdir(download_dir)
            downloading = [f for f in files if f.endswith(".crdownload")]
            if not downloading and files:
                # Return most recently created file
                return os.path.join(
                    download_dir,
                    max(files, key=lambda f: os.path.getctime(os.path.join(download_dir, f)))
                )
            time.sleep(1)
            seconds += 1
        raise TimeoutError("Download did not complete in time.")

    def download_from_cer(self, url: str):
        """
        Extract API URL from CER datasets using Selenium and download CSV.

        Args:
            url: CER dataset page URL.
        """
        download_dir = os.getcwd()

        # Chrome options (headless)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        prefs = {
            "download.default_directory": download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        }
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        wait = WebDriverWait(driver, 20)

        # Click the "Download CSV" button
        download_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Download CSV')]"))
        )
        download_button.click()

        # Wait for download to finish
        time.sleep(2)  # small wait for file creation
        downloaded_file = self.wait_for_download(download_dir)
        driver.quit()

        print(f"✅ CER CSV downloaded: {downloaded_file}")
        return downloaded_file

    def download_cer_markets(self, url: str):
        """
        Download CER markets CSV for power stations/projects.

        Args:
            url: CER markets page URL.
        """
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all divs containing file links
        for div in soup.find_all("div", class_="cer-accordion__body__item"):
            a_tag = div.find("a", href=True)
            if a_tag:
                text = a_tag.get_text(strip=True).lower()
                if "csv" in text and "power stations" in text and "projects" in text:
                    file_href = a_tag["href"]
                    full_url = f"https://{url_header}{file_href}"
                    self.status_bar_api(api_url=full_url)

    def download_abs(self, url: str, target_text="Economy and industry"):
        """
        Download ABS data CSV based on section name.

        Args:
            url: ABS Data by Regions page URL.
            target_text: Section header to look for (default "Economy and industry").
        """
        url_header = url.split('/')[2]
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        target_div = None
        for div in soup.find_all("div", class_="file-description-link-formatter"):
            h4 = div.find("h4")
            if h4 and target_text in h4.text:
                target_div = div
                break

        if target_div:
            a_tag = target_div.find("a", href=True)
            download_url = f"https://{url_header}{a_tag['href']}"
            print("Found download URL:", download_url)
            self.status_bar_api(api_url=download_url)


<h1>Retrieve Data Set </h1>

In [217]:

def retrieve_data_set():
    download = DownloadFile()
    # first dataset
    print("Processing Download First Dataset ....")
    cer_url = "https://data.cer.gov.au/datasets/NGER/ID0243"
    download.download_from_cer(url=cer_url)
    # second dataset
    print("Processing Download Second Dataset ....")
    cer_markets_url = "https://cer.gov.au/markets/reports-and-data/large-scale-renewable-energy-data"
    download.download_cer_markets(url=cer_markets_url)
    # third dataset
    print("Processing Download Third Dataset ....")
    abs_url = "https://www.abs.gov.au/methodologies/data-region-methodology/2011-24#data-downloads"
    download.download_abs(url=abs_url)


In [218]:
retrieve_data_set()

Processing Download First Dataset ....
✅ CER CSV downloaded: /Users/stephensusanto/Desktop/Intro to Programming/DE/NGER.ID0243 (1).csv
Processing Download Second Dataset ....


power-stations-and-projects-accredited.csv: 100%|██████████| 25.8k/25.8k [00:00<00:00, 1.16MB/s]


✅ Download completed: power-stations-and-projects-accredited.csv


power-stations-and-projects-committed.csv: 100%|██████████| 1.89k/1.89k [00:00<00:00, 9.35MB/s]


✅ Download completed: power-stations-and-projects-committed.csv


power-stations-and-projects-probable.csv: 100%|██████████| 2.22k/2.22k [00:00<00:00, 9.87MB/s]


✅ Download completed: power-stations-and-projects-probable.csv
Processing Download Third Dataset ....
Found download URL: https://www.abs.gov.au/methodologies/data-region-methodology/2011-24/14100DO0003_2011-24.xlsx


14100DO0003_2011-24.xlsx: 100%|██████████| 19.7M/19.7M [00:02<00:00, 7.83MB/s]


✅ Download completed: 14100DO0003_2011-24.xlsx


<h1>Data Integration and Cleaning Data </h1>

In [250]:
class cleaningData:
    def __init__(self):
        pass

    def show_null_data(self, data:pd.DataFrame):
        '''
            Showing Null Data for each column in a single dataframe

            Args:
                data: Dataframe to analyze

        '''
        null_count = data.isna().sum()
        null_percent = data.isna().sum() / len(data)
        null_percent = null_percent.apply(lambda x: f"{x:.1%}")
        print("Total Duplicated Count: ", data.duplicated().sum())
        results = pd.concat([null_count, null_percent], axis=1)
        results.columns = ['Null Total Count', 'Null Percentage']
        return results
    
    def split_categorical_numerical(self, data:pd.DataFrame):
        '''
            Spliting Categorical and Numerical Columns

            Args:
                data: Dataframe
        '''
        numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
        categorical_cols = data.select_dtypes(include=['object', 'category']).columns
        
        return numerical_cols, categorical_cols
    
    def replacing_value(self,df:pd.DataFrame, existing_value, new_value):
        df.replace(existing_value, new_value, inplace=True)
        return None

    def filling_null_value(self, df:pd.DataFrame, replacing_value):
        cat_imputer = SimpleImputer(strategy='constant', fill_value="Unknown")
        df = cat_imputer.fit_transform(df)
    
    def clean_name(self, name: str) -> str:
        """
        Clean up company suffixes and brackets from a power station name.
        """
        if not isinstance(name, str):
            return name
        
        # Remove Pty Ltd, Ltd, Limited (case-insensitive)
        name = re.sub(r"\b(Pty|Ltd|Limited)\b", "", name, flags=re.IGNORECASE)

        # Remove anything inside parentheses (...)
        name = re.sub(r"\(.*?\)", "", name)

        # Remove multiple spaces and strip edges
        name = re.sub(r"\s+", " ", name).strip()
        return name


class ConnectDB:
    """DuckDB database connection and operations."""

    SQL_TYPES = {
        'object': 'VARCHAR',
        'int64': 'INTEGER',
        'float64': 'DOUBLE',
        'bool': 'BOOLEAN',
        'datetime64[ns]': 'TIMESTAMP'
    }

    def __init__(self, db_path: str = 'my_database.duckdb'):
        self.connection = duckdb.connect(db_path)

    def create_table(self, df: pd.DataFrame, table_name: str):
        """
        Create a new table in DuckDB based on DataFrame columns.

        Args:
            df: DataFrame to convert to table.
            table_name: Name of the table to create.
        """
        try:
            columns_sql = []
            for col_name, dtype in df.dtypes.items():
                sql_type = self.SQL_TYPES.get(str(dtype), 'VARCHAR')
                columns_sql.append(f'"{col_name}" {sql_type}')

            columns_str = ",\n    ".join(columns_sql)
            sql = f"CREATE TABLE IF NOT EXISTS {table_name} (\n    {columns_str}\n);"
            self.connection.execute(sql)

        except Exception as e:
            print("Error creating table:", e)

    def insert_data(self, df: pd.DataFrame, table_name: str, column_map: dict):
        """
        Insert DataFrame into DuckDB table with optional column mapping.

        Args:
            df: DataFrame to insert.
            table_name: Name of the target table.
            column_map: Mapping from DataFrame columns to table columns.
                        Example: {'df_col1': 'table_colA'}
        """
        try:
            select_clause = ", ".join([f'"{df_col}" AS "{table_col}"' for df_col, table_col in column_map.items()])
            self.connection.register("df_temp", df)
            sql = f"""
                INSERT INTO {table_name} ({', '.join(column_map.values())})
                SELECT {select_clause} FROM df_temp;
            """
            self.connection.execute(sql)
        except Exception as e:
            print("Error inserting data:", e)
        


        


In [220]:


# first url file
df_nger = pd.read_csv("NGER.ID0243.csv")

# second url file
df_cer_1_approved = pd.read_csv("power-stations-and-projects-accredited.csv")
df_cer_1_commited = pd.read_csv("power-stations-and-projects-committed.csv")
df_cer_1_probable = pd.read_csv("power-stations-and-projects-probable.csv")

# third url file
df_abs = pd.ExcelFile("14100DO0003_2011-24.xlsx")
statistical_area = pd.read_excel(df_abs, 'Table 1', header=[5,6])
lga_area = pd.read_excel(df_abs, 'Table 2', header=[5,6])





#### Cleaning the NGER.CSV dataset and put it into DB

In [262]:
# module for Cleaning Data
cleaning_data = cleaningData()


df_nger_numerical, df_nger_categorical = cleaning_data.split_categorical_numerical(df_nger) 
cleaning_data.replacing_value(df_nger, '-', np.nan)

df_nger_cleaning = df_nger.drop(columns=['Electricity production GJ', 'Total emissions t CO2 e'])
cleaning_data.filling_null_value(df_nger_cleaning[df_nger_categorical], "Unknown")

df_nger_cleaning.describe()



Unnamed: 0,Electricity production MWh,Total scope 1 emissions t CO2 e,Total scope 2 emissions t CO2 e,Emission intensity t CO2 e MWh
count,775.0,775.0,775.0,624.0
mean,565988.1,360211.1,5419.553548,0.330769
std,2308429.0,2104691.0,43243.487455,0.50401
min,0.0,0.0,0.0,0.0
25%,4636.0,24.0,0.0,0.0
50%,72328.0,308.0,29.0,0.07
75%,290367.0,3538.5,614.5,0.66
max,37372270.0,32840740.0,691029.0,8.57


In [267]:
cleaning_data.replacing_value(df_cer_1_approved, '-', np.nan)

df_cer_1_approved['power_station_name_clean'] = (
    df_cer_1_approved['Power station name']
    .str.split('-').str[0]   # keep only before "-"
    .apply(cleaning_data.clean_name)  # clean each value
)
cleaning_data.show_null_data(df_cer_1_approved)


Total Duplicated Count:  0


Unnamed: 0,Null Total Count,Null Percentage
Accreditation code,0,0.0%
Power station name,0,0.0%
State,0,0.0%
Postcode,0,0.0%
Installed capacity (MW),0,0.0%
Fuel Source (s),0,0.0%
Accreditation start date,0,0.0%
Approval date,0,0.0%
power_station_name_clean,0,0.0%
Latitude,184,65.7%


In [264]:
cleaning_data.replacing_value(df_cer_1_commited, '-', np.nan)

cleaning_data.show_null_data(df_cer_1_commited)

Total Duplicated Count:  0


Unnamed: 0,Null Total Count,Null Percentage
Project Name,0,0.0%
State,0,0.0%
MW Capacity,0,0.0%
Fuel Source,0,0.0%
Committed Date (Month/Year),0,0.0%


In [255]:
cleaning_data.replacing_value(df_cer_1_probable, '-', np.nan)
cleaning_data.show_null_data(df_cer_1_probable)

Total Duplicated Count:  0


Unnamed: 0,Null Total Count,Null Percentage
Project Name,0,0.0%
State,0,0.0%
MW Capacity,0,0.0%
Fuel Source,0,0.0%


In [256]:
category_question = defaultdict(list)
for cat, q in statistical_area.columns:
    category_question[cat].append(q)
category_question = dict(category_question)

In [257]:
# only get the important rows
statistical_area = statistical_area.iloc[:-7]
lga_area = lga_area.iloc[:-7]

In [258]:
cleaning_data.replacing_value(statistical_area, '-', np.nan)

statistical_area[("Unnamed: 2_level_0", "Year")] = pd.to_numeric(statistical_area[("Unnamed: 2_level_0", "Year")], errors="coerce").astype("Int64") 
statistical_area.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Business entries - year ended 30 June,Business entries - year ended 30 June,...,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter,Estimated dwelling stock - as at June quarter
Unnamed: 0_level_1,Code,Label,Year,Number of non-employing businesses,Number of employing businesses: 1-4 employees,Number of employing businesses: 5-19 employees,Number of employing businesses: 20 or more employees,Total number of businesses,Number of non-employing business entries,Number of employing business entries: 1-4 employees,...,Houses - total (no.),Townhouses - additions (no.),Townhouses - removals (no.),Townhouses - total (no.),Apartments - additions (no.),Apartments - removals (no.),Apartments - total (no.),Total dwelling additions (no.),Total dwelling removals (no.),Total dwellings (no.)
0,AUS,Australia,2011,,,,,,,,...,,,,,,,,,,
1,AUS,Australia,2016,,,,,,,,...,,,,,,,,,,
2,AUS,Australia,2017,,,,,,,,...,7279135.0,31729.0,480.0,1261457.0,73838.0,564.0,1464325.0,218884.0,19586.0,10022775.0
3,AUS,Australia,2018,,,,,,,,...,7371408.0,33346.0,919.0,1293884.0,66330.0,605.0,1530050.0,213652.0,23181.0,10213246.0
4,AUS,Australia,2019,,,,,,,,...,7472853.0,32571.0,712.0,1325743.0,60705.0,985.0,1589770.0,215556.0,22394.0,10406408.0


In [259]:
cleaning_data.replacing_value(lga_area, '-', np.nan)

lga_area[("Unnamed: 2_level_0", "Year")] = pd.to_numeric(lga_area[("Unnamed: 2_level_0", "Year")], errors="coerce").astype("Int64") 
lga_area.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Number of businesses - at 30 June,Business entries - year ended 30 June,Business entries - year ended 30 June,...,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census,Industry of employment - Persons aged 15 years and over - Census
Unnamed: 0_level_1,Code,Label,Year,Number of non-employing businesses,Number of employing businesses: 1-4 employees,Number of employing businesses: 5-19 employees,Number of employing businesses: 20 or more employees,Total number of businesses,Number of non-employing business entries,Number of employing business entries: 1-4 employees,...,"Rental, hiring and real estate services (%)","Professional, scientific and technical services (%)",Administrative and support services (%),Public administration and safety (%),Education and training (%),Health care and social assistance (%),Arts and recreation services (%),Other services (%),Industry of employment inadequately described or not stated (%),Total persons employed aged 15 years and over (no.)
0,10050,Albury,2011,,,,,,,,...,1.3,4.5,2.8,7.5,8.7,13.7,1.0,4.0,2.1,22434.0
1,10050,Albury,2016,,,,,,,,...,1.3,4.3,3.0,7.3,9.6,15.2,0.9,4.1,3.6,22901.0
2,10050,Albury,2017,,,,,,,,...,,,,,,,,,,
3,10050,Albury,2018,,,,,,,,...,,,,,,,,,,
4,10050,Albury,2019,,,,,,,,...,,,,,,,,,,


### Data Augmentation

In [260]:
class GeoAPI:
    """Simple wrapper for Nominatim geocoding API."""
    
    BASE_URL = "https://nominatim.openstreetmap.org/search"

    def __init__(self, user_agent: str = "powerstation-geocoder"):
        self.headers = {"User-Agent": user_agent}

    def get_coordinates(self, query: str):
        """Return (latitude, longitude) for a query, or (None, None) if not found."""
        params = {"q": query, "format": "json", "limit": 1}
        
        try:
            response = requests.get(self.BASE_URL, params=params, headers=self.headers, timeout=10)
            response.raise_for_status()
            data = response.json()
            if data:
                lat, lon = float(data[0]["lat"]), float(data[0]["lon"])
                return lat, lon
        except requests.RequestException as e:
            print(f"Error geocoding '{query}': {e}")
        except Exception as err:
            print(err)
        
        return None, None
    
    import pandas as pd
import requests
import time

class GeoAPI:
    BASE_URL = "https://nominatim.openstreetmap.org/search"
    headers = {"User-Agent": "powerstation-geocoder"}

    def get_coordinates(self, query: str):
        """Return (latitude, longitude) for a query, or (None, None) if not found."""
        params = {"q": query, "format": "json", "limit": 1}
        try:
            response = requests.get(self.BASE_URL, params=params, headers=self.headers, timeout=10)
            response.raise_for_status()
            data = response.json()
            print(query, data)
            if data:
                lat, lon = float(data[0]["lat"]), float(data[0]["lon"])
                return lat, lon
        except requests.RequestException as e:
            print(f"⚠️ Error geocoding '{query}': {e}")
        except Exception as err:
            print(f"⚠️ Unexpected error for '{query}': {err}")
        return None, None

    def geocode_dataframe(self, df: pd.DataFrame, col_names, delay: float = 0):
        """
        Add latitude and longitude columns to a dataframe.

        Args:
            df: DataFrame with location columns.
            col_names: List of columns to combine into query (e.g. ['Power station name', 'State']).
            delay: Delay (in seconds) between requests to avoid API ban.
        """
        if isinstance(col_names, str):
            col_names = [col_names]  # make it a list if single column

        lats, lons = [], []
        for _, row in df.iterrows():
            # Build query by joining column values
            parts = [str(row[col]) for col in col_names if pd.notna(row[col])]
            query = ", ".join(parts)

            if not query:
                lats.append(None)
                lons.append(None)
                continue

            lat, lon = self.get_coordinates(query)
            lats.append(lat)
            lons.append(lon)

        df["Latitude"] = lats
        df["Longitude"] = lons
        return df


In [265]:
api = GeoAPI()
api.geocode_dataframe(df=df_cer_1_approved, col_names=["power_station_name_clean", "State"])



Laura Johnson Home, Townview, QLD []
Leppington, NSW [{'place_id': 19425092, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 5519705, 'lat': '-33.9650028', 'lon': '150.8011028', 'class': 'boundary', 'type': 'administrative', 'place_rank': 18, 'importance': 0.2936886391571847, 'addresstype': 'suburb', 'name': 'Leppington', 'display_name': 'Leppington, Sydney, Camden Council, New South Wales, 2179, Australia', 'boundingbox': ['-34.0043308', '-33.9483242', '150.7594078', '150.8413822']}]
Quakers Hillside Care Community, NSW [{'place_id': 18942585, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'node', 'osm_id': 2918464324, 'lat': '-33.7197167', 'lon': '150.8922198', 'class': 'highway', 'type': 'bus_stop', 'place_rank': 30, 'importance': 8.246051728079679e-05, 'addresstype': 'highway', 'name': 'Quakers Hillside Care Community, Hambledon Rd', 'display_name': 'Quakers Hillsi

Unnamed: 0,Accreditation code,Power station name,State,Postcode,Installed capacity (MW),Fuel Source (s),Accreditation start date,Approval date,power_station_name_clean,Latitude,Longitude
0,SRPXQLE8,"Laura Johnson Home, Townview - Solar w SGU - QLD",QLD,4825,0.2265,Solar,15/10/2024,13/01/2025,"Laura Johnson Home, Townview",,
1,SRPYNS39,Leppington - Solar - NSW,NSW,2179,0.7320,Solar,22/11/2024,13/01/2025,Leppington,-33.965003,150.801103
2,SRPYNS58,Quakers Hillside Care Community - Solar w SGU ...,NSW,2763,0.1996,Solar,19/12/2024,13/01/2025,Quakers Hillside Care Community,-33.719717,150.892220
3,SRPXVCN4,Rest Nominees - Solar wSGU - VIC,VIC,3008,0.1188,Solar,20/09/2024,13/01/2025,Rest Nominees,,
4,SRPXQLF9,Retail First Mt Ommaney-Solar-QLD,QLD,4074,1.0004,Solar,29/10/2024,13/01/2025,Retail First Mt Ommaney,,
...,...,...,...,...,...,...,...,...,...,...,...
275,SRPXVCT8,Liuzzi 71 Gower St - Solar - VIC,VIC,3072,0.4000,Solar,21/07/2025,27/08/2025,Liuzzi 71 Gower St,,
276,SRPXQLM5,Mercy Community - Solar w SGU - QLD,QLD,4014,0.8530,Solar,4/08/2025,27/08/2025,Mercy Community,,
277,SRPXVCP9,Norther Airfield - Solar - VIC,VIC,3045,11.2780,Solar,1/06/2025,27/08/2025,Norther Airfield,,
278,SRPVWAN8,Rose Farms - Solar wSGU- WA,WA,6220,0.2000,Solar,4/08/2025,27/08/2025,Rose Farms,47.243282,-0.700518


In [266]:
df_cer_1_approved.to_csv('output2.csv', index=False)

#### Process Creating Schema in Duck DB

In [None]:
# module related for database
database = ConnectDB()

column_name_facility = ['id','reporting_entity','facility_name','type','state',
                        'electricity_production_mwh','scope_1_emission', 
                        'scope_2_emission','emission_intesity_mwh','grid_connected',
                        'grid','primary_fuel', 'important_notes']

column_names_power_stations = ['id', 'accreditation_code', 'power_station_name','state','postcode',
                               'capacity_mw','source', 'accreditation_start_date','approval_date','commited_date','status']

