In [2]:
import os
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import pandas as pd

In [41]:
# --- Config ---
# Remote directory listing that is scraped for zip archives.
BASE_URL = "https://nemweb.com.au/Reports/Archive/DispatchIS_Reports/"
# Local folder that receives the downloaded zip archives.
DOWNLOAD_DIR = "../data/dispatchis_zips"
# Local folder that receives the extracted files.
EXTRACT_DIR = "../data/dispatchis_csvs"
# When True, zip files nested inside a downloaded archive are unpacked as well.
EXTRACT_INNER_ZIPS = True

# --- Create folders ---
# exist_ok=True keeps this cell safe to re-run.
for _folder in (DOWNLOAD_DIR, EXTRACT_DIR):
    os.makedirs(_folder, exist_ok=True)

In [5]:
# --- Get the webpage ---
print(f"Fetching file list from {BASE_URL}...")
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an exception instead of
# letting it be parsed as a listing that silently yields zero links.
response = requests.get(BASE_URL, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# --- Find all .zip links ---
# Anchors without an href are skipped; the extension match is case-insensitive.
links = [
    href
    for href in (anchor.get("href") for anchor in soup.find_all("a"))
    if href and href.lower().endswith(".zip")
]

print(f"Found {len(links)} zip files to download.")

Fetching file list from https://nemweb.com.au/Reports/Archive/DispatchIS_Reports/...
Found 395 zip files to download.


In [8]:
# --- Download all zips ---
# Each archive is streamed to "<name>.part" and only renamed to its final name
# once the download has completed. An interrupted or failed download therefore
# never leaves a truncated .zip behind that the "already downloaded" check
# below would skip on the next run.
for link in links:
    zip_name = os.path.basename(link)
    # NOTE(review): assumes every href is root-relative ("/Reports/...") — confirm.
    file_url = "https://nemweb.com.au" + link
    local_zip_path = os.path.join(DOWNLOAD_DIR, zip_name)

    if os.path.exists(local_zip_path):
        print(f"Already downloaded {zip_name}, skipping...")
        continue

    print(f"Downloading {zip_name}...")
    partial_path = local_zip_path + ".part"
    try:
        # timeout bounds the connect and per-read waits so a stalled transfer
        # raises instead of blocking the notebook indefinitely.
        with requests.get(file_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, local_zip_path)
    except BaseException:
        # Also covers KeyboardInterrupt: remove the incomplete file, then re-raise.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

print("All zip files downloaded.")

Downloading PUBLIC_DISPATCHIS_20240326.zip...
Downloading PUBLIC_DISPATCHIS_20240327.zip...
Downloading PUBLIC_DISPATCHIS_20240328.zip...
Downloading PUBLIC_DISPATCHIS_20240329.zip...
Downloading PUBLIC_DISPATCHIS_20240330.zip...
Downloading PUBLIC_DISPATCHIS_20240331.zip...
Downloading PUBLIC_DISPATCHIS_20240401.zip...
Downloading PUBLIC_DISPATCHIS_20240402.zip...
Downloading PUBLIC_DISPATCHIS_20240403.zip...
Downloading PUBLIC_DISPATCHIS_20240404.zip...
Downloading PUBLIC_DISPATCHIS_20240405.zip...
Downloading PUBLIC_DISPATCHIS_20240406.zip...
Downloading PUBLIC_DISPATCHIS_20240407.zip...
Downloading PUBLIC_DISPATCHIS_20240408.zip...
Downloading PUBLIC_DISPATCHIS_20240409.zip...
Downloading PUBLIC_DISPATCHIS_20240410.zip...
Downloading PUBLIC_DISPATCHIS_20240411.zip...
Downloading PUBLIC_DISPATCHIS_20240412.zip...
Downloading PUBLIC_DISPATCHIS_20240413.zip...
Downloading PUBLIC_DISPATCHIS_20240414.zip...
Downloading PUBLIC_DISPATCHIS_20240415.zip...
Downloading PUBLIC_DISPATCHIS_2024

In [14]:
# --- Extract ZIP files ---
# Every archive in DOWNLOAD_DIR is unpacked into EXTRACT_DIR. Members that are
# themselves zip files are unpacked in memory when EXTRACT_INNER_ZIPS is set;
# all other members are extracted as-is.
print("Extracting zip files...")
# sorted() makes the processing order deterministic (os.listdir order is
# unspecified).
for zip_filename in sorted(os.listdir(DOWNLOAD_DIR)):
    # Skip anything that is not a zip archive (hidden files, partial
    # downloads, ...); handing those to ZipFile would raise BadZipFile.
    if not zip_filename.lower().endswith(".zip"):
        continue

    print(f"Processing {zip_filename}...")
    zip_path = os.path.join(DOWNLOAD_DIR, zip_filename)

    with zipfile.ZipFile(zip_path, "r") as outer_zip:
        for inner_zip_name in outer_zip.namelist():
            if EXTRACT_INNER_ZIPS and inner_zip_name.lower().endswith(".zip"):
                # Read the nested archive into memory and unpack it directly,
                # so the intermediate zip never touches the disk.
                inner_zip_bytes = io.BytesIO(outer_zip.read(inner_zip_name))
                with zipfile.ZipFile(inner_zip_bytes) as inner_zip:
                    inner_zip.extractall(EXTRACT_DIR)
            else:
                outer_zip.extract(inner_zip_name, path=EXTRACT_DIR)

print("All files extracted!")

Extracting zip files...
Processing PUBLIC_DISPATCHIS_20240326.zip...
Processing PUBLIC_DISPATCHIS_20240327.zip...
Processing PUBLIC_DISPATCHIS_20240328.zip...
Processing PUBLIC_DISPATCHIS_20240329.zip...
Processing PUBLIC_DISPATCHIS_20240330.zip...
Processing PUBLIC_DISPATCHIS_20240331.zip...
Processing PUBLIC_DISPATCHIS_20240401.zip...
Processing PUBLIC_DISPATCHIS_20240402.zip...
Processing PUBLIC_DISPATCHIS_20240403.zip...
Processing PUBLIC_DISPATCHIS_20240404.zip...
Processing PUBLIC_DISPATCHIS_20240405.zip...
Processing PUBLIC_DISPATCHIS_20240406.zip...
Processing PUBLIC_DISPATCHIS_20240407.zip...
Processing PUBLIC_DISPATCHIS_20240408.zip...
Processing PUBLIC_DISPATCHIS_20240409.zip...
Processing PUBLIC_DISPATCHIS_20240410.zip...
Processing PUBLIC_DISPATCHIS_20240411.zip...
Processing PUBLIC_DISPATCHIS_20240412.zip...
Processing PUBLIC_DISPATCHIS_20240413.zip...
Processing PUBLIC_DISPATCHIS_20240414.zip...
Processing PUBLIC_DISPATCHIS_20240415.zip...
Processing PUBLIC_DISPATCHIS_20

In [19]:
# Sanity check: number of extracted files versus the expected total
# (395 archives were found by the listing step; 288 files are assumed per
# archive — TODO confirm this holds for every day).
x = 395 * 288
n_extracted = len(os.listdir(EXTRACT_DIR))
count_matches = x == n_extracted
print(
    n_extracted,
    "files extracted to",
    EXTRACT_DIR,
    f"\nCorrect expected number of files: {count_matches}",
)

113760 files extracted to ../data/dispatchis_csvs 
Correct expected number of files: True


In [4]:
EXTRACT_DIR = "../data/dispatchis_csvs"
OUTPUT_CSV = "../data/nem_ugif_interchange_data.csv"
COLUMNS_TO_KEEP = [
    "SETTLEMENTDATE",
    "REGIONID",
    "SS_SOLAR_UIGF",
    "SS_WIND_UIGF",
    "NETINTERCHANGE",
]
REGIONS_OF_INTEREST = ["NSW1", "QLD1", "SA1", "TAS1", "VIC1"]


def _parse_regionsum_block(lines):
    """Extract the rows of interest from the first REGIONSUM block of a file.

    The first line containing "REGIONSUM" is treated as the header. Every
    later line is split on commas and kept when its REGIONID field is one of
    REGIONS_OF_INTEREST. Values are kept as raw strings, including any
    surrounding double quotes.

    Args:
        lines: The text lines of one extracted CSV file.

    Returns:
        A DataFrame restricted to COLUMNS_TO_KEEP (possibly empty), or None
        when no line contains "REGIONSUM".

    Raises:
        ValueError: If the header lacks REGIONID or any of COLUMNS_TO_KEEP.
    """
    for header_pos, line in enumerate(lines):
        if "REGIONSUM" not in line:
            continue

        header = line.strip().split(",")
        missing = [col for col in ["REGIONID", *COLUMNS_TO_KEEP] if col not in header]
        if missing:
            raise ValueError(f"REGIONSUM header is missing columns: {missing}")

        region_idx = header.index("REGIONID")
        n_cols = len(header)
        wanted_regions = set(REGIONS_OF_INTEREST)

        # NOTE(review): every line after the header is scanned, not only the
        # rows of this block; rows from later sections are expected to be
        # rejected by the REGIONID test — confirm against the file layout.
        rows = []
        for raw in lines[header_pos + 1 :]:
            fields = raw.strip().split(",")
            # Rows with more fields than the header cannot be described by it;
            # leaving them in would make the DataFrame constructor raise and
            # lose the whole file.
            if not region_idx < len(fields) <= n_cols:
                continue
            if fields[region_idx] not in wanted_regions:
                continue
            # Pad short rows so every row matches the header width.
            rows.append(fields + [None] * (n_cols - len(fields)))

        return pd.DataFrame(rows, columns=header)[COLUMNS_TO_KEEP]

    return None


df_list = []
failed_files = []  # files that could not be parsed, reported at the end

# sorted() gives a reproducible file (and therefore row) order; os.listdir
# order is unspecified.
for filename in sorted(os.listdir(EXTRACT_DIR)):
    if not filename.endswith(".CSV"):
        continue

    print(f"Processing {filename}...")
    file_path = os.path.join(EXTRACT_DIR, filename)
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            lines = f.readlines()

        df_selected = _parse_regionsum_block(lines)
        if df_selected is None:
            raise ValueError("no REGIONSUM header found")
        df_list.append(df_selected)

    except Exception as e:
        # Best effort: keep going, but record the failure so it is not lost
        # in the per-file log output.
        print(f"Error processing {filename}: {e}")
        failed_files.append(filename)

if failed_files:
    print(f"{len(failed_files)} file(s) could not be processed.")

# Combine and save
if df_list:
    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved combined UIGF data to {OUTPUT_CSV}")
else:
    print("No valid data found.")

Processing PUBLIC_DISPATCHIS_202403260005_0000000414673000.CSV...
Processing PUBLIC_DISPATCHIS_202403260010_0000000414673264.CSV...
Processing PUBLIC_DISPATCHIS_202403260015_0000000414673557.CSV...
Processing PUBLIC_DISPATCHIS_202403260020_0000000414673801.CSV...
Processing PUBLIC_DISPATCHIS_202403260025_0000000414674068.CSV...
Processing PUBLIC_DISPATCHIS_202403260030_0000000414674310.CSV...
Processing PUBLIC_DISPATCHIS_202403260035_0000000414674599.CSV...
Processing PUBLIC_DISPATCHIS_202403260040_0000000414674879.CSV...
Processing PUBLIC_DISPATCHIS_202403260045_0000000414675154.CSV...
Processing PUBLIC_DISPATCHIS_202403260050_0000000414675381.CSV...
Processing PUBLIC_DISPATCHIS_202403260055_0000000414675647.CSV...
Processing PUBLIC_DISPATCHIS_202403260100_0000000414675885.CSV...
Processing PUBLIC_DISPATCHIS_202403260105_0000000414676190.CSV...
Processing PUBLIC_DISPATCHIS_202403260110_0000000414676530.CSV...
Processing PUBLIC_DISPATCHIS_202403260115_0000000414676823.CSV...
Processing

In [5]:
# Quick inspection of the per-file frames: how many there are, what the first
# one looks like, and which row counts occur.
n_frames = len(df_list)
print(n_frames, "files processed")

test_df = df_list[0]
display(test_df.head())

# Row count of every frame, and the distinct values among them.
df_len = list(map(len, df_list))
set_df_len = set(df_len)
print("Unique lengths of DataFrames:", set_df_len)

113760 files processed


Unnamed: 0,SETTLEMENTDATE,REGIONID,SS_SOLAR_UIGF,SS_WIND_UIGF,NETINTERCHANGE
0,"""2024/03/26 00:05:00""",NSW1,0,270.18949,49.65
1,"""2024/03/26 00:05:00""",QLD1,0,293.74,117.92
2,"""2024/03/26 00:05:00""",SA1,0,423.69867,-571.16
3,"""2024/03/26 00:05:00""",TAS1,0,276.74765,-100.33
4,"""2024/03/26 00:05:00""",VIC1,0,667.19433,549.11


Unique lengths of DataFrames: {10, 5}


In [7]:
# De-duplicate frames that hold more rows than there are regions.
# In those frames (SETTLEMENTDATE, REGIONID) pairs repeat; drop_duplicates
# keeps the first occurrence of each pair.
# NOTE(review): confirm that the first occurrence is the row that should be kept.
DEDUP_KEYS = ["SETTLEMENTDATE", "REGIONID"]
expected_len = len(REGIONS_OF_INTEREST)

# Number of oversized frames found by the previous inspection cell.
print(sum(1 for n_rows in df_len if n_rows > expected_len))

for i, df in enumerate(df_list):
    if len(df) > expected_len:
        print(f"Index {i}")
        df_list[i] = df.drop_duplicates(subset=DEDUP_KEYS)

12
Index 71037
Index 71038
Index 71039
Index 71040
Index 71041
Index 71042
Index 71043
Index 71044
Index 71045
Index 71046
Index 71047
Index 71048


In [8]:
# Re-measure every frame after de-duplication; no 10-row frame should remain.
df_len_updated = list(map(len, df_list))
set_df_len_updated = set(df_len_updated)
remaining_ten_row_frames = sum(1 for n_rows in df_len_updated if n_rows == 10)
print(remaining_ten_row_frames)
print("Unique lengths of DataFrames after dropping duplicates:", set_df_len_updated)

0
Unique lengths of DataFrames after dropping duplicates: {5}


In [9]:
# Stack the per-file frames into one table and verify the total row count.
combined_df = pd.concat(df_list, ignore_index=True)

# 395 archives x 288 files per archive x 5 regions per file.
# NOTE(review): 395 and 288 are hard-coded from the earlier download and
# extraction checks — update them if the archive range changes.
expected_rows = 395 * 288 * 5
actual_rows = len(combined_df)
print("Expected rows:", expected_rows)
print("Actual rows:", actual_rows)
print(f"Expected = Actual: {expected_rows == actual_rows}")

Expected rows: 568800
Actual rows: 568800
Expected = Actual: True


In [10]:
# Preview the first five rows of the combined table.
combined_df.head(n=5)

Unnamed: 0,SETTLEMENTDATE,REGIONID,SS_SOLAR_UIGF,SS_WIND_UIGF,NETINTERCHANGE
0,"""2024/03/26 00:05:00""",NSW1,0,270.18949,49.65
1,"""2024/03/26 00:05:00""",QLD1,0,293.74,117.92
2,"""2024/03/26 00:05:00""",SA1,0,423.69867,-571.16
3,"""2024/03/26 00:05:00""",TAS1,0,276.74765,-100.33
4,"""2024/03/26 00:05:00""",VIC1,0,667.19433,549.11


In [11]:
# Convert SETTLEMENTDATE from a quoted string ('"2024/03/26 00:05:00"') to a
# datetime. The dtype guard makes the cell safe to re-run: once the column is
# already datetime, the .str accessor would raise.
if not pd.api.types.is_datetime64_any_dtype(combined_df["SETTLEMENTDATE"]):
    combined_df["SETTLEMENTDATE"] = pd.to_datetime(
        combined_df["SETTLEMENTDATE"].str.strip('"'),
        format="%Y/%m/%d %H:%M:%S",
    )
combined_df.head()

Unnamed: 0,SETTLEMENTDATE,REGIONID,SS_SOLAR_UIGF,SS_WIND_UIGF,NETINTERCHANGE
0,2024-03-26 00:05:00,NSW1,0,270.18949,49.65
1,2024-03-26 00:05:00,QLD1,0,293.74,117.92
2,2024-03-26 00:05:00,SA1,0,423.69867,-571.16
3,2024-03-26 00:05:00,TAS1,0,276.74765,-100.33
4,2024-03-26 00:05:00,VIC1,0,667.19433,549.11


In [12]:
# Count missing values per column.
# NOTE(review): the value columns were built from split text, so an empty
# string would not be counted as missing here — confirm that is acceptable.
combined_df.isna().sum()

SETTLEMENTDATE    0
REGIONID          0
SS_SOLAR_UIGF     0
SS_WIND_UIGF      0
NETINTERCHANGE    0
dtype: int64

In [13]:
expected_regions = ["NSW1", "QLD1", "SA1", "TAS1", "VIC1"]
n_expected_regions = len(expected_regions)

# Number of distinct regions reported for each settlement timestamp.
region_counts = combined_df.groupby("SETTLEMENTDATE")["REGIONID"].nunique()

# Keep only the timestamps whose region count differs from the expected one.
missing_regions_times = region_counts.loc[region_counts.ne(n_expected_regions)]

print(f"Number of timestamps with missing regions: {len(missing_regions_times)}")
print(missing_regions_times)

Number of timestamps with missing regions: 0
Series([], Name: REGIONID, dtype: int64)


In [14]:
# Every region should appear once per extracted file (395 archives x 288 files).
expected_count = 395 * 288
region_counts = combined_df["REGIONID"].value_counts()
print(region_counts)
print("Expected count for each region:", expected_count)
all_counts_match = bool(region_counts.eq(expected_count).all())
print("All regions have the expected count:", all_counts_match)

REGIONID
NSW1    113760
QLD1    113760
SA1     113760
TAS1    113760
VIC1    113760
Name: count, dtype: int64
Expected count for each region: 113760
All regions have the expected count: True


In [15]:
# Write the cleaned table to OUTPUT_CSV, without the index column.
combined_df.to_csv(path_or_buf=OUTPUT_CSV, index=False)