<a href="https://colab.research.google.com/github/redaxe101/MastersThesisNotebook/blob/main/UnzipNEMForecasts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## National Electricity Market Pre-processing

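Pre-processes the nested NEM (National Electricity Market) pre-dispatch ZIP archives: for each archive it extracts the `PREDISPATCH` `REGION_SOLUTION` and `REGION_PRICES` records, tags every row with its run datetime, and appends the rows to per-region CSV files.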

Data downloaded from https://visualisations.aemo.com.au/aemo/nemweb/index.html#mms-data-model

In [None]:
from google.colab import drive
import os, zipfile, csv
from collections import defaultdict

# Attach Google Drive to the Colab VM so the archives are reachable by path.
drive.mount('/content/drive')

# Input archives live under <Drive>/NEM; extracted CSVs go to a sub-folder.
drive_root = "/content/drive/MyDrive"
source_dir = os.path.join(drive_root, "NEM")
output_dir = os.path.join(source_dir, "older_pre_dispatch/output")
os.makedirs(output_dir, exist_ok=True)

# The log holds one archive name per line; anything listed there was fully
# handled by an earlier run and is skipped this time.
log_path = os.path.join(output_dir, "processed_zips.log")
if os.path.exists(log_path):
    with open(log_path, "r") as log_file:
        processed_zips = {entry.strip() for entry in log_file}
else:
    processed_zips = set()

# Column headers are captured from the first matching header row encountered.
region_headers = []
price_headers = []
got_headers = False
got_price_headers = False

# Every *.zip directly inside source_dir, in sorted name order.
zip_files = [
    entry for entry in os.listdir(source_dir)
    if entry.lower().endswith(".zip")
]
zip_files.sort()

# Rows whose run datetime does not sort after this ISO date are dropped.
_RUN_DT_CUTOFF = "2024-06-01"


def _after_cutoff(run_dt):
    """Return True when *run_dt* is later than the cutoff, or is "UNKNOWN".

    The comparison is lexicographic, so the separators must match the cutoff's
    dashes: '/' sorts after '-', which would otherwise let every slash-formatted
    date of the cutoff year through regardless of month.
    NOTE(review): assumes timestamps are year-first (YYYY/MM/DD or YYYY-MM-DD)
    -- confirm against the source CSVs.
    """
    if run_dt == "UNKNOWN":
        # Rows with no matching CASE_SOLUTION were always kept (tagged
        # "UNKNOWN"); preserve that rather than silently dropping data.
        return True
    return run_dt.replace("/", "-") > _RUN_DT_CUTOFF


def _iter_nested_csv_rows(zip_path):
    """Yield every non-empty CSV row from the ZIPs nested inside *zip_path*.

    A nested archive that cannot be read is reported and skipped. A corrupt
    outer archive raises ``zipfile.BadZipFile`` to the caller.
    """
    with zipfile.ZipFile(zip_path, 'r') as outer:
        for nested_name in outer.namelist():
            if not nested_name.lower().endswith(".zip"):
                continue

            with outer.open(nested_name) as nested_zip_file:
                try:
                    with zipfile.ZipFile(nested_zip_file) as nested:
                        for csv_name in nested.namelist():
                            if not csv_name.lower().endswith(".csv"):
                                continue

                            print(f"   → Reading CSV: {csv_name}")
                            with nested.open(csv_name) as file:
                                decoded = (line.decode("utf-8") for line in file)
                                for row in csv.reader(decoded):
                                    if row:
                                        yield row
                except zipfile.BadZipFile:
                    print(f"⚠️ Could not open nested ZIP: {nested_name}")


def _append_rows(out_path, header, records):
    """Append *records* to *out_path*, writing *header* first if the file is new."""
    write_header = not os.path.exists(out_path)
    with open(out_path, "a", newline="") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(header)
        writer.writerows(records)


# Process each not-yet-logged archive: collect its PREDISPATCH region rows,
# append them to the per-region CSVs, then record the archive as done.
for filename in zip_files:
    if filename in processed_zips:
        continue

    zip_path = os.path.join(source_dir, filename)
    print(f"📦 Processing: {filename}")

    # Field 4 of a CASE_SOLUTION row -> its second-to-last field (run datetime).
    run_datetime_map = {}
    region_data = defaultdict(list)
    rrp_data = defaultdict(list)
    # Record type -> per-region accumulator for the two tables being extracted.
    targets = {"REGION_SOLUTION": region_data, "REGION_PRICES": rrp_data}

    try:
        for row in _iter_nested_csv_rows(zip_path):
            # A row without a record-type field cannot be classified; skipping
            # it avoids an IndexError that would abort the whole run.
            if len(row) < 3:
                continue
            row_kind, record_type = row[0], row[2]

            if row_kind == "I":
                # Header rows: keep the first one seen for each table.
                if record_type == "REGION_SOLUTION" and not got_headers:
                    region_headers = ["RUN_DATETIME"] + row
                    got_headers = True
                elif record_type == "REGION_PRICES" and not got_price_headers:
                    price_headers = ["RUN_DATETIME"] + row
                    got_price_headers = True
                continue

            if row_kind.startswith(("C", "I")) or row[1] != "PREDISPATCH":
                continue

            if record_type == "CASE_SOLUTION":
                if len(row) > 4:
                    run_datetime_map[row[4]] = row[-2]
            elif record_type in targets:
                # Needs field 4 (sequence number) and field 6 (region).
                if len(row) > 6:
                    run_dt = run_datetime_map.get(row[4], "UNKNOWN")
                    if _after_cutoff(run_dt):
                        targets[record_type][row[6]].append([run_dt] + row)
    except zipfile.BadZipFile:
        # Nothing is written or logged, so the archive is retried next run.
        print(f"⚠️ Could not open main ZIP: {filename}")
        continue

    # Save per region
    for region, records in region_data.items():
        print(f"   → Writing {len(records)} rows for {region}")
        _append_rows(
            os.path.join(output_dir, f"{region}_pre_dispatch.csv"),
            region_headers,
            records,
        )

    # Save prices per region
    for region, records in rrp_data.items():
        print(f"   → Writing {len(records)} pricing rows for {region}")
        _append_rows(
            os.path.join(output_dir, f"{region}_prices.csv"),
            price_headers,
            records,
        )

    # Log the processed zip
    with open(log_path, "a") as log_file:
        log_file.write(filename + "\n")

print("✅ All ZIPs processed.")


Mounted at /content/drive
✅ All ZIPs processed.
