In [1]:
import os
import re
import time
import pathlib
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_LIST_URL = "https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm"
OUT_DIR = pathlib.Path("downloads")           # where to save .txt files
PER_REQUEST_DELAY_SEC = 0.8                   # polite delay between requests
TIMEOUT = 30                                  # seconds
HEADERS = {"User-Agent": "Mozilla/5.0 (data-collection; contact: you@example.com)"}

# --- Helpers -----------------------------------------------------------------

def get_html(session: requests.Session, url: str) -> str:
    """Fetch HTML with basic error handling."""
    resp = session.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return resp.text

def find_publish_links(list_html: str, base_url: str) -> list[str]:
    """
    From the station list page (stage.cfm), return absolute URLs to publish pages like:
      /waterinformation/wir/reports/publish/601001/601001.htm
    """
    soup = BeautifulSoup(list_html, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Look for the publish pattern
        if re.search(r"/waterinformation/wir/reports/publish/\d{6}/\d{6}\.htm$", href):
            links.append(urljoin(base_url, href))
    # Deduplicate while preserving order
    seen = set()
    out = []
    for u in links:
        if u not in seen:
            seen.add(u)
            out.append(u)
    return out

def extract_site_id_from_url(url: str) -> str | None:
    m = re.search(r"/publish/(\d{6})/\1\.htm$", url)
    return m.group(1) if m else None

def find_last6days_stage_txt_url(publish_html: str, base_url: str) -> str | None:
    """
    On a publish page, find the anchor for:
    'Text Data File of Last 6 days of Stage Data' (case-insensitive).
    Fallback: find first .txt link that looks like Stage.
    """
    soup = BeautifulSoup(publish_html, "html.parser")
    # Primary: match the exact wording in a case-insensitive way
    for a in soup.find_all("a", href=True):
        text = (a.get_text(strip=True) or "").lower()
        if "text data file" in text and "last 6 days" in text and "stage" in text:
            return urljoin(base_url, a["href"])

    # Fallback: first .txt that mentions 'stage' or 'zipstage' in href or text
    for a in soup.find_all("a", href=True):
        href_l = a["href"].lower()
        text_l = (a.get_text(strip=True) or "").lower()
        if href_l.endswith(".txt") and ("stage" in href_l or "stage" in text_l or "zipstage" in href_l):
            return urljoin(base_url, a["href"])

    return None

def download_file(session: requests.Session, url: str, out_path: pathlib.Path) -> None:
    """Stream download to disk."""
    with session.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True) as r:
        r.raise_for_status()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

# --- Main workflow ------------------------------------------------------------

def crawl_all_last6days_stage_txt():
    session = requests.Session()

    print(f"[1/3] Fetching station list: {BASE_LIST_URL}")
    list_html = get_html(session, BASE_LIST_URL)
    publish_pages = find_publish_links(list_html, BASE_LIST_URL)
    print(f"    Found {len(publish_pages)} publish pages")

    downloaded = 0
    skipped = 0

    for i, pub_url in enumerate(publish_pages, start=1):
        site_id = extract_site_id_from_url(pub_url) or f"site{i:03d}"
        print(f"[2/3] ({i}/{len(publish_pages)}) Opening publish page for site {site_id}: {pub_url}")

        try:
            pub_html = get_html(session, pub_url)
        except Exception as e:
            print(f"    !! Failed to open publish page: {e}")
            skipped += 1
            time.sleep(PER_REQUEST_DELAY_SEC)
            continue

        txt_url = find_last6days_stage_txt_url(pub_html, pub_url)
        if not txt_url:
            print("    !! No 'Last 6 days of Stage Data' .txt link found on this page")
            skipped += 1
            time.sleep(PER_REQUEST_DELAY_SEC)
            continue

        # Name like 601001_stage_last6days.txt (preserve .txt)
        out_name = f"{site_id}_stage_last6days.txt"
        out_path = OUT_DIR / out_name
        print(f"[3/3] Downloading: {txt_url}  ->  {out_path}")

        try:
            download_file(session, txt_url, out_path)
            print("    OK")
            downloaded += 1
        except Exception as e:
            print(f"    !! Download failed: {e}")
            skipped += 1

        time.sleep(PER_REQUEST_DELAY_SEC)

    print(f"\nDone. Downloaded: {downloaded}, Skipped: {skipped}. Saved to: {OUT_DIR.resolve()}")

if __name__ == "__main__":
    crawl_all_last6days_stage_txt()

[1/3] Fetching station list: https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm
    Found 244 publish pages
[2/3] (1/244) Opening publish page for site 601001: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601001/601001.htm
[3/3] Downloading: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601001/zipstage.zip  ->  downloads/601001_stage_last6days.txt
    OK
[2/3] (2/244) Opening publish page for site 601004: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601004/601004.htm
[3/3] Downloading: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601004/zipstage.zip  ->  downloads/601004_stage_last6days.txt
    OK
[2/3] (3/244) Opening publish page for site 601005: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601005/601005.htm
[3/3] Downloading: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601005/zipstage.zip  ->  downloads/601005_stage_last6days.txt
    

In [4]:
# test_one_file.py
# Downloads ONE station's "Text Data File of Last 6 days of Stage Data"
# from https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm,
# auto-detects ZIP/GZIP/plain text, extracts to .txt, and prints first lines.

import io
import re
import gzip
import time
import zipfile
import pathlib
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_LIST_URL = "https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm"
OUT_DIR = pathlib.Path("downloads_test")
HEADERS = {"User-Agent": "Mozilla/5.0 (data-collection; contact: you@example.com)"}
TIMEOUT = 30
REQUEST_DELAY_SEC = 0.8  # polite delay

def get_html(session: requests.Session, url: str) -> str:
    """Fetch HTML with basic error handling."""
    r = session.get(url, headers=HEADERS, timeout=TIMEOUT)
    r.raise_for_status()
    return r.text

def find_first_publish_link(list_html: str, base_url: str) -> str | None:
    """
    From the station list page, find the first publish page link like:
      /waterinformation/wir/reports/publish/601001/601001.htm
    Returns absolute URL or None.
    """
    soup = BeautifulSoup(list_html, "html.parser")
    # strict pattern first
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if re.search(r"/waterinformation/wir/reports/publish/\d{6}/\d{6}\.htm$", href):
            return urljoin(base_url, href)
    # fallback: any publish link containing a 6-digit site id
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "/waterinformation/wir/reports/publish/" in href and re.search(r"/(\d{6})/", href):
            return urljoin(base_url, href)
    return None

def extract_site_id_from_publish_url(url: str) -> str | None:
    m = re.search(r"/publish/(\d{6})/\1\.htm$", url)
    if m:
        return m.group(1)
    m2 = re.search(r"/publish/(\d{6})/", url)
    return m2.group(1) if m2 else None

def find_stage_txt_url(publish_html: str, base_url: str) -> str | None:
    """
    On a publish page, find the anchor for:
      "Text Data File of Last 6 days of Stage Data" (case-insensitive).
    Fallback: first .txt link that looks like stage/zipstage.
    Returns absolute URL or None.
    """
    soup = BeautifulSoup(publish_html, "html.parser")
    # primary: semantic match on the link text
    for a in soup.find_all("a", href=True):
        text = (a.get_text(strip=True) or "").lower()
        if ("text data file" in text) and ("last 6 days" in text) and ("stage" in text):
            return urljoin(base_url, a["href"])
    # fallback: any .txt mentioning stage
    for a in soup.find_all("a", href=True):
        href_l = a["href"].lower()
        text_l = (a.get_text(strip=True) or "").lower()
        if href_l.endswith(".txt") and ("stage" in href_l or "stage" in text_l or "zipstage" in href_l):
            return urljoin(base_url, a["href"])
    return None

def download_and_extract_txt(session: requests.Session, url: str, out_txt_path: pathlib.Path) -> None:
    """
    Download and detect ZIP/GZIP/plain. Always write a real .txt to disk.
    """
    with session.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True) as r:
        r.raise_for_status()
        content = b"".join(r.iter_content(8192))

    is_zip = content[:2] == b"PK"
    is_gzip = content[:2] == b"\x1f\x8b"

    out_txt_path.parent.mkdir(parents=True, exist_ok=True)

    if is_zip:
        with zipfile.ZipFile(io.BytesIO(content)) as zf:
            names = zf.namelist()
            # prefer the largest .txt file in the archive
            txts = [n for n in names if n.lower().endswith(".txt")]
            chosen = max((txts or names), key=lambda n: zf.getinfo(n).file_size)
            data = zf.read(chosen)
        out_txt_path.write_bytes(data)
    elif is_gzip:
        data = gzip.decompress(content)
        out_txt_path.write_bytes(data)
    else:
        # likely plain text (could be HTML error page in edge cases)
        out_txt_path.write_bytes(content)

def print_head_lines(path: pathlib.Path, n: int = 20) -> None:
    """
    Print the first N lines safely. Tries UTF-8 first, falls back to cp1252.
    """
    raw = path.read_bytes()
    try:
        txt = raw.decode("utf-8-sig", errors="strict")
    except UnicodeDecodeError:
        txt = raw.decode("cp1252", errors="replace")
    lines = txt.splitlines()
    take = lines[:n]
    print("\n--- File head (first {} lines) ---".format(min(n, len(lines))))
    for i, line in enumerate(take, 1):
        print(f"{i:02d}: {line}")
    print("--- end head ---\n")

def test_one_file() -> None:
    session = requests.Session()

    print(f"[1/4] Fetching station list: {BASE_LIST_URL}")
    list_html = get_html(session, BASE_LIST_URL)
    time.sleep(REQUEST_DELAY_SEC)

    print("[2/4] Finding the first publish page link...")
    publish_url = find_first_publish_link(list_html, BASE_LIST_URL)
    if not publish_url:
        raise RuntimeError("Could not find a publish page link on the list page.")
    site_id = extract_site_id_from_publish_url(publish_url) or "unknown_site"
    print(f"     -> publish page: {publish_url} (site {site_id})")
    time.sleep(REQUEST_DELAY_SEC)

    print("[3/4] Opening publish page and locating the Stage TXT link...")
    publish_html = get_html(session, publish_url)
    txt_url = find_stage_txt_url(publish_html, publish_url)
    if not txt_url:
        raise RuntimeError("Could not find the 'Text Data File of Last 6 days of Stage Data' link.")
    print(f"     -> txt download URL: {txt_url}")
    time.sleep(REQUEST_DELAY_SEC)

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = OUT_DIR / f"{site_id}_stage_last6days.txt"

    print(f"[4/4] Downloading and extracting to: {out_path.resolve()}")
    download_and_extract_txt(session, txt_url, out_path)

    print("Done. Verifying file contents:")
    print_head_lines(out_path, n=20)
    print("✅ If the head looks like numeric CSV-ish rows (not binary/garbled), the download is valid.")

if __name__ == "__main__":
    test_one_file()

[1/4] Fetching station list: https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm
[2/4] Finding the first publish page link...
     -> publish page: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601001/601001.htm (site 601001)
[3/4] Opening publish page and locating the Stage TXT link...
     -> txt download URL: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601001/zipstage.zip
[4/4] Downloading and extracting to: /Users/babyll/Desktop/respondaa/responda/py_scripts/downloads_test/601001_stage_last6days.txt
Done. Verifying file contents:

--- File head (first 20 lines) ---
01: 601001  ,10.30,103,00:00:00,21/08/2025, 10.548,150
02: 601001  ,10.30,103,01:00:00,21/08/2025, 10.548,150
03: 601001  ,10.30,103,02:00:00,21/08/2025, 10.549,150
04: 601001  ,10.30,103,03:00:00,21/08/2025, 10.549,150
05: 601001  ,10.30,103,04:00:00,21/08/2025, 10.549,150
06: 601001  ,10.30,103,05:00:00,21/08/2025, 10.549,150
07: 601001  ,10.30,103,06:00:00,21/0

In [5]:
# crawl_all_stage_last6days.py
# Crawl all stations from stage.cfm -> publish pages -> download "Text Data File of Last 6 days of Stage Data"
# Auto-detect ZIP/GZIP/plain, extract to .txt, parse to station CSV, and build a combined master CSV.

import io
import re
import gzip
import time
import zipfile
import pathlib
from typing import List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# ------------------- Config -------------------
BASE_LIST_URL = "https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm"
OUT_DIR = pathlib.Path("downloads_stage_last6days")  # where station .txt and .csv files go
MASTER_CSV = OUT_DIR / "all_stations_last6days.csv"  # combined CSV
HEADERS = {"User-Agent": "Mozilla/5.0 (data-collection; contact: you@example.com)"}
TIMEOUT = 30
REQUEST_DELAY_SEC = 0.8   # polite delay between requests
PARSE_TO_CSV = True       # set False to only download .txt
# ----------------------------------------------


# --------- HTTP helpers ---------
def get_html(session: requests.Session, url: str) -> str:
    r = session.get(url, headers=HEADERS, timeout=TIMEOUT)
    r.raise_for_status()
    return r.text

def download_and_extract_txt(session: requests.Session, url: str, out_txt_path: pathlib.Path) -> None:
    """
    Download and detect ZIP/GZIP/plain. Always write a real .txt to disk.
    """
    with session.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True) as r:
        r.raise_for_status()
        content = b"".join(r.iter_content(8192))

    is_zip = content[:2] == b"PK"
    is_gzip = content[:2] == b"\x1f\x8b"

    out_txt_path.parent.mkdir(parents=True, exist_ok=True)

    if is_zip:
        with zipfile.ZipFile(io.BytesIO(content)) as zf:
            names = zf.namelist()
            # prefer the largest .txt in the archive
            txts = [n for n in names if n.lower().endswith(".txt")]
            chosen = max((txts or names), key=lambda n: zf.getinfo(n).file_size)
            data = zf.read(chosen)
        out_txt_path.write_bytes(data)
    elif is_gzip:
        data = gzip.decompress(content)
        out_txt_path.write_bytes(data)
    else:
        out_txt_path.write_bytes(content)


# --------- List & publish page parsing ---------
def find_publish_links(list_html: str, base_url: str) -> List[str]:
    """
    From station list page, return absolute URLs to publish pages like:
      /waterinformation/wir/reports/publish/601001/601001.htm
    """
    soup = BeautifulSoup(list_html, "html.parser")
    links = []

    # strict first
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if re.search(r"/waterinformation/wir/reports/publish/\d{6}/\d{6}\.htm$", href):
            links.append(urljoin(base_url, href))

    # fallback (if DOM changes)
    if not links:
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if "/waterinformation/wir/reports/publish/" in href and re.search(r"/(\d{6})/", href):
                links.append(urljoin(base_url, href))

    # dedupe preserve order
    seen, out = set(), []
    for u in links:
        if u not in seen:
            seen.add(u)
            out.append(u)
    return out

def extract_site_id_from_publish_url(url: str) -> Optional[str]:
    m = re.search(r"/publish/(\d{6})/\1\.htm$", url)
    if m:
        return m.group(1)
    m2 = re.search(r"/publish/(\d{6})/", url)
    return m2.group(1) if m2 else None

def find_stage_txt_url(publish_html: str, base_url: str) -> Optional[str]:
    """
    On a publish page, find:
      "Text Data File of Last 6 days of Stage Data" (case-insensitive).
    Fallback: the first .txt link that looks like stage/zipstage.
    """
    soup = BeautifulSoup(publish_html, "html.parser")

    # primary text match
    for a in soup.find_all("a", href=True):
        text = (a.get_text(strip=True) or "").lower()
        if ("text data file" in text) and ("last 6 days" in text) and ("stage" in text):
            return urljoin(base_url, a["href"])

    # fallback: any .txt mentioning stage
    for a in soup.find_all("a", href=True):
        href_l = a["href"].lower()
        text_l = (a.get_text(strip=True) or "").lower()
        if href_l.endswith(".txt") and ("stage" in href_l or "stage" in text_l or "zipstage" in href_l):
            return urljoin(base_url, a["href"])

    return None


# --------- TXT -> CSV parsing ---------
def parse_stage_txt_to_df(txt_path: pathlib.Path, override_site_id: Optional[str] = None) -> pd.DataFrame:
    """
    Parse lines like:
        601001  ,10.30,103,00:00:00,21/08/2025, 10.548,150
    into a DataFrame with columns:
        Datetime, Stage, Discharge, SiteID, VarFrom, VarTo
    """
    # read with forgiving separator (comma with optional surrounding spaces)
    df = pd.read_csv(txt_path, header=None, sep=r"\s*,\s*", engine="python", dtype=str, na_filter=False)

    # normalize to 7 columns (most common layout)
    while df.shape[1] < 7:
        df[df.shape[1]] = None
    df = df.iloc[:, :7]
    df.columns = ["SiteID", "VarFrom", "VarTo", "Time", "Date", "Stage", "Discharge"]

    # clean numeric Stage/Discharge (remove trailing P etc.)
    for col in ["Stage", "Discharge"]:
        df[col] = (
            df[col].astype(str)
            .str.replace(r"[^0-9\.\-]", "", regex=True)
            .replace({"": None})
        )
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # build Datetime from Date + Time (support multiple formats)
    def parse_dt(date_str: str, time_str: str) -> pd.Timestamp:
        s = f"{str(date_str).strip()} {str(time_str).strip()}"
        # try mixed formats (dayfirst and explicit patterns)
        for dayfirst in (True, False):
            try:
                return pd.to_datetime(s, dayfirst=dayfirst, errors="raise")
            except Exception:
                pass
        for fmt in ("%d-%b-%y %H:%M:%S", "%d-%b-%Y %H:%M:%S", "%d/%m/%Y %H:%M:%S", "%Y-%m-%d %H:%M:%S"):
            try:
                return pd.to_datetime(s, format=fmt, errors="raise")
            except Exception:
                pass
        return pd.NaT

    df["Datetime"] = [
        parse_dt(d, t) for d, t in zip(df["Date"], df["Time"])
    ]
    df = df.dropna(subset=["Datetime"]).sort_values("Datetime")

    # override SiteID if we know it from URL
    if override_site_id:
        df["SiteID"] = override_site_id

    # final column order
    out = df[["Datetime", "Stage", "Discharge", "SiteID", "VarFrom", "VarTo"]].copy()
    return out

def save_df_to_csv(df: pd.DataFrame, csv_path: pathlib.Path) -> None:
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_path, index=False, encoding="utf-8")


# --------- Orchestrator ---------
def crawl_all() -> None:
    session = requests.Session()

    print(f"[1/5] Fetch station list: {BASE_LIST_URL}")
    list_html = get_html(session, BASE_LIST_URL)
    publish_pages = find_publish_links(list_html, BASE_LIST_URL)
    print(f"    Found {len(publish_pages)} publish pages")

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    master_frames = []
    downloaded, parsed, skipped = 0, 0, 0

    for i, pub_url in enumerate(publish_pages, start=1):
        site_id = extract_site_id_from_publish_url(pub_url) or f"site{i:03d}"
        print(f"[2/5] ({i}/{len(publish_pages)}) Open publish page for site {site_id}")
        try:
            pub_html = get_html(session, pub_url)
        except Exception as e:
            print(f"    !! Failed to open publish page: {e}")
            skipped += 1
            time.sleep(REQUEST_DELAY_SEC)
            continue

        txt_url = find_stage_txt_url(pub_html, pub_url)
        if not txt_url:
            print("    !! No 'Text Data File of Last 6 days of Stage Data' link found")
            skipped += 1
            time.sleep(REQUEST_DELAY_SEC)
            continue

        txt_path = OUT_DIR / f"{site_id}_stage_last6days.txt"
        csv_path = OUT_DIR / f"{site_id}_stage_last6days.csv"

        print(f"[3/5] Download + extract: {txt_url} -> {txt_path.name}")
        try:
            download_and_extract_txt(session, txt_url, txt_path)
            downloaded += 1
        except Exception as e:
            print(f"    !! Download failed: {e}")
            skipped += 1
            time.sleep(REQUEST_DELAY_SEC)
            continue

        if PARSE_TO_CSV:
            print(f"[4/5] Parse to CSV -> {csv_path.name}")
            try:
                df = parse_stage_txt_to_df(txt_path, override_site_id=site_id)
                save_df_to_csv(df, csv_path)
                master_frames.append(df)
                parsed += 1
            except Exception as e:
                print(f"    !! Parse failed: {e}")

        time.sleep(REQUEST_DELAY_SEC)

    if PARSE_TO_CSV and master_frames:
        print(f"[5/5] Writing combined master CSV -> {MASTER_CSV.name}")
        master = pd.concat(master_frames, ignore_index=True)
        save_df_to_csv(master, MASTER_CSV)

    print(f"\nDone. Downloaded: {downloaded}, Parsed: {parsed}, Skipped: {skipped}.")
    print(f"Output folder: {OUT_DIR.resolve()}")
    if PARSE_TO_CSV:
        print(f"Master CSV: {MASTER_CSV.resolve()}")

if __name__ == "__main__":
    crawl_all()

[1/5] Fetch station list: https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm
    Found 244 publish pages
[2/5] (1/244) Open publish page for site 601001
[3/5] Download + extract: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601001/zipstage.zip -> 601001_stage_last6days.txt
[4/5] Parse to CSV -> 601001_stage_last6days.csv
[2/5] (2/244) Open publish page for site 601004
[3/5] Download + extract: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601004/zipstage.zip -> 601004_stage_last6days.txt
[4/5] Parse to CSV -> 601004_stage_last6days.csv
[2/5] (3/244) Open publish page for site 601005
[3/5] Download + extract: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601005/zipstage.zip -> 601005_stage_last6days.txt
[4/5] Parse to CSV -> 601005_stage_last6days.csv
[2/5] (4/244) Open publish page for site 601008
[3/5] Download + extract: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601008/zipstage.z

In [6]:
# Crawl all site IDs from stage.cfm (with pagination), then fetch each Station Summary (sitesum.htm)
# Extract: site, river, station_name, latitude, longitude -> write a single CSV.

import re
import time
import math
import pathlib
from typing import List, Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_LIST_URL = "https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm"
SUMMARY_URL_TMPL = "https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/{site}/sitesum.htm"

OUT_CSV = pathlib.Path("stations_summary.csv")
HEADERS = {"User-Agent": "Mozilla/5.0 (data-collection; contact: you@example.com)"}
TIMEOUT = 30
REQUEST_DELAY_SEC = 0.7  # be polite

# ---------- HTTP helpers ----------
def get_html(session: requests.Session, url: str) -> str:
    r = session.get(url, headers=HEADERS, timeout=TIMEOUT)
    r.raise_for_status()
    return r.text

# ---------- Parsing helpers ----------
SITE_ID_RE = re.compile(r"\b(\d{6})\b")

def find_site_ids_on_list_page(html: str, base_url: str) -> List[str]:
    """Collect all 6-digit site IDs from any <a href> on the list page."""
    soup = BeautifulSoup(html, "html.parser")
    site_ids = set()

    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Look inside link text and URL for 6-digit codes
        for txt in (href, a.get_text(" ", strip=True)):
            m = SITE_ID_RE.search(txt or "")
            if m:
                site_ids.add(m.group(1))

        # Some rows may link to publish pages like .../publish/601001/601001.htm
        m2 = re.search(r"/publish/(\d{6})/", href or "")
        if m2:
            site_ids.add(m2.group(1))

    return sorted(site_ids)

def find_next_page_url(html: str, base_url: str) -> Optional[str]:
    """Try to find a 'Next' link to paginate."""
    soup = BeautifulSoup(html, "html.parser")
    # 1) Explicit 'Next' text
    for a in soup.find_all("a", href=True):
        text = (a.get_text(strip=True) or "").lower()
        if text in ("next", "next »", "»", "next>"):
            return urljoin(base_url, a["href"])
    # 2) rel="next"
    link_next = soup.find("a", rel=lambda x: x and "next" in x.lower())
    if link_next and link_next.get("href"):
        return urljoin(base_url, link_next["href"])
    # 3) Heuristic: any link that advances a startrow/page param
    for a in soup.find_all("a", href=True):
        if re.search(r"(start|page|offset|startrow)=", a["href"], re.I) and ("prev" not in a.get_text("", strip=True).lower()):
            return urljoin(base_url, a["href"])
    return None

def parse_header_title(text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Expect header like: '601001 - Young River - Neds Corner'
    Returns (site, river, station_name)
    """
    if not text:
        return None, None, None
    parts = [p.strip() for p in text.split("-")]
    parts = [p for p in parts if p]  # remove empties
    # Try to match: [site, river, station]
    if len(parts) >= 3 and parts[0].isdigit() and len(parts[0]) == 6:
        site = parts[0]
        river = parts[1]
        station = "-".join(parts[2:]).strip()
        return site, river, station
    # Fallback: find 6-digit site anywhere and take the rest
    m = SITE_ID_RE.search(text)
    site = m.group(1) if m else None
    # Try to split remaining by ' - '
    if " - " in text:
        chunks = [c.strip() for c in text.split(" - ")]
        if len(chunks) >= 3:
            river, station = chunks[1], chunks[2]
            return site, river, station
    return site, None, None

def parse_decimal_after_label(soup: BeautifulSoup, label: str) -> Optional[float]:
    """
    Find decimal immediately after a label, e.g. 'Latitude:' or 'Longitude:'.
    Accepts negative numbers.
    """
    # Search the full text for "Label:  -33.70" or "Label: 121.14"
    text = soup.get_text(" ", strip=True)
    m = re.search(rf"{label}\s*:\s*([\-+]?\d+(?:\.\d+)?)", text, re.I)
    if m:
        try:
            return float(m.group(1))
        except Exception:
            return None
    return None

def parse_dms_to_decimal(dms_str: str) -> Optional[float]:
    """
    Convert DMS like 33°42'21.7"S or 121°08'31.4"E to decimal degrees.
    South/West are negative.
    """
    if not dms_str:
        return None
    m = re.search(r"(\d+)[°:\s]+(\d+)[\'’:\s]+(\d+(?:\.\d+)?)[\"”]?\s*([NSEW])", dms_str, re.I)
    if not m:
        return None
    deg, minu, sec, hemi = m.groups()
    deg, minu, sec = float(deg), float(minu), float(sec)
    dec = deg + minu/60.0 + sec/3600.0
    if hemi.upper() in ("S", "W"):
        dec = -dec
    return dec

def find_first_dms_after_label(text: str, label: str) -> Optional[str]:
    """Extract the first DMS-looking token after a label keyword in the page text."""
    pos = text.lower().find(label.lower())
    if pos == -1:
        return None
    tail = text[pos:pos+200]  # small window after label
    m = re.search(r"(\d{1,3}°\d{1,2}\'\d{1,2}(?:\.\d+)?\"[NSEW])", tail)
    return m.group(1) if m else None

def parse_station_summary(html: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[float], Optional[float]]:
    """
    Parse a Station Summary page (sitesum.htm) and extract:
    site, river, station_name, latitude, longitude
    """
    soup = BeautifulSoup(html, "html.parser")

    # 1) Try to get the title/header line that contains 'site - river - station'
    header_text = None
    # common: <h1>, <h2>, <b>, or first big bold line
    for tag in ["h1", "h2", "h3", "b", "strong", "title"]:
        el = soup.find(tag)
        if el and el.get_text(strip=True):
            header_text = el.get_text(" ", strip=True)
            break
    if not header_text:
        # fallback: first line of body text
        header_text = soup.get_text("\n", strip=True).splitlines()[0] if soup.get_text(strip=True) else ""

    site, river, station_name = parse_header_title(header_text)

    # 2) Latitude/Longitude: prefer decimals; fallback to DMS if needed
    lat = parse_decimal_after_label(soup, "Latitude")
    lon = parse_decimal_after_label(soup, "Longitude")

    if lat is None or lon is None:
        full_text = soup.get_text(" ", strip=True)
        if lat is None:
            dms_lat = find_first_dms_after_label(full_text, "Latitude")
            lat = parse_dms_to_decimal(dms_lat) if dms_lat else None
        if lon is None:
            dms_lon = find_first_dms_after_label(full_text, "Longitude")
            lon = parse_dms_to_decimal(dms_lon) if dms_lon else None

    return site, river, station_name, lat, lon

# ---------- Crawl orchestrator ----------
def collect_all_site_ids(session: requests.Session) -> List[str]:
    """Follow 'Next' pagination to gather site IDs from ALL pages."""
    site_ids = set()
    next_url = BASE_LIST_URL
    page_idx = 1

    while next_url:
        print(f"[List] Fetch page {page_idx}: {next_url}")
        html = get_html(session, next_url)
        ids = find_site_ids_on_list_page(html, next_url)
        site_ids.update(ids)
        nxt = find_next_page_url(html, next_url)
        if nxt and nxt != next_url:
            next_url = nxt
            page_idx += 1
            time.sleep(REQUEST_DELAY_SEC)
        else:
            break

    site_ids = sorted(site_ids)
    print(f"Found {len(site_ids)} unique site IDs.")
    return site_ids

def crawl_all_station_summaries() -> pd.DataFrame:
    session = requests.Session()
    all_rows = []

    # 1) gather all site IDs
    site_ids = collect_all_site_ids(session)

    # 2) visit each Station Summary
    for i, site in enumerate(site_ids, start=1):
        url = SUMMARY_URL_TMPL.format(site=site)
        print(f"[{i}/{len(site_ids)}] Fetch summary: {url}")
        try:
            html = get_html(session, url)
        except Exception as e:
            print(f"   !! failed: {e}")
            time.sleep(REQUEST_DELAY_SEC)
            continue

        site_id, river, station_name, lat, lon = parse_station_summary(html)

        all_rows.append({
            "site": site_id or site,
            "river": river,
            "station_name": station_name,
            "latitude": lat,
            "longitude": lon,
        })

        time.sleep(REQUEST_DELAY_SEC)

    # 3) build DataFrame and save
    df = pd.DataFrame(all_rows, columns=["site", "river", "station_name", "latitude", "longitude"])
    df.to_csv(OUT_CSV, index=False, encoding="utf-8")
    print(f"\nSaved {len(df)} rows to: {OUT_CSV.resolve()}")
    # quick sanity counts
    missing_lat = df["latitude"].isna().sum()
    missing_lon = df["longitude"].isna().sum()
    print(f"Missing latitude: {missing_lat}, Missing longitude: {missing_lon}")
    return df

if __name__ == "__main__":
    crawl_all_station_summaries()

[List] Fetch page 1: https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm
Found 244 unique site IDs.
[1/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601001/sitesum.htm
[2/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601004/sitesum.htm
[3/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601005/sitesum.htm
[4/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601008/sitesum.htm
[5/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601009/sitesum.htm
[6/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/602001/sitesum.htm
[7/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/602004/sitesum.htm
[8/244] Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/602009/sitesum.htm
[9/244] 

KeyboardInterrupt: 

In [None]:
import re
import time
import pathlib
from typing import List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_LIST_URL = "https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm"
SUMMARY_URL_TMPL = "https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/{site}/sitesum.htm"

OUT_CSV = pathlib.Path("stations_summary_ALL.csv")
HEADERS = {"User-Agent": "Mozilla/5.0 (data-collection; contact: you@example.com)"}
TIMEOUT = 30
REQUEST_DELAY_SEC = 0.7   # be polite to the server

SITE_ID_RE = re.compile(r"\b(\d{6})\b")

def get_html(session: requests.Session, url: str) -> str:
    r = session.get(url, headers=HEADERS, timeout=TIMEOUT)
    r.raise_for_status()
    return r.text

def find_site_ids_on_list_page(html: str, base_url: str) -> List[str]:
    """Collect 6-digit site IDs from the list page (link text and hrefs)."""
    soup = BeautifulSoup(html, "html.parser")
    site_ids = set()
    for a in soup.find_all("a", href=True):
        href = a["href"] or ""
        txt = a.get_text(" ", strip=True) or ""
        for s in (href, txt):
            m = SITE_ID_RE.search(s)
            if m:
                site_ids.add(m.group(1))
        m2 = re.search(r"/publish/(\d{6})/", href)
        if m2:
            site_ids.add(m2.group(1))
    return sorted(site_ids)

def find_next_page_url(html: str, base_url: str) -> Optional[str]:
    """Find 'Next' pagination link (common patterns)."""
    soup = BeautifulSoup(html, "html.parser")
    # explicit 'Next'
    for a in soup.find_all("a", href=True):
        text = (a.get_text(strip=True) or "").lower()
        if text in ("next", "next »", "»", "next>"):
            return urljoin(base_url, a["href"])
    # rel="next"
    link_next = soup.find("a", rel=lambda x: x and "next" in x.lower())
    if link_next and link_next.get("href"):
        return urljoin(base_url, link_next["href"])
    # heuristic: any link with start/page/offset params
    for a in soup.find_all("a", href=True):
        if re.search(r"(start|page|offset|startrow)=", a["href"], re.I) and ("prev" not in (a.get_text("", strip=True) or "").lower()):
            return urljoin(base_url, a["href"])
    return None

HEADER_LINE_RE = re.compile(r"""^\s*(\d{6})\s*-\s*(.+?)\s*-\s*(.+?)\s*$""", re.MULTILINE)
DMS_RE = re.compile(r"(\d+)[°:\s]+(\d+)[\'’:\s]+(\d+(?:\.\d+)?)[\"”]?\s*([NSEW])", re.I)

def extract_header_triplet_from_text(raw_text: str):
    """
    Find a line like:
        601001 - Young River - Neds Corner
    Return (site, river, station_name)
    """
    m = HEADER_LINE_RE.search(raw_text)
    if m:
        site = m.group(1).strip()
        river = m.group(2).strip()
        station = m.group(3).strip()
        return site, river, station
    return None, None, None

def parse_decimal_after_label(raw_text: str, label: str) -> Optional[float]:
    m = re.search(rf"{label}\s*:\s*([\-+]?\d+(?:\.\d+)?)", raw_text, re.I)
    if m:
        try:
            return float(m.group(1))
        except Exception:
            return None
    return None

def dms_to_decimal(dms: str) -> Optional[float]:
    mm = DMS_RE.search(dms or "")
    if not mm:
        return None
    deg, minu, sec, hemi = mm.groups()
    dec = float(deg) + float(minu)/60.0 + float(sec)/3600.0
    if hemi.upper() in ("S", "W"):
        dec = -dec
    return dec

def parse_station_summary(html: str):
    """
    Returns (site, river, station_name, latitude, longitude)
    Strategy:
      - Prefer text from <pre> (fixed-width report), else use full-body text.
      - Header line via regex.
      - Latitude/Longitude: prefer decimals after labels; fallback to DMS near labels.
    """
    soup = BeautifulSoup(html, "html.parser")
    pre = soup.find("pre")
    raw_text = pre.get_text("\n", strip=True) if pre else soup.get_text("\n", strip=True)

    # Header triplet
    site, river, station_name = extract_header_triplet_from_text(raw_text)

    # Coordinates
    lat = parse_decimal_after_label(raw_text, "Latitude")
    lon = parse_decimal_after_label(raw_text, "Longitude")

    if lat is None or lon is None:
        # Look for DMS tokens near labels (small window after the label)
        for label in ("Latitude", "Longitude"):
            pos = raw_text.lower().find(label.lower())
            if pos != -1:
                window = raw_text[pos:pos+200]
                dec = dms_to_decimal(window)
                if label == "Latitude" and lat is None:
                    lat = dec
                if label == "Longitude" and lon is None:
                    lon = dec

    return site, river, station_name, lat, lon

def collect_all_site_ids(session: requests.Session) -> List[str]:
    """Follow pagination and collect ALL unique site IDs."""
    site_ids, seen = [], set()
    next_url, page = BASE_LIST_URL, 1
    while next_url:
        print(f"[List] Page {page}: {next_url}")
        html = get_html(session, next_url)
        ids = find_site_ids_on_list_page(html, next_url)
        added = 0
        for sid in ids:
            if sid not in seen:
                seen.add(sid)
                site_ids.append(sid)
                added += 1
        print(f"  +{added} new (total {len(site_ids)})")
        nxt = find_next_page_url(html, next_url)
        if nxt and nxt != next_url:
            next_url = nxt
            page += 1
            time.sleep(REQUEST_DELAY_SEC)
        else:
            break
    return site_ids

def main():
    session = requests.Session()

    print("[1/3] Collecting ALL site IDs …")
    site_ids = collect_all_site_ids(session)
    print(f"Found {len(site_ids)} unique sites.")

    rows = []
    for i, site in enumerate(site_ids, start=1):
        url = SUMMARY_URL_TMPL.format(site=site)
        print(f"[2/3] ({i}/{len(site_ids)}) Fetch summary: {url}")
        try:
            html = get_html(session, url)
        except Exception as e:
            print(f"    !! failed: {e}")
            time.sleep(REQUEST_DELAY_SEC)
            continue

        site_id, river, station_name, lat, lon = parse_station_summary(html)
        rows.append({
            "site": site_id or site,
            "river": river,
            "station_name": station_name,
            "latitude": lat,
            "longitude": lon,
        })

        time.sleep(REQUEST_DELAY_SEC)

    df = pd.DataFrame(rows, columns=["site", "river", "station_name", "latitude", "longitude"])

    # Optional: drop rows with all-None (if any pages failed to parse)
    all_info_missing = df[["river", "station_name", "latitude", "longitude"]].isna().all(axis=1)
    if all_info_missing.any():
        print(f"Note: dropping {all_info_missing.sum()} rows with no parsed info at all.")
        df = df[~all_info_missing].copy()

    df.to_csv(OUT_CSV, index=False, encoding="utf-8")
    print(f"\n[3/3] Saved {len(df)} rows to: {OUT_CSV.resolve()}")
    print("Missing latitude:", df['latitude'].isna().sum(), "Missing longitude:", df['longitude'].isna().sum())

if __name__ == "__main__":
    main()

[1/3] Collecting ALL site IDs …
[List] Page 1: https://kumina.water.wa.gov.au/waterinformation/telem/stage.cfm
  +244 new (total 244)
Found 244 unique sites.
[2/3] (1/244) Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601001/sitesum.htm
[2/3] (2/244) Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601004/sitesum.htm
[2/3] (3/244) Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601005/sitesum.htm
[2/3] (4/244) Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601008/sitesum.htm
[2/3] (5/244) Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/601009/sitesum.htm
[2/3] (6/244) Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/602001/sitesum.htm
[2/3] (7/244) Fetch summary: https://kumina.water.wa.gov.au/waterinformation/wir/reports/publish/602004/sitesum.htm
[2/3] (8/244) Fetch summary: h