In [None]:
import re
from pathlib import Path
import pandas as pd

# ── 1. configuration ───────────────────────────────────────────
# Workbook to clean (absolute path on the author's machine).
FILE = Path("C://Users//shoaib//Desktop//Mental_health_ed_project//PHO_Substance_ED_Hosp_Deaths_PHU_2014_2024.xlsx")

# Folder that receives the cleaned exports; created at import time if absent.
OUT_DIR = Path("clean")
OUT_DIR.mkdir(exist_ok=True)

# Cell values that stand for suppressed / unavailable data.
SUPPRESSION = {
    "–", "-", "—",
    "<5", "<6", "<10",
    "Suppressed", "NR",
    "N/A", "NA", "n/a", "na",
}

# Snake-cased column names that are cast to numeric when present.
NUMERIC_HINT = [
    "number_of_cases",
    "population",
    "cases_per_100_000_population_annualized",
]

# Only rows whose time interval equals this value are kept;
# switch to "Quarterly" or "Yearly" if you prefer.
KEEP_INTERVAL = "Monthly"

# ── 2. helper functions ────────────────────────────────────────
def snake(text: str) -> str:
    """Convert an arbitrary label to ``snake_case``.

    The label is stripped and lower-cased, every run of characters outside
    ``0-9`` / ``a-z`` is collapsed into a single underscore, and leading or
    trailing underscores are removed.

    Args:
        text: Label to normalise. Non-string values (for example an integer
            used as a spreadsheet column header) are converted with ``str()``
            first rather than raising ``AttributeError``.

    Returns:
        The normalised identifier; empty when the label contains no ASCII
        letters or digits.
    """
    # str() makes the helper safe for the non-string labels pandas can produce.
    return re.sub(r"[^0-9a-z]+", "_", str(text).strip().lower()).strip("_")


def locate_header(raw_df: pd.DataFrame) -> int:
    """Return the position of the first row mentioning "public health unit".

    Every cell is compared as text, case-insensitively. The result is a
    zero-based row *position* (not an index label), so it can be passed
    directly as ``header=`` to ``pd.read_excel``.

    Args:
        raw_df: Worksheet contents read without a header row.

    Returns:
        Position of the first matching row, or ``0`` when the frame is empty
        or no row matches (i.e. the first row is assumed to be the header).
    """
    if raw_df.empty:
        return 0

    # Scan column-wise: one vectorised string search per column instead of
    # one Python-level call per row.
    hits = (
        raw_df.astype(str)
        .apply(
            lambda col: col.str.contains(
                "public health unit", case=False, na=False
            )
        )
        .any(axis=1)
        .to_numpy(dtype=bool)
    )

    # argmax() on an all-False array is also 0, but spell the fallback out so
    # "not found" is an explicit decision rather than an accident.
    if not hits.any():
        return 0
    return int(hits.argmax())


def clean_sheet(path: Path, sheet: str) -> pd.DataFrame:
    """Read one worksheet, tidy it, filter it, and return the result.

    Steps, in order:

    1. Read the sheet without a header to find the real header row.
    2. Re-read the sheet using that row as the header.
    3. Drop rows and columns that are entirely empty.
    4. Strip footnote symbols from the column labels and snake-case them.
    5. Replace cells equal to a ``SUPPRESSION`` token with ``pd.NA``.
    6. Drop rows whose ``indicator`` is missing (banner / meta rows).
    7. Cast the ``NUMERIC_HINT`` columns to numeric and the two period
       columns to datetime; unparseable values become missing.
    8. Keep only rows whose ``time_interval`` equals ``KEEP_INTERVAL``.
    9. Print a warning if duplicate keys remain.

    Args:
        path: Location of the Excel workbook.
        sheet: Name of the worksheet to clean.

    Returns:
        The cleaned DataFrame; an empty DataFrame when the worksheet has no
        content.
    """
    # Open the workbook once and share the handle between both reads instead
    # of loading the file from disk twice.
    with pd.ExcelFile(path) as book:
        # 2.1 raw read (no header so nothing is coerced)
        raw = pd.read_excel(book, sheet_name=sheet, header=None, dtype=str)
        if raw.empty:
            # Nothing to locate a header in; avoid failing on a blank sheet.
            return pd.DataFrame()

        # 2.2 find real header row, re-read with proper header
        hdr_row = int(locate_header(raw))
        df = pd.read_excel(book, sheet_name=sheet, header=hdr_row)

    # 2.3 drop empty rows / cols
    df = df.dropna(how="all", axis=0).dropna(how="all", axis=1)

    # 2.4 strip footnote symbols, then snake-case the column names.
    # The symbols must go *before* snake(): snake() turns every
    # non-alphanumeric character into "_", so stripping afterwards never
    # matches anything. str() guards against non-string labels.
    df.columns = [
        snake(str(c).replace("†", "").replace("‡", "")) for c in df.columns
    ]

    # 2.5 suppression tokens (exact cell matches only)
    df = df.replace(list(SUPPRESSION), pd.NA)

    # 2.6 remove banner / meta rows (indicator missing)
    if "indicator" in df.columns:
        before = len(df)
        df = df[df["indicator"].notna()].copy()
        dropped = before - len(df)
        if dropped:
            print(f"🔹  {sheet}: dropped {dropped} banner/meta rows")

    # 2.7 numeric casting
    for col in NUMERIC_HINT:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # 2.8 date parsing
    for col in ("start_date_of_time_period", "end_date_of_time_period"):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")

    # 2.9 keep a single aggregation level to avoid duplicate keys
    if "time_interval" in df.columns:
        df = df[df["time_interval"].eq(KEEP_INTERVAL)].copy()

    # 2.10 duplicate warning after all filters
    key_cols = [
        c
        for c in (
            "public_health_unit",
            "indicator",
            "start_date_of_time_period",
        )
        if c in df.columns
    ]
    if key_cols:
        # Compute the duplicate mask once and reuse the count.
        dup_cnt = int(df.duplicated(key_cols).sum())
        if dup_cnt:
            print(f"{sheet}: {dup_cnt} duplicate rows remain on {key_cols}")

    return df

def main() -> None:
    """Clean every worksheet of ``FILE`` and export the results to ``OUT_DIR``.

    Each sheet is passed through ``clean_sheet`` and written as CSV. A
    Parquet copy is written as well when ``pyarrow`` or ``fastparquet`` can
    be imported; otherwise Parquet export is skipped with a message.

    Raises:
        FileNotFoundError: If the workbook at ``FILE`` does not exist.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not FILE.exists():
        raise FileNotFoundError(f"Workbook not found: {FILE.resolve()}")

    # Only the sheet names are needed here; close the workbook handle again
    # right away rather than leaving it open for the whole run.
    with pd.ExcelFile(FILE) as xls:
        sheet_names = list(xls.sheet_names)

    print(" Sheets detected ➜", ", ".join(sheet_names))
    print("Keeping only", KEEP_INTERVAL, "rows from each sheet\n")

    # Parquet needs an optional engine; probe for either supported one.
    parquet_ok = True
    try:
        import pyarrow  # noqa: F401
    except ImportError:
        try:
            import fastparquet  # noqa: F401
        except ImportError:
            parquet_ok = False
            print("pyarrow / fastparquet not installed – Parquet export skipped\n")

    for sheet in sheet_names:
        df = clean_sheet(FILE, sheet)

        stem = f"{FILE.stem}_{snake(sheet)}_clean"
        csv_path = OUT_DIR / f"{stem}.csv"
        df.to_csv(csv_path, index=False)

        if parquet_ok:
            df.to_parquet(OUT_DIR / f"{stem}.parquet", index=False)

        print(f"{sheet:25s} ➜ {csv_path.name}   rows={len(df):,}")

    print("\n All worksheets processed. Clean files live in:", OUT_DIR.resolve())


# ── 4. run when executed as a script ───────────────────────────
# The guard keeps main() from running when this module is imported.
if __name__ == "__main__":
    main()


 Sheets detected ➜ Export
Keeping only Monthly rows from each sheet

