In [13]:
"""
clean_workforce.py  ────────────────────────────────────────────
Tidy the combined CIHI workforce sheet (80-column wide layout)
into a long table:
    province | year | profession | count | per_100k

Usage:
    python clean_workforce.py
"""

# ── 0. imports & config ───────────────────────────────────────
import re
from pathlib import Path
import pandas as pd

FILE    = Path("C://Users//shoaib//Documents//CIHI_HealthWorkforce_Prov_2019_2023-1.xlsx")
OUT_CSV = Path("clean/cihi_workforce_long.csv")
OUT_CSV.parent.mkdir(exist_ok=True)

SUPPRESSION = {"–", "-", "—", "<5", "<6", "<10", "Suppressed", "NR"}
PROV_MAP = {
    "newfoundland and labrador":"NL","nl":"NL",
    "prince edward island":"PE","pei":"PE","p.e.i.":"PE",
    "nova scotia":"NS","ns":"NS",
    "new brunswick":"NB","nb":"NB",
    "quebec":"QC","qc":"QC",
    "ontario":"ON","on":"ON",
    "manitoba":"MB","mb":"MB",
    "saskatchewan":"SK","sk":"SK",
    "alberta":"AB","ab":"AB",
    "british columbia":"BC","bc":"BC",
    "yukon":"YT","yt":"YT",
    "northwest territories":"NT","nt":"NT",
    "nunavut":"NU","nu":"NU",
}

# ── 1. read the single sheet ─────────────────────────────────
df = pd.read_excel(FILE, sheet_name=0, header=0)

# ── 2. forward-fill the profession name (rows below each group are NaN)
df["Type of professional"].ffill(inplace=True)

# ── 3. replace suppression tokens with NA
df.replace(list(SUPPRESSION), pd.NA, inplace=True)

# ── 4. melt wide → long  (province + metric in one string)
id_cols = ["Type of professional", "Year"]
long = (df
        .melt(id_vars=id_cols,
              var_name="prov_metric",
              value_name="value")
        .dropna(subset=["value"]))

# ── 5. split "Alberta: Count" → province / metric
long[["province_raw", "metric"]] = long["prov_metric"].str.split(":", n=1, expand=True)
long["metric"]   = long["metric"].str.strip().str.lower()          # "count", "per 100,000 population", etc.
long["province"] = (long["province_raw"].str.lower().str.strip()
                                      .map(PROV_MAP)
                                      .fillna(long["province_raw"].str.strip()))
# keep only rows that have recognised province codes
long = long[long["province"].isin(PROV_MAP.values())]

# ── 6. pivot so each metric is its own column  ----------------
tidy = (long
        .pivot_table(index=["province", "Year", "Type of professional"],
                     columns="metric",
                     values="value",
                     aggfunc="first")
        .reset_index())

# ── 7. rename & cast types   ----------------------------------
tidy.rename(columns={
    "Type of professional":"profession",
    "Year":"year",
    "count":"count",
    "per 100,000 population":"per_100k",
    # (add more renames if you decide to keep fte, female %, etc.)
}, inplace=True)

tidy["year"]  = tidy["year"].astype("int16")
tidy["count"] = pd.to_numeric(tidy["count"], errors="coerce")
tidy["per_100k"] = pd.to_numeric(tidy.get("per_100k"), errors="coerce")

# ── 8. export   ------------------------------------------------
tidy.to_csv(OUT_CSV, index=False)
print(f"✅  Clean workforce table → {OUT_CSV}   rows={len(tidy):,}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Type of professional"].ffill(inplace=True)


✅  Clean workforce table → clean\cihi_workforce_long.csv   rows=1,895
