
# Phase 2 — All‑in‑Notebook Build (No `.py` required)

This notebook **cleans and prepares** the Heart Attack (China) dataset and writes:
- `../data/processed/heart_attack_china_analysis_ready.csv`
- `../data/processed/heart_attack_china_model_ready.csv`

It also **optionally** reads a WHO file (if present) and adds a simple context column.

> Put this notebook in your repo at: `notebooks/04_phase2_all_in_notebook.ipynb`


## 1) Configure paths

In [7]:

RAW_PATH = "../data/raw/heart_attack_china.csv"
WHO_PATH = "../data/raw/who_health_china.csv" 
OUTDIR   = "../data/processed"

print("RAW:", RAW_PATH)
print("WHO:", WHO_PATH)
print("OUTDIR:", OUTDIR)


RAW: ../data/raw/heart_attack_china.csv
WHO: ../data/raw/who_health_china.csv
OUTDIR: ../data/processed


## 2) Imports + helper functions

In [2]:

import pandas as pd
from pathlib import Path

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns.str.strip()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^\w_]", "", regex=True)
    )
    return df

def strip_object_whitespace(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in df.select_dtypes(include="object").columns:
        df[c] = df[c].astype(str).str.strip()
    return df

def map_yes_no(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    yes_no_map = {"yes": 1, "no": 0, "y": 1, "n": 0, "true": 1, "false": 0}
    for c in df.select_dtypes(include="object").columns:
        ser = df[c].astype(str).str.lower()
        mask = ser.isin(yes_no_map.keys())
        if mask.any():
            df.loc[mask, c] = ser[mask].map(yes_no_map).astype("Int64")
    return df

def derive_flags(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # SBP from Blood_Pressure if present
    if "Blood_Pressure" in df.columns:
        df = df.rename(columns={"Blood_Pressure": "SBP"})
    if "SBP" in df.columns:
        df["SBP"] = pd.to_numeric(df["SBP"], errors="coerce")
        df["SBP_missing"] = df["SBP"].isna()
        df["SBP_hypertensive"] = (df["SBP"] >= 140).astype("Int64")

    # Gender_simple: M/F
    if "Gender" in df.columns:
        g = df["Gender"].astype(str).str.upper().str[0]
        g = g.replace({"F": "F", "M": "M"})
        df["Gender_simple"] = g.where(g.isin(["F", "M"]), pd.NA)

    # Smoker_flag from Smoking_Status
    if "Smoking_Status" in df.columns:
        s = df["Smoking_Status"].astype(str).str.lower()
        df["Smoker_flag"] = s.isin(["current", "former", "smoker", "heavy"]).astype("Int64")

    # Age_band bins
    if "Age" in df.columns:
        df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
        bins = [0, 30, 40, 50, 60, 70, 200]
        labels = ["<30", "30-39", "40-49", "50-59", "60-69", "70+"]
        df["Age_band"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

    # RiskFactor_count (sums these if present and Yes/No-ish)
    candidates = ["Hypertension","Diabetes","Obesity","Chronic_Kidney_Disease",
                  "Family_History_CVD","Previous_Heart_Attack"]
    present = [c for c in candidates if c in df.columns]
    for c in present:
        df[c] = pd.to_numeric(df[c], errors="ignore")
        if df[c].dtype == object:
            lc = df[c].astype(str).str.lower()
            df[c] = lc.map({"yes":1,"no":0}).astype("Int64")
    if present:
        df["RiskFactor_count"] = (
            df[present].apply(pd.to_numeric, errors="coerce").fillna(0).sum(axis=1).astype("Int64")
        )
    return df

def build_analysis_and_model_ready(df: pd.DataFrame):
    analysis_ready = df.copy()
    keep_cols = [c for c in [
        "Patient_ID","Age","Gender_simple","SBP","SBP_missing","SBP_hypertensive",
        "Smoker_flag","Cholesterol_Level","CVD_Risk_Score","Hypertension","Diabetes",
        "Obesity","RiskFactor_count","Heart_Attack"
    ] if c in df.columns]
    model_ready = df[keep_cols].copy()

    if "Heart_Attack" in model_ready.columns and model_ready["Heart_Attack"].dtype == object:
        model_ready["Heart_Attack"] = (
            model_ready["Heart_Attack"].astype(str).str.lower().map({"yes":1,"no":0}).astype("Int64")
        )
    return analysis_ready, model_ready


## 3) Load raw → clean → engineer features

In [3]:

# Load
raw = pd.read_csv(RAW_PATH, low_memory=False)
print("Raw shape:", raw.shape)

# Clean & engineer
df = strip_object_whitespace(raw)
df = normalize_columns(df)
df = map_yes_no(df)
df = derive_flags(df)

analysis_ready, model_ready = build_analysis_and_model_ready(df)
analysis_ready.shape, model_ready.shape

Raw shape: (239266, 28)


  df[c] = pd.to_numeric(df[c], errors="ignore")


((239266, 34), (239266, 14))

## 4) Optional WHO context (if file exists)

In [4]:

try:
    who = pd.read_csv(WHO_PATH, low_memory=False)
    who_china = who.query("indicator=='Mean Blood Pressure' and country=='China'")
    who_overall_mean = who_china["value"].mean()
    if "SBP" in analysis_ready.columns:
        analysis_ready["WHO_overall_mean_BP"] = who_overall_mean
    print("WHO overall mean BP (China):", round(who_overall_mean, 3))
except FileNotFoundError:
    print("WHO file not found; skipping (that's okay).")


WHO overall mean BP (China): 24.323


## 5) Save outputs

In [5]:

outdir = Path(OUTDIR)
outdir.mkdir(parents=True, exist_ok=True)

analysis_path = outdir / "heart_attack_china_analysis_ready.csv"
model_path    = outdir / "heart_attack_china_model_ready.csv"

analysis_ready.to_csv(analysis_path, index=False, encoding="utf-8")
model_ready.to_csv(model_path, index=False, encoding="utf-8")

print("Wrote:", analysis_path)
print("Wrote:", model_path)


Wrote: ..\data\processed\heart_attack_china_analysis_ready.csv
Wrote: ..\data\processed\heart_attack_china_model_ready.csv


## 6) Quick preview

In [6]:

display(analysis_ready.head())
display(model_ready.head())


Unnamed: 0,Patient_ID,Age,Gender,Smoking_Status,Hypertension,Diabetes,Obesity,Cholesterol_Level,Air_Pollution_Exposure,Physical_Activity,...,Previous_Heart_Attack,CVD_Risk_Score,Heart_Attack,SBP_missing,SBP_hypertensive,Gender_simple,Smoker_flag,Age_band,RiskFactor_count,WHO_overall_mean_BP
0,1,55,Male,Non-Smoker,0,0,1,Normal,High,High,...,0,78,0,False,0,M,0,50-59,2,24.322929
1,2,66,Female,Smoker,1,0,0,Low,Medium,High,...,0,49,0,False,1,F,1,60-69,2,24.322929
2,3,69,Female,Smoker,0,0,0,Low,Medium,High,...,0,31,0,False,1,F,1,60-69,0,24.322929
3,4,45,Female,Smoker,0,1,0,Normal,Medium,Low,...,1,23,0,False,1,F,1,40-49,2,24.322929
4,5,39,Female,Smoker,0,0,0,Normal,Medium,Medium,...,0,79,0,False,1,F,1,30-39,1,24.322929


Unnamed: 0,Patient_ID,Age,Gender_simple,SBP,SBP_missing,SBP_hypertensive,Smoker_flag,Cholesterol_Level,CVD_Risk_Score,Hypertension,Diabetes,Obesity,RiskFactor_count,Heart_Attack
0,1,55,M,104,False,0,0,Normal,78,0,0,1,2,
1,2,66,F,142,False,1,1,Low,49,1,0,0,2,
2,3,69,F,176,False,1,1,Low,31,0,0,0,0,
3,4,45,F,178,False,1,1,Normal,23,0,1,0,2,
4,5,39,F,146,False,1,1,Normal,79,0,0,0,1,
