
# Phase 2 – Build Final Dataset (SBP-only)



**Outputs**
- `../data/processed/heart_attack_china_clean.csv`
- `../data/processed/heart_attack_china_final.csv`


In [2]:

# These relative paths need no changes when the notebook lives in the 'notebooks/' folder.
RAW_PATH = "../data/raw/heart_attack_china.csv"            # input: raw export
INTERIM_PATH = "../data/processed/heart_attack_china_clean.csv"  # output: cleaned frame
FINAL_PATH = "../data/processed/heart_attack_china_final.csv"    # output: final features
# Echo the configuration so the run log records which files were used.
print(" ".join(["Using paths:", RAW_PATH, INTERIM_PATH, FINAL_PATH]))


Using paths: ../data/raw/heart_attack_china.csv ../data/processed/heart_attack_china_clean.csv ../data/processed/heart_attack_china_final.csv


In [3]:

import pandas as pd
from pathlib import Path

def read_raw(path: str) -> pd.DataFrame:
    """
    Load the raw CSV located at `path` into a DataFrame.

    The file is decoded as UTF-8. `low_memory=False` asks pandas not to
    parse the file in chunks, which avoids mixed-type column inference.
    """
    read_options = {"encoding": "utf-8", "low_memory": False}
    return pd.read_csv(path, **read_options)

def clean_sbp(df: pd.DataFrame) -> pd.DataFrame:
    """
    Beginner-cleaning step for SBP.

    - Rename Blood_Pressure -> SBP, unless an 'SBP' column already exists
      (in that case the existing 'SBP' column is used and 'Blood_Pressure'
      is left untouched, so no duplicate 'SBP' column is ever created).
    - Coerce SBP to numeric (invalid parses -> NaN).
    - Add SBP_missing flag (True if NaN).

    Parameters
    ----------
    df : raw DataFrame containing 'SBP' or 'Blood_Pressure'. Not modified.

    Returns
    -------
    A new DataFrame with a numeric 'SBP' column and a boolean 'SBP_missing'
    column added.

    Raises
    ------
    ValueError
        If neither 'Blood_Pressure' nor 'SBP' is present.
    """
    if "SBP" in df.columns:
        # Already named SBP: work on a copy so the caller's frame stays intact.
        cleaned = df.copy()
    elif "Blood_Pressure" in df.columns:
        # rename() returns a new frame, so the caller's frame stays intact.
        cleaned = df.rename(columns={"Blood_Pressure": "SBP"})
    else:
        raise ValueError("Column 'Blood_Pressure' or 'SBP' not found in the raw dataset.")

    # Coerce to numeric and build missing flag
    cleaned["SBP"] = pd.to_numeric(cleaned["SBP"], errors="coerce")
    cleaned["SBP_missing"] = cleaned["SBP"].isna()

    return cleaned

def build_final(df: pd.DataFrame, hypertension_sbp: float = 140) -> pd.DataFrame:
    """
    Build minimal final dataset features.

    - SBP_hypertensive: 1 if SBP >= hypertension_sbp, 0 if below, and <NA>
      when SBP itself is missing (nullable Int64), so an unknown reading is
      never reported as "not hypertensive".
    - Keep only a small, clear set of columns.

    Parameters
    ----------
    df : cleaned DataFrame with 'SBP' (numeric) and 'SBP_missing' columns.
        Not modified; the new column is built on a copy.
    hypertension_sbp : threshold at or above which SBP is flagged
        (default 140).

    Returns
    -------
    A new DataFrame with exactly the columns
    'SBP', 'SBP_missing', 'SBP_hypertensive'.

    Raises
    ------
    KeyError
        If 'SBP' or 'SBP_missing' is absent from `df`.
    """
    sbp = df["SBP"]

    # Copy the kept columns first so the caller's frame is never mutated.
    final_df = df[["SBP", "SBP_missing"]].copy()

    # NaN >= threshold evaluates to False, so blank those rows out explicitly.
    flag = (sbp >= hypertension_sbp).astype("Int64")
    final_df["SBP_hypertensive"] = flag.mask(sbp.isna(), pd.NA)

    return final_df

def write_csv(df: pd.DataFrame, path: str) -> None:
    """
    Save `df` to `path` as a UTF-8 encoded CSV without the index column.

    Any missing parent directories of `path` are created first.
    """
    destination = Path(path)
    parent_dir = destination.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(destination, encoding="utf-8", index=False)


In [4]:

# Stage 1: load the raw CSV from RAW_PATH.
print(" Reading raw CSV ...")
raw_df = read_raw(RAW_PATH)

# Stage 2: clean SBP and persist the cleaned frame to INTERIM_PATH
# before any features are derived from it.
print(" Cleaning SBP ...")
clean_df = clean_sbp(raw_df)
write_csv(clean_df, INTERIM_PATH)
print(f" Wrote interim CSV: {INTERIM_PATH}")

# Stage 3: derive the final feature set and persist it to FINAL_PATH.
print(" Building final features ...")
final_df = build_final(clean_df)
write_csv(final_df, FINAL_PATH)
print(f" Wrote final CSV: {FINAL_PATH}")

# Last expression of the cell: show the first rows as the cell output.
print(" Done. Final dataset ready.")
final_df.head()


 Reading raw CSV ...
 Cleaning SBP ...
 Wrote interim CSV: ../data/processed/heart_attack_china_clean.csv
 Building final features ...
 Wrote final CSV: ../data/processed/heart_attack_china_final.csv
 Done. Final dataset ready.


Unnamed: 0,SBP,SBP_missing,SBP_hypertensive
0,104,False,0
1,142,False,1
2,176,False,1
3,178,False,1
4,146,False,1
