In [24]:
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Project folders (relative paths, resolved against the current working directory) ---
RAW_DIR = "../raw/"
PROCESSED_DIR = "../processed/"

# --- Input and output files for the blood-pressure table ---
BP_RAW_FILE = os.path.join(RAW_DIR, "BloodPressureData.csv")
BP_PEOPLE_OUT = os.path.join(PROCESSED_DIR, "bp_people_features.csv")

# Make sure the output folder is there; exist_ok makes re-running this cell harmless.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Visibility check so a missing input is noticed before the load cell runs.
print("Raw file exists:", os.path.exists(BP_RAW_FILE))


Raw file exists: True


In [25]:
# Load the raw CSV into `raw` for inspection.
raw = pd.read_csv(BP_RAW_FILE)

# Column names as a plain list first, then a preview of the first rows.
print(raw.columns.to_list())
raw.head()


['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps', 'Sleep Disorder']


Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [26]:
def _parse_bp_reading(value):
    """Return ``(systolic, diastolic)`` parsed from a free-text BP reading.

    The first two runs of 2-3 digits are taken as systolic and diastolic, so
    '120/80', '120 - 80' and '120 / 80 mmHg' all parse. Missing values and
    text with fewer than two such runs yield ``(nan, nan)``.
    """
    if pd.isna(value):
        return (np.nan, np.nan)
    numbers = re.findall(r"\d{2,3}", str(value))
    if len(numbers) < 2:
        return (np.nan, np.nan)
    return (int(numbers[0]), int(numbers[1]))


def _bp_category(sbp, dbp):
    """Label a single reading, testing the most severe band first.

    The hypertension bands are joined by "or" (either number alone can put a
    reading in a band), so they must be tested from the top down; otherwise a
    reading such as 150/85 is caught by the stage-1 diastolic range before the
    stage-2 systolic range is ever looked at.

    Returns NaN when either number is missing.
    """
    if pd.isna(sbp) or pd.isna(dbp):
        return np.nan
    if sbp >= 180 or dbp >= 120:
        return "crisis"
    if sbp >= 140 or dbp >= 90:
        return "htn_stage2"
    if sbp >= 130 or dbp >= 80:
        return "htn_stage1"
    if sbp >= 120:
        # Diastolic is necessarily < 80 here, otherwise stage 1 would have matched.
        return "elevated"
    return "normal"


def clean_bp_people_table(df: pd.DataFrame) -> pd.DataFrame:
    """Build the per-person blood-pressure feature table.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table with a text 'Blood Pressure' column (e.g. '120/80').
        Header whitespace is normalised before lookup. The caller's frame is
        not modified; all work happens on the renamed copy.

    Returns
    -------
    pd.DataFrame
        Whichever of the context columns are present ('Person ID', 'Gender',
        'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep',
        'Physical Activity Level', 'Stress Level', 'BMI Category',
        'Daily Steps', 'Sleep Disorder'), followed by:

        * 'SBP', 'DBP' - parsed readings, clipped to 70-260 and 40-150
        * 'HR' - only when a heart-rate column exists; numeric, clipped to 30-220
        * 'PULSE_PRESSURE' - SBP - DBP
        * 'MAP' - DBP + PULSE_PRESSURE / 3
        * 'bp_category' - 'normal', 'elevated', 'htn_stage1', 'htn_stage2',
          'crisis', or NaN when the reading could not be parsed

    Raises
    ------
    ValueError
        If there is no 'Blood Pressure' column after header normalisation.
    """
    # Standardize headers (trim + collapse inner whitespace); str() keeps
    # non-string labels from breaking .strip().
    df = df.rename(columns={c: re.sub(r"\s+", " ", str(c).strip()) for c in df.columns})

    if "Blood Pressure" not in df.columns:
        raise ValueError("Expected a 'Blood Pressure' column (e.g., '120/80').")

    # Parse every reading once into a two-column frame. This avoids building
    # a pd.Series per row and does not depend on Series.apply expanding its
    # result into a DataFrame.
    readings = pd.DataFrame(
        [_parse_bp_reading(v) for v in df["Blood Pressure"]],
        index=df.index,
        columns=["SBP", "DBP"],
    )

    # Clip implausible values.
    df["SBP"] = readings["SBP"].clip(70, 260)
    df["DBP"] = readings["DBP"].clip(40, 150)

    # Optional heart rate: first matching candidate column wins.
    hr_source = next(
        (c for c in ("Heart Rate", "HR", "Pulse", "Pulse Rate", "Pulse(bpm)") if c in df.columns),
        None,
    )
    if hr_source is not None:
        df["HR"] = pd.to_numeric(df[hr_source], errors="coerce").clip(30, 220)

    # Derived features.
    df["PULSE_PRESSURE"] = df["SBP"] - df["DBP"]
    df["MAP"] = df["DBP"] + df["PULSE_PRESSURE"] / 3.0
    df["bp_category"] = [_bp_category(s, d) for s, d in zip(df["SBP"], df["DBP"])]

    # Keep useful columns (preserve ID + context if present).
    context_cols = [c for c in ["Person ID", "Gender", "Age", "Occupation", "Sleep Duration",
                                "Quality of Sleep", "Physical Activity Level", "Stress Level",
                                "BMI Category", "Daily Steps", "Sleep Disorder"]
                    if c in df.columns]
    # HR is optional, so select it only when it was actually created;
    # selecting it unconditionally raises KeyError on tables without one.
    feature_cols = ["SBP", "DBP"]
    if hr_source is not None:
        feature_cols.append("HR")
    feature_cols += ["PULSE_PRESSURE", "MAP", "bp_category"]

    return df[context_cols + feature_cols]


In [27]:
# Load -> clean in one chain, then persist the per-person feature table.
bp_people = pd.read_csv(BP_RAW_FILE).pipe(clean_bp_people_table)
bp_people.to_csv(BP_PEOPLE_OUT, index=False)

# Confirm where the file went and how many rows it holds, then preview it.
print(f"✅ Saved per-person BP features → {BP_PEOPLE_OUT}  rows={len(bp_people)}")
bp_people.head()


✅ Saved per-person BP features → ../processed/bp_people_features.csv  rows=374


Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Daily Steps,Sleep Disorder,SBP,DBP,HR,PULSE_PRESSURE,MAP,bp_category
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,4200,,126,83,77,43,97.333333,htn_stage1
1,2,Male,28,Doctor,6.2,6,60,8,Normal,10000,,125,80,75,45,95.0,htn_stage1
2,3,Male,28,Doctor,6.2,6,60,8,Normal,10000,,125,80,75,45,95.0,htn_stage1
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,3000,Sleep Apnea,140,90,85,50,106.666667,htn_stage2
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,3000,Sleep Apnea,140,90,85,50,106.666667,htn_stage2
