In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, recall_score, precision_score, classification_report
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')

In [7]:
# ========================== 1. LOAD & CLEAN ==========================
# Read the raw table; the path is relative to the notebook's working directory.
df = pd.read_csv("CardioShieldDataSet.csv")

# Basic cleaning (common issues in this dataset): keep only rows that pass
# every range check below. All bounds are exclusive. A row with a missing
# value in any checked column fails its comparison and is dropped too.
valid_rows = (
    (df['ap_hi'] > 50) & (df['ap_hi'] < 250)        # ap_hi bounds
    & (df['ap_lo'] > 30) & (df['ap_lo'] < 150)      # ap_lo bounds
    & (df['ap_hi'] > df['ap_lo'])                   # ap_hi must exceed ap_lo
    & (df['height'] > 100) & (df['height'] < 220)   # height bounds
    & (df['weight'] > 30) & (df['weight'] < 200)    # weight bounds
)

# .copy() makes the cleaned frame independent of the raw one, so the column
# assignments in later cells never write into a filtered slice (the
# SettingWithCopyWarning that the global warnings filter would hide).
df = df.loc[valid_rows].copy()

In [8]:
# ========================== 2. FEATURE ENGINEERING  ==========================
# 'age' is divided by 365.25 to obtain 'age_years'.
df['age_years'] = df['age'] / 365.25

# BMI: weight over squared height, with height divided by 100 first.
df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

# Difference between the two pressure readings.
df['pulse_pressure'] = df['ap_hi'] - df['ap_lo']

# Interaction terms: plain element-wise products of two columns.
df['age_bp_inter'] = df['ap_hi'] * df['age_years']
df['gluc_bmi_inter'] = df['bmi'] * df['gluc']

In [9]:
# BMI categories
def bmi_cat(x):
    """Encode a BMI value as an ordinal category.

    Returns 0 for x < 18.5, 1 for 18.5 <= x < 25, 2 for 25 <= x < 30 and
    3 otherwise. A value that compares False against every bound (e.g. NaN)
    therefore also ends up in category 3.
    """
    upper_bounds = (18.5, 25, 30)
    for category, bound in enumerate(upper_bounds):
        if x < bound:
            return category
    # Not below any bound: highest category.
    return len(upper_bounds)
# Run bmi_cat on every value of the 'bmi' column and store the resulting
# category code in a new 'bmi_cat' column.
df['bmi_cat'] = df['bmi'].apply(bmi_cat)

In [10]:
# BP category (clinical)
def bp_cat(row):
    """Return an ordinal blood-pressure category (0-3) for one record.

    Parameters
    ----------
    row : mapping with 'ap_hi' and 'ap_lo' entries (e.g. a DataFrame row).

    Returns
    -------
    int
        0 if ap_hi < 120 and ap_lo < 80,
        1 if ap_hi < 130 and ap_lo < 85,
        2 if ap_hi < 140 and ap_lo < 90,
        3 otherwise (either reading at or above the last cut-off).

    A record lands in a tier only when BOTH readings are below that tier's
    cut-offs, so the category is driven by the worse of the two readings.
    """
    hi, lo = row['ap_hi'], row['ap_lo']
    if hi < 120 and lo < 80:
        return 0
    if hi < 130 and lo < 85:
        return 1
    # Must be `and`: with `or`, a record such as 180/85 or 130/95 would be
    # placed in tier 2 even though one reading is beyond the tier-3 cut-off.
    if hi < 140 and lo < 90:
        return 2
    return 3
# Call bp_cat once per row (axis=1 passes each row to the function) and store
# the returned category code in a new 'bp_cat' column.
df['bp_cat'] = df.apply(bp_cat, axis=1)

In [11]:
# Composite risk index: the number of risk flags raised for each record (0-4).
# The ap_hi and BMI cut-offs are inclusive so that they sit on the same
# boundaries as bp_cat (tiers below 3 require ap_hi < 140) and bmi_cat
# (categories below 3 require bmi < 30). Values of exactly 140 / 30 count.
df['composite_risk'] = (
    (df['ap_hi'] >= 140).astype(int)         # ap_hi at or above 140
    + (df['cholesterol'] > 1).astype(int)    # cholesterol code above 1
    + (df['smoke'] == 1).astype(int)         # smoke flag set
    + (df['bmi'] >= 30).astype(int)          # BMI at or above 30
)

In [12]:
# Columns fed to the model, in the order they will appear in the feature matrix.
features = [
    # age plus body / pressure measurements
    'age_years',
    'gender',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    # lab codes and lifestyle flags
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
    # engineered numeric features
    'bmi',
    'pulse_pressure',
    'age_bp_inter',
    'gluc_bmi_inter',
    # engineered categorical / count features
    'bmi_cat',
    'bp_cat',
    'composite_risk',
]

# Target: the 'cardio' column.
y = df['cardio']
# Design matrix: only the columns named in `features`, in that order.
X = df[features]

In [13]:
# ========================== 3. PREPROCESSING (PDF 5.3 + 5.4) ==========================
# Split BEFORE scaling. Fitting the scaler on every row would let the test
# rows influence the mean/std applied to the training data (data leakage).
# Same test_size, random_state and stratification as before, so the split
# itself is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Learn the scaling parameters from the training rows only, then reuse them
# unchanged on the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that still reads X_scaled: the full feature matrix,
# scaled with the training-set statistics.
X_scaled = scaler.transform(X)