# Model Training

## 1 Import data and required packages

In [28]:
# !pip install scikit-learn catboost xgboost

In [29]:
# Basics
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling (Regression)
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
# from sklearn.svm import SVR
# from sklearn.linear_model import LinearRegression,LogisticRegression, Ridge, Lasso
# from sklearn.model_selection import RandomizedSearchCV
# from catboost import CatBoostRegressor
# from xgboost import XGBRegressor

#Modelling (Classification)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import warnings

### 1.1 Get data

In [30]:
df = pd.read_csv('data/heart_clean.csv')
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadAngina,...,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HasHeartDisease
0,Alabama,Female,Very good,4.0,0.0,Within past year,Yes,9.0,None of them,No,...,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year,Yes,6.0,None of them,No,...,95.25,30.13,No,No,Yes,Yes,"Yes, but dont know type",No,No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year,No,8.0,"6 or more, but not all",No,...,108.86,31.66,Yes,No,No,Yes,No for past 10 yrs,No,Yes,No
3,Alabama,Female,Fair,5.0,0.0,Within past year,Yes,9.0,None of them,No,...,90.72,31.32,No,No,Yes,Yes,No for past 10 yrs,No,Yes,No
4,Alabama,Female,Good,3.0,15.0,Within past year,Yes,5.0,1 to 5,No,...,79.38,33.07,No,No,Yes,Yes,No for past 10 yrs,No,No,No


## 2 Preparing X and Y variables

In [31]:
X = df.drop(columns=['HasHeartDisease'], axis=1)
X.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadAngina,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, but dont know type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,No for past 10 yrs,No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,No for past 10 yrs,No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,No for past 10 yrs,No,No


In [32]:
y = df['HasHeartDisease']
y.head()

0    No
1    No
2    No
3    No
4    No
Name: HasHeartDisease, dtype: object

## 3 Create Column Transformer with 3 types of transformers
- need to perform one hot encoding for everything for logistic regression
- then try to apply standardization or normalization
- this should be in the form of a pipeline, step by step

In [33]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

y = df['HasHeartDisease'].map({'No': 0, 'Yes': 1})  # <-- Convert target to 0/1

from sklearn.preprocessing import OneHotEncoder, StandardScaler #OHE and standardization
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer=OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [34]:
X = preprocessor.fit_transform(X)

## 4 Separate dataset into train and test

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((196817, 154), (49205, 154))

## 5 Give an Evaluate Function to give all metrics after model training

### 5.1 For regression (not what I'm using for this datset)

## 5.2 For classification

In [36]:
def evaluate_model(true, predicted, predicted_proba=None):
    accuracy = accuracy_score(true, predicted)       # Overall fraction of correct predictions
    precision = precision_score(true, predicted)     # Out of all predicted positives, how many were correct
    recall = recall_score(true, predicted)           # Out of all actual positives, how many did we catch
    f1 = f1_score(true, predicted)                   # Harmonic mean of precision and recall, balance of both
    auc = roc_auc_score(true, predicted_proba) if predicted_proba is not None else None  # Model's ability to rank positives higher than negatives
    return accuracy, precision, recall, f1, auc

In [52]:
models = {
    "Logistic Regression": LogisticRegression(n_jobs=-1),
    "Random Forest": RandomForestClassifier(n_jobs=-1),
    "XGBoost": XGBClassifier(n_jobs=-1),
    "CatBoost": CatBoostClassifier(verbose=False),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_jobs=-1)
}

model_list=[]
auc_list=[]
prec_list=[]
recall_list=[]
f1_list=[]

for name, model in models.items():
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Check if model can predict probabilities (needed for ROC AUC)
    if hasattr(model, "predict_proba"):
        y_test_proba = model.predict_proba(X_test)[:, 1] # Probabilities of positive class on test set
    else:
        y_test_proba = None
    
    train_metrics = evaluate_model(y_train, y_train_pred)
    test_metrics = evaluate_model(y_test, y_test_pred, y_test_proba)
    
    print(f"Model: {name}")
    print("Train Metrics:")
    print(f"  Accuracy: {train_metrics[0]:.4f}")
    print(f"  Precision: {train_metrics[1]:.4f}")
    print(f"  Recall: {train_metrics[2]:.4f}")
    print(f"  F1 Score: {train_metrics[3]:.4f}")
    
    print("Test Metrics:")
    print(f"  Accuracy: {test_metrics[0]:.4f}")
    print(f"  Precision: {test_metrics[1]:.4f}")
    print(f"  Recall: {test_metrics[2]:.4f}")
    print(f"  F1 Score: {test_metrics[3]:.4f}")
    if test_metrics[4] is not None:
        print(f"  ROC AUC: {test_metrics[4]:.4f}")
    print("="*40)

    model_list.append(name)
    auc_list.append(test_metrics[0])
    prec_list.append(test_metrics[1])
    recall_list.append(test_metrics[2])
    f1_list.append(test_metrics[3])


Model: Logistic Regression
Train Metrics:
  Accuracy: 0.9486
  Precision: 0.5720
  Recall: 0.2555
  F1 Score: 0.3532
Test Metrics:
  Accuracy: 0.9490
  Precision: 0.5516
  Recall: 0.2519
  F1 Score: 0.3459
  ROC AUC: 0.8876
Model: Random Forest
Train Metrics:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 0.9993
  F1 Score: 0.9996
Test Metrics:
  Accuracy: 0.9490
  Precision: 0.5779
  Recall: 0.1706
  F1 Score: 0.2634
  ROC AUC: 0.8660
Model: XGBoost
Train Metrics:
  Accuracy: 0.9588
  Precision: 0.7760
  Recall: 0.3496
  F1 Score: 0.4821
Test Metrics:
  Accuracy: 0.9488
  Precision: 0.5499
  Recall: 0.2367
  F1 Score: 0.3309
  ROC AUC: 0.8834
Model: CatBoost
Train Metrics:
  Accuracy: 0.9630
  Precision: 0.8634
  Recall: 0.3862
  F1 Score: 0.5337
Test Metrics:
  Accuracy: 0.9492
  Precision: 0.5580
  Recall: 0.2375
  F1 Score: 0.3332
  ROC AUC: 0.8887
Model: Decision Tree
Train Metrics:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000
Test Metrics:
  Accura

## 6 Results

In [53]:
pd.DataFrame(list(zip(model_list, auc_list, prec_list, recall_list, f1_list)), 
            columns = ['Model Name', 'Auccuracy', 'Precision', 'Recall', 'F1-Score'])

Unnamed: 0,Model Name,Auccuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.94903,0.551581,0.2519,0.345853
1,Random Forest,0.948969,0.577864,0.170593,0.26342
2,XGBoost,0.948806,0.549868,0.236702,0.330943
3,CatBoost,0.949152,0.558036,0.237462,0.333156
4,Decision Tree,0.919724,0.267794,0.288754,0.277879
5,K-Nearest Neighbors,0.945371,0.440928,0.079407,0.134578
