## 1. Import Libraries

In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

### Importing Classic ML models

In [3]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

## 2. Load Processed Data

In [5]:
# Load the preprocessed dataset that every later cell reads from `df`.
try:
    df = pd.read_csv('../data/processed/cleaned_test_data.csv')
except FileNotFoundError:
    print("Error: 'cleaned_test_data.csv' not found in 'data/processed/'.")
    print("Please ensure you have run the model_training script first.")
    # Re-raise so the run stops here: without `df` every later cell would
    # fail with a confusing NameError (or silently reuse a stale `df`
    # left over in the kernel from a previous run).
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print("Processed data loaded successfully.")
    print(df.head())

Processed data loaded successfully.
   num__age  num__trestbps  num__chol  num__thalach  num__oldpeak  cat__sex_1  \
0  0.697674       0.245283   0.273973      0.201613      0.354839         1.0   
1  0.534884       0.150943   0.171233      0.443548      0.241935         1.0   
2  0.604651       0.339623   0.182648      0.491935      0.387097         1.0   
3  0.534884       0.433962   0.150685      0.620968      0.064516         1.0   
4  0.674419       0.132075   0.326484      0.790323      0.290323         0.0   

   cat__cp_1  cat__cp_2  cat__cp_3  cat__fbs_1  ...  cat__exang_1  \
0        0.0        0.0        0.0         0.0  ...           1.0   
1        0.0        0.0        0.0         0.0  ...           1.0   
2        0.0        0.0        0.0         0.0  ...           1.0   
3        0.0        0.0        0.0         0.0  ...           0.0   
4        0.0        0.0        0.0         0.0  ...           1.0   

   cat__slope_1  cat__slope_2  cat__ca_1  cat__ca_2  cat__ca_3

## 3. Separating Features and Target & Splitting Data

In [7]:
# The label lives in the 'target' column; every other column is a feature.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (164, 21)
Testing set shape: (41, 21)


## 4. Model Development & Evaluation

In [11]:
# Baseline (untuned) classifiers, keyed by the label printed in the report.
# Estimators that accept a seed are pinned to 42 so results are repeatable.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    # Label spelling fixed (was "Grandient Boosting") so it matches the
    # name used for the same model in the tuned-model evaluation.
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# Fit every baseline on the training split and report its metrics on the
# held-out test split. Precision, recall and F1 use scikit-learn's default
# averaging.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"Precision: {precision_score(y_test, y_pred):.2f}")
    print(f"Recall: {recall_score(y_test, y_pred):.2f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.2f}")
    print("\n")

Model: Logistic Regression
Accuracy: 0.68
Precision: 0.70
Recall: 0.67
F1-Score: 0.68


Model: K-Nearest Neighbors
Accuracy: 0.73
Precision: 0.73
Recall: 0.76
F1-Score: 0.74


Model: Support Vector Machine
Accuracy: 0.73
Precision: 0.75
Recall: 0.71
F1-Score: 0.73


Model: Naive Bayes
Accuracy: 0.63
Precision: 0.59
Recall: 0.90
F1-Score: 0.72


Model: Decision Tree
Accuracy: 0.80
Precision: 0.84
Recall: 0.76
F1-Score: 0.80


Model: Random Forest
Accuracy: 0.73
Precision: 0.73
Recall: 0.76
F1-Score: 0.74


Model: Grandient Boosting
Accuracy: 0.76
Precision: 0.76
Recall: 0.76
F1-Score: 0.76


Model: XGBoost
Accuracy: 0.76
Precision: 0.79
Recall: 0.71
F1-Score: 0.75




## 5. Hyperparameter Tuning

### Hyperparameter tuning for Logistic Regression

In [13]:
print("\n=== Tuning Logistic Regression ===")

# Search space: both penalty types across six regularisation strengths
# (2 x 6 = 12 candidates).
param_grid_lr = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
)

# Base estimator; the grid above overrides `penalty` and `C` per candidate.
lr_model = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_lr = GridSearchCV(
    lr_model,
    param_grid_lr,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_lr.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy.
print("\nBest Parameters for Logistic Regression:", grid_search_lr.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_lr.best_score_:.4f}")


=== Tuning Logistic Regression ===
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best Parameters for Logistic Regression:
{'C': 10, 'penalty': 'l2'}
Best Cross-Validation Accuracy: 0.8778


### Hyperparameter tuning for K-Nearest Neighbors(KNN)

In [15]:
print("\n=== Tuning K-Nearest Neighbors (KNN) ===")

# Search space: neighbourhood size, vote weighting and distance metric
# (5 x 2 x 3 = 30 candidates).
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)

# Base estimator with library defaults; the grid supplies the tuned settings.
knn_model = KNeighborsClassifier()

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_knn = GridSearchCV(
    knn_model,
    param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_knn.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy.
print("\nBest Parameters for K-Nearest Neighbors:", grid_search_knn.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_knn.best_score_:.4f}")


=== Tuning K-Nearest Neighbors (KNN) ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best Parameters for K-Nearest Neighbors:
{'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
Best Cross-Validation Accuracy: 0.9019


### Hyperparameter tuning for Support Vector Machine(SVM)

In [17]:
print("\n=== Tuning Support Vector Machine (SVM) ===")

# Search space: regularisation strength, kernel type and kernel coefficient
# (4 x 3 x 2 = 24 candidates).
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Seeded base estimator; the grid supplies the tuned settings.
svm_model = SVC(random_state=42)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_svm = GridSearchCV(
    svm_model,
    param_grid_svm,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_svm.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy.
print("\nBest Parameters for Support Vector Machine:", grid_search_svm.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_svm.best_score_:.4f}")


=== Tuning Support Vector Machine (SVM) ===
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best Parameters for Support Vector Machine:
{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best Cross-Validation Accuracy: 0.8839


### Hyperparameter tuning for Naive Bayes

In [19]:
print("\n=== Tuning Naive Bayes ===")

# Search space: 100 smoothing values spaced logarithmically from 1 down to 1e-9.
param_grid_nb = dict(
    var_smoothing=np.logspace(0, -9, num=100),
)

# Base estimator with library defaults; only `var_smoothing` is tuned.
nb_model = GaussianNB()

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_nb = GridSearchCV(
    nb_model,
    param_grid_nb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_nb.fit(X_train, y_train)

# Report the winning value and its mean cross-validation accuracy.
print("\nBest Parameters for Naive Bayes:", grid_search_nb.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_nb.best_score_:.4f}")


=== Tuning Naive Bayes ===
Fitting 5 folds for each of 100 candidates, totalling 500 fits

Best Parameters for Naive Bayes:
{'var_smoothing': 0.02848035868435802}
Best Cross-Validation Accuracy: 0.8477


### Hyperparameter tuning for Decision Tree

In [21]:
print("=== Tuning Decision Tree ===")

# Search space: split criterion, depth cap and minimum sample counts
# (2 x 5 x 3 x 3 = 90 candidates). A max_depth of None leaves depth uncapped.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base estimator; the grid supplies the tuned settings.
dt_model = DecisionTreeClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_dt = GridSearchCV(
    dt_model,
    param_grid_dt,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_dt.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy.
print("\nBest Parameters for Decision Tree:", grid_search_dt.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_dt.best_score_:.4f}")

=== Tuning Decision Tree ===
Fitting 5 folds for each of 90 candidates, totalling 450 fits

Best Parameters for Decision Tree:
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Cross-Validation Accuracy: 0.8108


### Hyperparameter tuning for Random Forest

In [23]:
print("\n=== Tuning Random Forest ===")

# Search space: forest size, depth cap and minimum sample counts
# (3 x 3 x 3 x 3 = 81 candidates). A max_depth of None leaves depth uncapped.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base estimator; the grid supplies the tuned settings.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_rf = GridSearchCV(
    rf_model,
    param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy.
print("\nBest Parameters for Random Forest:", grid_search_rf.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_rf.best_score_:.4f}")


=== Tuning Random Forest ===
Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best Parameters for Random Forest:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.8902


### Hyperparameter tuning for Gradient Boosting

In [25]:
print("\n=== Tuning Gradient Boosting ===")

# Search space: number of boosting stages, learning rate and tree depth
# (3 x 3 x 3 = 27 candidates).
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# Seeded base estimator; the grid supplies the tuned settings.
gb_model = GradientBoostingClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_gb = GridSearchCV(
    gb_model,
    param_grid_gb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_gb.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy.
print("\nBest Parameters for Gradient Boosting:", grid_search_gb.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_gb.best_score_:.4f}")


=== Tuning Gradient Boosting ===
Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best Parameters for Gradient Boosting:
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50}
Best Cross-Validation Accuracy: 0.9025


### Hyperparameter tuning for XGBoost

In [27]:
print("\n=== Tuning XGBoost ===")

# Search space: boosting rounds, learning rate, tree depth and the row/column
# sampling fractions (3^5 = 243 candidates, the largest grid in this notebook).
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# Seeded base estimator evaluated with log-loss; the grid supplies the rest.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search_xgb = GridSearchCV(
    xgb_model,
    param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy.
print("\nBest Parameters for XGBoost:", grid_search_xgb.best_params_, sep="\n")
print(f"Best Cross-Validation Accuracy: {grid_search_xgb.best_score_:.4f}")


=== Tuning XGBoost ===
Fitting 5 folds for each of 243 candidates, totalling 1215 fits

Best Parameters for XGBoost:
{'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best Cross-Validation Accuracy: 0.9085


## 6. Final Evaluation Of Tuned Models

In [29]:
print("=== Evaluating Tuned Models on Test Set ===")

# Pull the best estimator out of each finished grid search. The order of the
# names on the left matches the order of the searches on the right.
(
    best_lr_model,
    best_knn_model,
    best_svm_model,
    best_nb_model,
    best_dt_model,
    best_rf_model,
    best_gb_model,
    best_xgb_model,
) = (
    search.best_estimator_
    for search in (
        grid_search_lr,
        grid_search_knn,
        grid_search_svm,
        grid_search_nb,
        grid_search_dt,
        grid_search_rf,
        grid_search_gb,
        grid_search_xgb,
    )
)

# Report label -> tuned estimator, in the order the reports are printed.
models = {
    "Logistic Regression": best_lr_model,
    "KNN": best_knn_model,
    "SVM": best_svm_model,
    "Naive Bayes": best_nb_model,
    "Decision Tree": best_dt_model,
    "Random Forest": best_rf_model,
    "Gradient Boosting": best_gb_model,
    "XGBoost": best_xgb_model,
}

# Score each tuned estimator on the held-out test split: overall accuracy
# followed by the per-class precision/recall/F1 table and a separator rule.
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n*** {name} Report (Test Set): ***")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))
    print("--" * 30)

=== Evaluating Tuned Models on Test Set ===

*** Logistic Regression Report (Test Set): ***
Accuracy: 0.7561
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        20
           1       0.79      0.71      0.75        21

    accuracy                           0.76        41
   macro avg       0.76      0.76      0.76        41
weighted avg       0.76      0.76      0.76        41

------------------------------------------------------------

*** KNN Report (Test Set): ***
Accuracy: 0.7805
              precision    recall  f1-score   support

           0       0.74      0.85      0.79        20
           1       0.83      0.71      0.77        21

    accuracy                           0.78        41
   macro avg       0.79      0.78      0.78        41
weighted avg       0.79      0.78      0.78        41

------------------------------------------------------------

*** SVM Report (Test Set): ***
Accuracy: 0.6829
              pre

## 8. Save All Hyperparameter-Tuned Models

In [33]:
# Directory the tuned estimators are written to, relative to the notebook.
# Create exactly this directory: the previous `os.makedirs("models")` made
# `./models`, which is not where the files are saved, so `joblib.dump`
# failed whenever `../models/tuned` did not already exist.
tuned_model_dir = os.path.join("..", "models", "tuned")
os.makedirs(tuned_model_dir, exist_ok=True)

# File stem -> tuned estimator; each entry becomes `<stem>.pkl`.
models = {
    "logistic_regression": best_lr_model,
    "knn": best_knn_model,
    "svm": best_svm_model,
    "naive_bayes": best_nb_model,
    "decision_tree": best_dt_model,
    "random_forest": best_rf_model,
    "gradient_boosting": best_gb_model,
    "xgboost": best_xgb_model
}

# Serialise every tuned estimator with joblib and confirm each write.
for name, model in models.items():
    joblib.dump(model, os.path.join(tuned_model_dir, f"{name}.pkl"))
    print(f"Saved {name} to models/tuned/{name}.pkl")

Saved logistic_regression to models/tuned/logistic_regression.pkl
Saved knn to models/tuned/knn.pkl
Saved svm to models/tuned/svm.pkl
Saved naive_bayes to models/tuned/naive_bayes.pkl
Saved decision_tree to models/tuned/decision_tree.pkl
Saved random_forest to models/tuned/random_forest.pkl
Saved gradient_boosting to models/tuned/gradient_boosting.pkl
Saved xgboost to models/tuned/xgboost.pkl
