## 1. Import Libraries

In [1]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

### Importing Classic ML models

In [3]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

### Import Supporting Files

In [5]:
# Resolve the directory the notebook runs from and its parent directory,
# which is treated as the project root.
current_dir = os.path.abspath(os.curdir)
project_root = os.path.dirname(current_dir)

# Put the project root on the import path (once) so the local `scripts`
# package can be imported below.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from scripts.preprocessing import create_preprocessor
from scripts.model_utils import load_data, save_pipeline

## 2. Load The Data

In [7]:
# Read the raw heart.csv file into `df` and preview the first rows.
try:
    df = pd.read_csv('../data/raw/heart.csv')
except FileNotFoundError:
    print("Error: 'heart.csv' not found in 'data/raw/'.")
    print("Please ensure the raw data is in the '../data/raw' directory.")
    # Re-raise instead of continuing: every later cell needs `df`, so
    # swallowing the error would only resurface as a NameError further down.
    raise
else:
    print("Data loaded successfully.")
    print(df.head())

Data loaded successfully.
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


## 3. Splitting Data

### Splitting Data into training+validation and final test sets

In [9]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows as the final test set, stratified on the label so
# both parts keep the same class balance.
X_train_val, X_final_test, y_train_val, y_final_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("Data splited into training+validation and final test set")

Data splited into training+validation and final test set


### Splitting training+validation data into separate sets

In [11]:
# Carve the validation set out of the training+validation portion:
# 25% of that portion, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

### Save the unseen test data to be used in the final evaluation 

In [13]:
# Make sure the output directory exists before writing the held-out data.
final_test_output_path = '../data/processed/final_test_data.csv'
os.makedirs(os.path.dirname(final_test_output_path), exist_ok=True)

# Re-attach the label as a trailing `target` column and persist the frame
# without its index.
final_test_df = X_final_test.assign(target=y_final_test)
final_test_df.to_csv(final_test_output_path, index=False)
print(f"Final unseen test data saved to {final_test_output_path}")

# Report the size of each split.
train_shape, val_shape, test_shape = X_train.shape, X_val.shape, X_final_test.shape
print(f"Training set shape: {train_shape}")
print(f"Validation set shape: {val_shape}")
print(f"Final Test set shape: {test_shape}")

Final unseen test data saved to ../data/processed/final_test_data.csv
Training set shape: (615, 13)
Validation set shape: (205, 13)
Final Test set shape: (205, 13)


## 4. Model Development & Evaluation

### Evaluation of Base Model Performance

In [17]:
# Baseline comparison: fit each classifier with the settings below on the
# training split and score it on the validation split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    # Label spelling fixed so it matches the tuned-model section.
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

for model_name, model in models.items():
    # Build a fresh preprocessor for every pipeline so no fitted transformer
    # object is shared between models.
    preprocessor = create_preprocessor()
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)  # Predicts on validation set

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.2f}")
    print(f"Precision: {precision_score(y_val, y_pred):.2f}")
    print(f"Recall: {recall_score(y_val, y_pred):.2f}")
    print(f"F1-Score: {f1_score(y_val, y_pred):.2f}")
    print("\n")

Model: Logistic Regression
Accuracy: 0.84
Precision: 0.86
Recall: 0.83
F1-Score: 0.84


Model: K-Nearest Neighbors
Accuracy: 0.81
Precision: 0.81
Recall: 0.84
F1-Score: 0.82


Model: Support Vector Machine
Accuracy: 0.89
Precision: 0.90
Recall: 0.89
F1-Score: 0.89


Model: Naive Bayes
Accuracy: 0.84
Precision: 0.85
Recall: 0.85
F1-Score: 0.85


Model: Decision Tree
Accuracy: 0.95
Precision: 0.95
Recall: 0.94
F1-Score: 0.95


Model: Random Forest
Accuracy: 0.98
Precision: 1.00
Recall: 0.96
F1-Score: 0.98


Model: Grandient Boosting
Accuracy: 0.94
Precision: 0.94
Recall: 0.93
F1-Score: 0.94


Model: XGBoost
Accuracy: 0.95
Precision: 0.94
Recall: 0.96
F1-Score: 0.95




### Analysis of Base Models
* Based on the baseline evaluation, the Random Forest model with its default parameters shows the strongest performance, achieving a high Accuracy of 0.98, a perfect Precision of 1.00, a strong Recall of 0.96, and a top-tier F1-Score of 0.98.
* The Decision Tree and XGBoost models also perform well, with high scores across all metrics. 
* This indicates that ensemble methods and tree-based models are well-suited for this dataset, even before any hyperparameter tuning is performed.

## 5. Hyperparameter Tuning

### Hyperparameter tuning for Logistic Regression

In [19]:
print("\n=== Tuning Logistic Regression ===")

# Preprocessing + classifier in one pipeline so the preprocessor is refit
# inside every cross-validation fold.
preprocessor = create_preprocessor()
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # max_iter=1000 matches the baseline model; the default iteration budget
    # can be too small for the 'saga' solver searched below.
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Regularisation strength, penalty type and solver to search over.
param_grid_lr = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}

# Exhaustive 5-fold search scored by accuracy, using all available cores.
grid_search_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search_lr.fit(X_train, y_train)

print("\nBest Parameters for Logistic Regression:")
print(grid_search_lr.best_params_)
print(f"Best Cross-Validation Accuracy: {grid_search_lr.best_score_:.4f}")


=== Tuning Logistic Regression ===
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best Parameters for Logistic Regression:
{'classifier__C': 10, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.8683


### Hyperparameter tuning for K-Nearest Neighbors(KNN)

In [21]:
print("\n=== Tuning K-Nearest Neighbors (KNN) ===")

# Pipeline: project preprocessor followed by a default KNN classifier.
preprocessor = create_preprocessor()
knn_steps = [
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
]
pipeline_knn = Pipeline(steps=knn_steps)

# Search space: neighbourhood size, vote weighting and distance metric.
# Note: with KNeighborsClassifier's default p=2, 'minkowski' is the same
# distance as 'euclidean', so those candidates score identically.
param_grid_knn = dict(
    classifier__n_neighbors=[3, 5, 7, 9, 11],
    classifier__weights=['uniform', 'distance'],
    classifier__metric=['euclidean', 'manhattan', 'minkowski'],
)

# 5-fold grid search on accuracy, parallelised over all cores.
grid_search_knn = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_knn.fit(X_train, y_train)

best_knn_params = grid_search_knn.best_params_
best_knn_score = grid_search_knn.best_score_
print("\nBest Parameters for K-Nearest Neighbors:")
print(best_knn_params)
print(f"Best Cross-Validation Accuracy: {best_knn_score:.4f}")


=== Tuning K-Nearest Neighbors (KNN) ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best Parameters for K-Nearest Neighbors:
{'classifier__metric': 'euclidean', 'classifier__n_neighbors': 11, 'classifier__weights': 'distance'}
Best Cross-Validation Accuracy: 0.9561


### Hyperparameter tuning for Support Vector Machine(SVM)

In [23]:
print("\n=== Tuning Support Vector Machine (SVM) ===")

# Pipeline: project preprocessor followed by an SVC.
# probability=True is set so the fitted classifier exposes predict_proba.
preprocessor = create_preprocessor()
svm_classifier = SVC(random_state=42, probability=True)
pipeline_svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm_classifier),
])

# Search space: regularisation strength, kernel and kernel coefficient.
param_grid_svm = dict(
    classifier__C=[0.1, 1, 10, 100],
    classifier__kernel=['linear', 'rbf', 'poly'],
    classifier__gamma=['scale', 'auto'],
)

# 5-fold grid search on accuracy, parallelised over all cores.
grid_search_svm = GridSearchCV(
    estimator=pipeline_svm,
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_svm.fit(X_train, y_train)

best_svm_params = grid_search_svm.best_params_
best_svm_score = grid_search_svm.best_score_
print("\nBest Parameters for Support Vector Machine:")
print(best_svm_params)
print(f"Best Cross-Validation Accuracy: {best_svm_score:.4f}")


=== Tuning Support Vector Machine (SVM) ===
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best Parameters for Support Vector Machine:
{'classifier__C': 100, 'classifier__gamma': 'scale', 'classifier__kernel': 'poly'}
Best Cross-Validation Accuracy: 0.9480


### Hyperparameter tuning for Naive Bayes

In [25]:
print("\n=== Tuning Naive Bayes ===")

# Pipeline: project preprocessor followed by Gaussian Naive Bayes.
preprocessor = create_preprocessor()
nb_steps = [
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB()),
]
pipeline_nb = Pipeline(steps=nb_steps)

# Only one knob to tune: 100 log-spaced smoothing values from 1e0 down to 1e-9.
var_smoothing_values = np.logspace(0, -9, num=100)
param_grid_nb = {'classifier__var_smoothing': var_smoothing_values}

# 5-fold grid search on accuracy, parallelised over all cores.
grid_search_nb = GridSearchCV(
    estimator=pipeline_nb,
    param_grid=param_grid_nb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_nb.fit(X_train, y_train)

best_nb_params = grid_search_nb.best_params_
best_nb_score = grid_search_nb.best_score_
print("\nBest Parameters for Naive Bayes:")
print(best_nb_params)
print(f"Best Cross-Validation Accuracy: {best_nb_score:.4f}")


=== Tuning Naive Bayes ===
Fitting 5 folds for each of 100 candidates, totalling 500 fits

Best Parameters for Naive Bayes:
{'classifier__var_smoothing': 0.1873817422860384}
Best Cross-Validation Accuracy: 0.8439


### Hyperparameter tuning for Decision Tree

In [27]:
print("=== Tuning Decision Tree ===")

# Pipeline: project preprocessor followed by a seeded decision tree.
preprocessor = create_preprocessor()
dt_classifier = DecisionTreeClassifier(random_state=42)
pipeline_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', dt_classifier),
])

# Search space: split criterion plus the usual tree-size constraints.
param_grid_dt = dict(
    classifier__criterion=['gini', 'entropy'],
    classifier__max_depth=[3, 5, 7, 10, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# 5-fold grid search on accuracy, parallelised over all cores.
grid_search_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_dt.fit(X_train, y_train)

best_dt_params = grid_search_dt.best_params_
best_dt_score = grid_search_dt.best_score_
print("\nBest Parameters for Decision Tree:")
print(best_dt_params)
print(f"Best Cross-Validation Accuracy: {best_dt_score:.4f}")

=== Tuning Decision Tree ===
Fitting 5 folds for each of 90 candidates, totalling 450 fits

Best Parameters for Decision Tree:
{'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
Best Cross-Validation Accuracy: 0.9398


### Hyperparameter tuning for Random Forest

In [29]:
print("\n=== Tuning Random Forest ===")

# Pipeline: project preprocessor followed by a seeded random forest.
preprocessor = create_preprocessor()
rf_classifier = RandomForestClassifier(random_state=42)
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

# Search space: forest size plus per-tree size constraints.
param_grid_rf = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[5, 10, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# 5-fold grid search on accuracy, parallelised over all cores.
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

best_rf_params = grid_search_rf.best_params_
best_rf_score = grid_search_rf.best_score_
print("\nBest Parameters for Random Forest:")
print(best_rf_params)
print(f"Best Cross-Validation Accuracy: {best_rf_score:.4f}")


=== Tuning Random Forest ===
Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best Parameters for Random Forest:
{'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best Cross-Validation Accuracy: 0.9528


### Hyperparameter tuning for Gradient Boosting

In [31]:
print("\n=== Tuning Gradient Boosting ===")

# Pipeline: project preprocessor followed by seeded gradient boosting.
preprocessor = create_preprocessor()
gb_classifier = GradientBoostingClassifier(random_state=42)
pipeline_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', gb_classifier),
])

# Search space: number of boosting stages, shrinkage and tree depth.
param_grid_gb = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__max_depth=[3, 5, 7],
)

# 5-fold grid search on accuracy, parallelised over all cores.
grid_search_gb = GridSearchCV(
    estimator=pipeline_gb,
    param_grid=param_grid_gb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_gb.fit(X_train, y_train)

best_gb_params = grid_search_gb.best_params_
best_gb_score = grid_search_gb.best_score_
print("\nBest Parameters for Gradient Boosting:")
print(best_gb_params)
print(f"Best Cross-Validation Accuracy: {best_gb_score:.4f}")


=== Tuning Gradient Boosting ===
Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best Parameters for Gradient Boosting:
{'classifier__learning_rate': 0.2, 'classifier__max_depth': 5, 'classifier__n_estimators': 50}
Best Cross-Validation Accuracy: 0.9593


### Hyperparameter tuning for XGBoost

In [33]:
print("\n=== Tuning XGBoost ===")

# Pipeline: project preprocessor followed by a seeded XGBoost classifier.
preprocessor = create_preprocessor()
xgb_classifier = XGBClassifier(random_state=42)
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_classifier),
])

# Search space: boosting rounds, shrinkage, tree depth and the row/column
# subsampling ratios (3^5 = 243 combinations).
param_grid_xgb = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__max_depth=[3, 5, 7],
    classifier__subsample=[0.7, 0.8, 1.0],
    classifier__colsample_bytree=[0.7, 0.8, 1.0],
)

# 5-fold grid search on accuracy, parallelised over all cores.
grid_search_xgb = GridSearchCV(
    estimator=pipeline_xgb,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb.fit(X_train, y_train)

best_xgb_params = grid_search_xgb.best_params_
best_xgb_score = grid_search_xgb.best_score_
print("\nBest Parameters for XGBoost:")
print(best_xgb_params)
print(f"Best Cross-Validation Accuracy: {best_xgb_score:.4f}")


=== Tuning XGBoost ===
Fitting 5 folds for each of 243 candidates, totalling 1215 fits

Best Parameters for XGBoost:
{'classifier__colsample_bytree': 0.7, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 200, 'classifier__subsample': 0.7}
Best Cross-Validation Accuracy: 0.9577


## 6. Evaluation of Tuned Models (Validation Set)

In [35]:
# The loop below scores on X_val / y_val, so the banner names the validation
# set (it previously said "Test Set", contradicting the per-model reports).
print("=== Evaluating Tuned Models on Validation Set ===")

# Retrieving the best models found by GridSearchCV. With GridSearchCV's
# default refit=True, each best_estimator_ is the winning pipeline refit on
# the full data passed to fit() (here X_train).
best_lr_model = grid_search_lr.best_estimator_
best_knn_model = grid_search_knn.best_estimator_
best_svm_model = grid_search_svm.best_estimator_
best_nb_model = grid_search_nb.best_estimator_
best_dt_model = grid_search_dt.best_estimator_
best_rf_model = grid_search_rf.best_estimator_
best_gb_model = grid_search_gb.best_estimator_
best_xgb_model = grid_search_xgb.best_estimator_

# Display name -> tuned pipeline, in the order they are reported.
models = {
    "Logistic Regression": best_lr_model,
    "KNN": best_knn_model,
    "SVM": best_svm_model,
    "Naive Bayes": best_nb_model,
    "Decision Tree": best_dt_model,
    "Random Forest": best_rf_model,
    "Gradient Boosting": best_gb_model,
    "XGBoost": best_xgb_model
}

# Evaluating each model on the validation split: overall accuracy followed
# by the per-class precision/recall/F1 report.
for name, model in models.items():
    y_pred = model.predict(X_val)
    print(f"\n*** {name} Report (Validation Set): ***")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print(classification_report(y_val, y_pred))
    print("--" * 30)

=== Evaluating Tuned Models on Test Set ===

*** Logistic Regression Report (Validation Set): ***
Accuracy: 0.8585
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       100
           1       0.88      0.84      0.86       105

    accuracy                           0.86       205
   macro avg       0.86      0.86      0.86       205
weighted avg       0.86      0.86      0.86       205

------------------------------------------------------------

*** KNN Report (Validation Set): ***
Accuracy: 0.9805
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       100
           1       1.00      0.96      0.98       105

    accuracy                           0.98       205
   macro avg       0.98      0.98      0.98       205
weighted avg       0.98      0.98      0.98       205

------------------------------------------------------------

*** SVM Report (Validation Set): ***
Accuracy: 0.9707

### Analysis of Tuned Models
* Top Performers: KNN and Random Forest are tied for the highest accuracy at 0.9805. Both models also have an F1-score of 0.98 and a perfect precision of 1.00 for the positive class.

* Strong Contenders: Gradient Boosting and SVM follow closely behind, with high accuracy and F1-scores as well.

* Average Performers: Logistic Regression and Naive Bayes show lower performance compared to the other models, suggesting they may not be the best fit for this dataset.

## 7. Save All Hyperparameter-Tuned Models

In [37]:
# Create the directory the models are actually written to. The previous
# os.makedirs("models") created ./models next to the notebook, while the
# dumps below target ../models/tuned, which was never guaranteed to exist.
tuned_models_dir = "../models/tuned"
os.makedirs(tuned_models_dir, exist_ok=True)

# File stem -> tuned pipeline to persist.
models_to_save = {
    "logistic_regression": best_lr_model,
    "knn": best_knn_model,
    "svm": best_svm_model,
    "naive_bayes": best_nb_model,
    "decision_tree": best_dt_model,
    "random_forest": best_rf_model,
    "gradient_boosting": best_gb_model,
    "xgboost": best_xgb_model
}

# Serialise each tuned pipeline to <tuned_models_dir>/<name>.pkl with joblib.
for name, model in models_to_save.items():
    joblib.dump(model, f"{tuned_models_dir}/{name}.pkl")
    print(f"Saved {name} to models/tuned/{name}.pkl")

Saved logistic_regression to models/tuned/logistic_regression.pkl
Saved knn to models/tuned/knn.pkl
Saved svm to models/tuned/svm.pkl
Saved naive_bayes to models/tuned/naive_bayes.pkl
Saved decision_tree to models/tuned/decision_tree.pkl
Saved random_forest to models/tuned/random_forest.pkl
Saved gradient_boosting to models/tuned/gradient_boosting.pkl
Saved xgboost to models/tuned/xgboost.pkl
