## 1. Import Libraries

In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

### Importing Classic ML models

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

## 2. Load Processed Data

In [5]:
# Load the preprocessed dataset that every later cell reads from `df`.
try:
    df = pd.read_csv('../data/processed/cleaned_data.csv')
except FileNotFoundError:
    print("Error: 'cleaned_data.csv' not found in 'data/processed/'.")
    print("Please ensure you have run the preprocessing script first.")
    # Stop here: without `df` every following cell would fail with a
    # confusing NameError instead of pointing at the missing file.
    raise
else:
    print("Processed data loaded successfully.")
    print(df.head())

Processed data loaded successfully.
        age  trestbps      chol   thalach   oldpeak  sex_1  cp_1  cp_2  cp_3  \
0 -0.267966 -0.376556 -0.667728  0.806035 -0.037124    1.0   0.0   0.0   0.0   
1 -0.157260  0.478910 -0.841918  0.237495  1.773958    1.0   0.0   0.0   0.0   
2  1.724733  0.764066 -1.403197 -1.074521  1.342748    1.0   0.0   0.0   0.0   
3  0.728383  0.935159 -0.841918  0.499898 -0.899544    1.0   0.0   0.0   0.0   
4  0.839089  0.364848  0.919336 -1.905464  0.739054    0.0   0.0   0.0   0.0   

   fbs_1  ...  slope_1  slope_2  ca_1  ca_2  ca_3  ca_4  thal_1  thal_2  \
0    0.0  ...      0.0      1.0   0.0   1.0   0.0   0.0     0.0     0.0   
1    1.0  ...      0.0      0.0   0.0   0.0   0.0   0.0     0.0     0.0   
2    0.0  ...      0.0      0.0   0.0   0.0   0.0   0.0     0.0     0.0   
3    0.0  ...      0.0      1.0   1.0   0.0   0.0   0.0     0.0     0.0   
4    1.0  ...      1.0      0.0   0.0   0.0   1.0   0.0     0.0     1.0   

   thal_3  target  
0     1.0   

## 3. Separating Features and Target & Splitting Data

In [7]:
# Label vector is the 'target' column; every other column is a feature.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing. The split is seeded for
# repeatability and stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (241, 22)
Testing set shape: (61, 22)


## 4. Model Development & Evaluation

In [9]:
# Baseline classifiers, keyed by the display name used in the report below.
# Estimators that accept a seed are pinned to 42 so re-runs are comparable.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# Fit each model on the training split and print its test-set metrics.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"Precision: {precision_score(y_test, y_pred):.2f}")
    print(f"Recall: {recall_score(y_test, y_pred):.2f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.2f}")
    print("\n")

Model: Logistic Regression
Accuracy: 0.85
Precision: 0.88
Recall: 0.85
F1-Score: 0.86


Model: K-Nearest Neighbors
Accuracy: 0.80
Precision: 0.86
Recall: 0.76
F1-Score: 0.81


Model: Support Vector Machine
Accuracy: 0.80
Precision: 0.86
Recall: 0.76
F1-Score: 0.81


Model: Naive Bayes
Accuracy: 0.85
Precision: 0.85
Recall: 0.88
F1-Score: 0.87


Model: Decision Tree
Accuracy: 0.66
Precision: 0.68
Recall: 0.70
F1-Score: 0.69




[WinError 2] The system cannot find the file specified
  File "C:\Users\praja\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\praja\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\praja\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\praja\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Model: Random Forest
Accuracy: 0.82
Precision: 0.84
Recall: 0.82
F1-Score: 0.83


Model: Grandient Boosting
Accuracy: 0.77
Precision: 0.81
Recall: 0.76
F1-Score: 0.78


Model: XGBoost
Accuracy: 0.77
Precision: 0.83
Recall: 0.73
F1-Score: 0.77




## 5. Hyperparameter Tuning for Logistic Regression

In [11]:
print("-- Starting Hyperparameter Tuning --")

# Search space for logistic regression:
#   C       -> inverse of regularisation strength (smaller = stronger penalty)
#   penalty -> norm used in the penalisation term
#   solver  -> optimisation algorithm
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
}

# Base estimator; the iteration cap is raised to 2000 and the seed is fixed.
log_reg = LogisticRegression(max_iter=2000, random_state=42)

# Exhaustive search over the grid, scored by accuracy with cv=5.
grid_search_lr = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only.
grid_search_lr.fit(X_train, y_train)

print(f"Best Parameters for Logistic Regression: {grid_search_lr.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_lr.best_score_:.4f}")

# Keep a handle on the winning estimator for later cells.
best_lr_model = grid_search_lr.best_estimator_

-- Starting Hyperparameter Tuning --
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters for Logistic Regression: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.8299


## 6. Hyperparameter Tuning for Naive Bayes

In [13]:
print("-- Tuning Naive Bayes --")

# Search space: var_smoothing is a stability parameter for GaussianNB.
# 100 log-spaced candidates from 10**0 down to 10**-9.
param_grid_nb = {
    'var_smoothing': np.logspace(0, -9, num=100),
}

nb_model = GaussianNB()

# Exhaustive search over the grid, scored by accuracy with cv=5.
grid_search_nb = GridSearchCV(
    estimator=nb_model,
    param_grid=param_grid_nb,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only.
grid_search_nb.fit(X_train, y_train)

print(f"Best Parameters for Naive Bayes: {grid_search_nb.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_nb.best_score_:.4f}")

# Keep a handle on the winning estimator for later cells.
best_nb_model = grid_search_nb.best_estimator_

-- Tuning Naive Bayes --
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters for Naive Bayes: {'var_smoothing': 0.01}
Best Cross-Validation Accuracy: 0.8384


## 7. Final Evaluation Of Tuned Models

In [15]:
print("-- Evaluating Tuned Models on Test Set --")

# Pull the winning estimator out of each grid search.
best_lr_model = grid_search_lr.best_estimator_
best_nb_model = grid_search_nb.best_estimator_

# Predict on the held-out test split with both tuned models up front.
y_pred_lr_tuned = best_lr_model.predict(X_test)
y_pred_nb_tuned = best_nb_model.predict(X_test)

# --- Tuned Logistic Regression: accuracy plus per-class report ---
print("\n*** Tuned Logistic Regression Report (Test Set): ***")
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_lr_tuned):.4f}")
print(classification_report(y_test, y_pred_lr_tuned))

# Visual separator between the two reports.
print("--"*30)

# --- Tuned Naive Bayes: accuracy plus per-class report ---
print("\n*** Tuned Naive Bayes Report (Test Set): ***")
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_nb_tuned):.4f}")
print(classification_report(y_test, y_pred_nb_tuned))

-- Evaluating Tuned Models on Test Set --

*** Tuned Logistic Regression Report (Test Set): ***

Accuracy: 0.8525
              precision    recall  f1-score   support

           0       0.83      0.86      0.84        28
           1       0.88      0.85      0.86        33

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61

------------------------------------------------------------

*** Tuned Naive Bayes Report (Test Set): ***

Accuracy: 0.8525
              precision    recall  f1-score   support

           0       0.83      0.86      0.84        28
           1       0.88      0.85      0.86        33

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61



## 8. Save The Best Tuned Model

In [17]:
# Persist the tuned logistic regression estimator for later use.
best_model = grid_search_lr.best_estimator_

model_path = Path('../models/final_model.pkl')
# Create the target directory first so the dump does not fail on a fresh
# checkout where '../models/' has not been created yet.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)
print("Best model (Tuned Logistic Regression) has been saved to '../models/final_model.pkl'.")

Best model (Tuned Logistic Regression) has been saved to '../models/final_model.pkl'.
