# **Importing Libraries and Reading the Dataset**

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import confusion_matrix

import pickle

In [None]:
# Location of the preprocessed heart-disease table.
DATA_PATH = '/content/final_heart.csv'

# Read the table into a DataFrame and preview its first rows.
hd = pd.read_csv(DATA_PATH)
hd.head()

Unnamed: 0,ca,chol,thal_3,slope_2,age,exang,trestbps,thal_2,oldpeak,thalach,target
0,2,-0.70194,True,True,-0.256357,0,-0.371077,False,-0.008573,0.801255,0
1,0,-0.892006,True,False,-0.145641,1,0.484371,False,1.907455,0.233473,0
2,0,-1.504441,True,False,1.736541,1,0.76952,False,1.451258,-1.076792,0
3,1,-0.892006,True,True,0.740092,0,0.94061,False,-0.920967,0.495526,0
4,3,1.029772,False,False,0.850808,0,0.370311,True,0.812582,-1.906626,0


# **Splitting Data for Supervised Learning Models**

In [None]:
# Labels are the "target" column; every other column is a predictor.
y = hd["target"]
X = hd.drop(columns="target")

# Report the dimensions of both pieces as a quick sanity check.
print("Shape of X:", X.shape)
print("\nShape of y:", y.shape)

Shape of X: (298, 10)

Shape of y: (298,)


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# **Splitting Data for Unsupervised Learning Models**

In [None]:
# Inputs for the clustering sections: the labels (kept only for reference) and
# the feature matrix as a NumPy array.
#
# Both are derived from X / y, which were split off the DataFrame earlier,
# instead of dropping the column from `hd` in place. The in-place version
# (`hd = hd.drop("target", axis=1).values`) could only run once: on a second
# run `hd` was already an ndarray and `.drop` raised AttributeError.
target = y

# NOTE: `hd` is rebound from the DataFrame to the feature array here because
# the clustering cells below refer to the features by this name.
hd = X.to_numpy()  # keep only features

# **Logistic Regression Model**

In [None]:
# Baseline logistic regression with C=0.8 and up to 1000 solver iterations.
logreg = LogisticRegression(C=0.8, max_iter=1000)

In [None]:
# Fit on the training split and predict the held-out test split
# (fit() returns the estimator itself, so the calls can be chained).
y_pred = logreg.fit(X_train, y_train).predict(X_test)

# Test-set metrics: overall accuracy, confusion matrix, per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9166666666666666
Confusion Matrix:
 [[26  1]
 [ 4 29]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91        27
           1       0.97      0.88      0.92        33

    accuracy                           0.92        60
   macro avg       0.92      0.92      0.92        60
weighted avg       0.92      0.92      0.92        60



GridSearchCV

In [None]:
# Candidate settings for the exhaustive search: six values of C, two solvers,
# L2 penalty only.
param_grid = {
    "C": [0.001, 0.01, 0.1, 0.8, 1, 10],
    "solver": ["lbfgs", "saga"],
    "penalty": ["l2"],
}

# 5-fold cross-validated grid search on the training split, ranked by accuracy.
grid_logreg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid_logreg.fit(X_train, y_train)

print("Best Logistic Regression:", grid_logreg.best_params_)
print("Best score:", grid_logreg.best_score_)

Best Logistic Regression: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.8109929078014184


RandomizedSearchCV

In [80]:
# Search space: ten log-spaced values of C between 1e-3 and 1e2, two solvers,
# L2 penalty only.
param_dist = {
    "C": np.logspace(-3, 2, 10),
    "solver": ["lbfgs", "saga"],
    "penalty": ["l2"]
}

# Randomized Search with 10 random combinations, 5-fold CV, ranked by accuracy
rand_logreg = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)
rand_logreg.fit(X_train, y_train)

print("Best Parameters:", rand_logreg.best_params_)
# The search is scored with accuracy, so the value is a mean CV accuracy;
# it was previously mislabelled as an F1 score.
print("Best CV Accuracy:", rand_logreg.best_score_)

# Saving the best model: serialise the estimator selected by the search.
with open('Best_Model.pkl', 'wb') as f:
    pickle.dump(rand_logreg.best_estimator_, f)
print('Best Model saved')
# To load the model later (only unpickle files you trust, since unpickling
# can execute arbitrary code):
#     with open('Best_Model.pkl', 'rb') as f:
#         model = pickle.load(f)

Best Parameters: {'solver': 'saga', 'penalty': 'l2', 'C': np.float64(0.01291549665014884)}
Best F1 Score: 0.8193262411347518
Best Model saved


# **Decision Tree Classifier Model**

In [None]:
# Baseline decision tree: Gini impurity, depth capped at 4, fixed seed.
dtc = DecisionTreeClassifier(
    max_depth=4,
    criterion="gini",
    random_state=42,
)

In [None]:
# Train the tree on the training split, then predict the test split
# (fit() returns the estimator, so the two calls are chained).
y_pred = dtc.fit(X_train, y_train).predict(X_test)

# Test-set metrics: overall accuracy, confusion matrix, per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8
Confusion Matrix:
 [[23  4]
 [ 8 25]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.85      0.79        27
           1       0.86      0.76      0.81        33

    accuracy                           0.80        60
   macro avg       0.80      0.80      0.80        60
weighted avg       0.81      0.80      0.80        60



GridSearchCV

In [None]:
# Candidate settings for the exhaustive decision-tree search.
param_grid = {
    "max_depth": [2, 4, 6, 8, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"],
}

# 5-fold cross-validated grid search on the training split, ranked by accuracy.
grid_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid_dt.fit(X_train, y_train)

# The winning parameter set is printed under both labels, then its CV score.
print("Best Decision Tree:", grid_dt.best_params_)
print("Best params:", grid_dt.best_params_)
print("Best score:", grid_dt.best_score_)

Best Decision Tree: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best params: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best score: 0.7398936170212765


RandomizedSearchCV

In [None]:
# Search space sampled by the randomized decision-tree search.
param_dist = {
    "max_depth": [None, 2, 4, 6, 8, 10, 20],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6, 8],
    "criterion": ["gini", "entropy"],
}

# Draw 10 random parameter combinations; each is scored by 5-fold CV accuracy.
# The seed fixes which combinations are drawn; n_jobs=-1 uses all CPU cores.
rand_dt = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rand_dt.fit(X_train, y_train)

print("Best Parameters:", rand_dt.best_params_)
print("Best CV Accuracy:", rand_dt.best_score_)


Best Parameters: {'min_samples_split': 20, 'min_samples_leaf': 8, 'max_depth': 10, 'criterion': 'entropy'}
Best CV Accuracy: 0.7648936170212766


# **Random Forest Classifier Model**

In [None]:
# Baseline random forest: trees capped at depth 4, fixed seed.
rfc = RandomForestClassifier(random_state=42, max_depth=4)

In [None]:
# Train the forest on the training split, then predict the test split
# (fit() returns the estimator, so the two calls are chained).
y_pred = rfc.fit(X_train, y_train).predict(X_test)

# Test-set metrics: overall accuracy, confusion matrix, per-class report.
print("Random Forest Results\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Random Forest Results

Accuracy: 0.8666666666666667

Confusion Matrix:
 [[24  3]
 [ 5 28]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.90      0.85      0.88        33

    accuracy                           0.87        60
   macro avg       0.87      0.87      0.87        60
weighted avg       0.87      0.87      0.87        60



GridSearchCV

In [None]:
# Candidate settings for the exhaustive random-forest search.
param_grid = {
    "max_depth": [2, 4, 6, 8, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}

# 5-fold cross-validated grid search on the training split, ranked by accuracy.
# Stored under its own name: this cell previously reused `grid_dt`, which
# overwrote the decision-tree search result, and labelled the output
# "Best Decision Tree" even though the estimator is a random forest.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring="accuracy")
grid_rf.fit(X_train, y_train)
print("Best Random Forest:", grid_rf.best_params_)
print("Best score:", grid_rf.best_score_)

Best Decision Tree: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best params: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score: 0.8109929078014184


RandomizedSearchCV

In [77]:
# Search space sampled by the randomized random-forest search.
param_dist_rf = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [4, 6, 8, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
}

# Draw 15 random parameter combinations; each is scored by 5-fold CV accuracy.
# The seed fixes which combinations are drawn; n_jobs=-1 uses all CPU cores.
rand_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=15,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rand_rf.fit(X_train, y_train)

print("Best RF Params:", rand_rf.best_params_)
print("Best RF Accuracy:", rand_rf.best_score_)

Best RF Params: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 4, 'bootstrap': True}
Best RF Accuracy: 0.8065602836879433


# **Support Vector Classifier Model**

In [None]:
# Baseline support vector classifier with probability estimates enabled
# and a fixed seed.
svm = SVC(random_state=42, probability=True)

In [None]:
# Train the SVC on the training split, then predict the test split
# (fit() returns the estimator, so the two calls are chained).
y_pred = svm.fit(X_train, y_train).predict(X_test)

# Test-set metrics: overall accuracy, confusion matrix, per-class report.
print("SVM Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

SVM Results
Accuracy: 0.8833333333333333
Confusion Matrix:
 [[24  3]
 [ 4 29]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.89      0.87        27
           1       0.91      0.88      0.89        33

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60



GridSearchCV

In [None]:
# Per-kernel sub-grids. `gamma` is a coefficient of the RBF kernel and has no
# effect on a linear kernel, so it is searched only for "rbf". Crossing it with
# "linear" (as a single flat grid does) refits the same linear model once per
# gamma value and reports a meaningless gamma when a linear model wins.
param_grid = [
    {
        "kernel": ["linear"],
        "C": [0.1, 1, 10, 100],
    },
    {
        "kernel": ["rbf"],
        "C": [0.1, 1, 10, 100],
        "gamma": [0.001, 0.01, 0.1, 1],
    },
]

# Grid Search: 5-fold CV on the training split, ranked by accuracy,
# using all CPU cores.
grid_svm = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_svm.fit(X_train, y_train)

print("Best Params (GridSearchCV):", grid_svm.best_params_)
print("Best CV Accuracy (GridSearchCV):", grid_svm.best_score_)

Best Params (GridSearchCV): {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Best CV Accuracy (GridSearchCV): 0.8068262411347519


RandomizedSearchCV

In [None]:
# Search space: ten log-spaced values each for C (1e-2..1e2) and gamma
# (1e-3..1e1), crossed with two kernels.
# NOTE(review): gamma is sampled for "linear" candidates as well, where it
# should have no effect on the fitted model - confirm against the SVC docs
# before reading anything into a reported gamma for a linear winner.
param_dist = {
    "C": np.logspace(-2, 2, 10),
    "gamma": np.logspace(-3, 1, 10),
    "kernel": ["linear", "rbf"],
}

# Draw 15 random parameter combinations; each is scored by 5-fold CV accuracy.
# The seed fixes which combinations are drawn; n_jobs=-1 uses all CPU cores.
rand_svm = RandomizedSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_distributions=param_dist,
    n_iter=15,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rand_svm.fit(X_train, y_train)

print("Best Params (RandomizedSearchCV):", rand_svm.best_params_)
print("Best CV Accuracy (RandomizedSearchCV):", rand_svm.best_score_)

Best Params (RandomizedSearchCV): {'kernel': 'linear', 'gamma': np.float64(10.0), 'C': np.float64(12.915496650148826)}
Best CV Accuracy (RandomizedSearchCV): 0.8067375886524821


# **K-Means Clustering**

In [None]:
# Elbow method: fit K-Means for each k and record the within-cluster sum of
# squares (WCSS), which scikit-learn exposes as `inertia_`.
wcss = []
K = range(1, 11)  # test k from 1 to 10
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(hd)
    wcss.append(kmeans.inertia_)  # inertia_ = WCSS

# Plot WCSS against k so the "elbow" can actually be read off; the values were
# previously collected but never displayed.
fig, ax = plt.subplots()
ax.plot(list(K), wcss, marker="o")
ax.set_xticks(list(K))
ax.set(
    title="Elbow Method for K-Means",
    xlabel="Number of clusters (k)",
    ylabel="WCSS (inertia)",
)
plt.show()

# Final two-cluster model. n_init is set explicitly, matching the other KMeans
# calls in this notebook, because its default differs between scikit-learn
# versions and would otherwise make the result version-dependent.
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10)
kmeans.fit(hd)

Choosing k by Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of a K-Means clustering for each candidate k from 2 to 9,
# keyed by k.
sil_scores = {}
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(hd)
    sil_scores[k] = silhouette_score(hd, labels)

# Pick the k with the highest score (the first such k if several tie).
best_k = max(sil_scores, key=lambda candidate: sil_scores[candidate])

print("Best k:", best_k)
print("Best Silhouette Score:", sil_scores[best_k])

Best k: 2
Best Silhouette Score: 0.20526859803574532


# **Hierarchical Clustering**

In [None]:
# Ward linkage matrix over the feature array, used for the dendrogram below.
linked = linkage(hd, 'ward')

# Draw the dendrogram so the merge structure can be inspected; the linkage
# matrix was previously computed but never used. Leaf labels are hidden
# because there is one leaf per sample.
fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(linked, no_labels=True, ax=ax)
ax.set(
    title="Hierarchical Clustering Dendrogram (Ward linkage)",
    xlabel="Samples",
    ylabel="Merge distance",
)
plt.show()

# Two-cluster agglomerative model with Ward linkage; one label per row of hd.
hc = AgglomerativeClustering(n_clusters=2, linkage="ward")
hc_labels = hc.fit_predict(hd)

print("Cluster labels:", hc_labels[:10])

Cluster labels: [0 1 1 0 1 0 1 0 0 1]


Choosing the Linkage Method by Silhouette Score

In [None]:
# Compare three linkage criteria for a two-cluster agglomerative model and keep
# the one with the highest silhouette score.
linkages = ["ward", "complete", "average"]
best_score, best_params = -1, {}  # -1 is the lowest possible silhouette score

for link in linkages:
    hc = AgglomerativeClustering(linkage=link, n_clusters=2)
    labels = hc.fit_predict(hd)
    score = silhouette_score(hd, labels)
    # Only a strictly better score replaces the current best, so on a tie the
    # earlier linkage in the list is kept.
    if not score > best_score:
        continue
    best_score, best_params = score, {"linkage": link}

print("Best params:", best_params)
print("Best Silhouette Score:", best_score)

Best params: {'linkage': 'average'}
Best Silhouette Score: 0.2635035754717412
