#**Importing Libraries and Reading the Dataset**

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
hd=pd.read_csv('/content/heart_preinc_scal.csv')
hd.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# **Splitting Data for Supervised Learning Models**

In [8]:
X = hd.drop("target", axis=1)
y = hd["target"]
print("Shape of X:", X.shape)
print("\nShape of y:", y.shape)

Shape of X: (298, 13)

Shape of y: (298,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [5]:
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns

In [10]:
num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

# full preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

# transform train/test
X_train_final = preprocessor.fit_transform(X_train)
X_test_final  = preprocessor.transform(X_test)

print("Train final shape:", X_train_final.shape)
print("Test final shape:", X_test_final.shape)


Train final shape: (238, 13)
Test final shape: (60, 13)


# **Splitting Data for Unsupervised Learning Models**

In [13]:
target = hd['target']
X = hd.drop("target", axis=1)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

# Apply preprocessing on the entire dataset
X_final = preprocessor.fit_transform(X)
print("Final dataset shape:", X_final.shape)

Final dataset shape: (298, 13)


# **Logistic Regression Model**

In [14]:
logreg=LogisticRegression(max_iter=1000, random_state=1)

In [15]:
logreg.fit(X_train_final, y_train)

y_pred = logreg.predict(X_test_final)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9
Confusion Matrix:
 [[25  2]
 [ 4 29]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.93      0.89        27
           1       0.94      0.88      0.91        33

    accuracy                           0.90        60
   macro avg       0.90      0.90      0.90        60
weighted avg       0.90      0.90      0.90        60



GridSearchCV

In [44]:
param_grid = {
    "C": [0.001, 0.01, 0.1, 0.8, 1, 10],
}
grid_logreg = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring="accuracy")
grid_logreg.fit(X_train_final, y_train)
print("Best Logistic Regression:", grid_logreg.best_params_)
print("Best score:", grid_logreg.best_score_)

Best Logistic Regression: {'C': 0.1}
Best score: 0.840336879432624


RandomizedSearchCV

In [48]:
param_dist = {
    "C": np.logspace(-3, 2, 10),
    "solver": ["lbfgs", "saga"] ,
    "penalty": ["l2"]
}

# Randomized Search with 10 random combinations
rand_logreg = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)
rand_logreg.fit(X_train_final, y_train)

print("Best Parameters:", rand_logreg.best_params_)
print("Best CV Accuracy:", rand_logreg.best_score_)

#Saving The Best Model
with open('Best_Model.pkl', 'wb') as f:
    pickle.dump(rand_logreg.best_estimator_, f)
print('Best Model saved')
# to load the model later -->
# with open('Best_Model.pkl', 'rb') as f:
#    pickle.load(f)

Best Parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': np.float64(0.1668100537200059)}
Best CV Accuracy: 0.8404255319148936
Best Model saved


# **Decisoin Tree Classifier Model**

In [21]:
dtc = DecisionTreeClassifier(random_state=1)

In [22]:
dtc.fit(X_train_final, y_train)

y_pred = dtc.predict(X_test_final)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8166666666666667
Confusion Matrix:
 [[23  4]
 [ 7 26]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.85      0.81        27
           1       0.87      0.79      0.83        33

    accuracy                           0.82        60
   macro avg       0.82      0.82      0.82        60
weighted avg       0.82      0.82      0.82        60



GridSearchCV

In [45]:
param_grid = {
    "max_depth": [2, 3, 4, 6, 8, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}
grid_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring="accuracy")
grid_dt.fit(X_train_final, y_train)
y_pred = grid_dt.predict(X_test_final)

print("Best Decision Tree:", grid_dt.best_params_)
print("Best params:", grid_dt.best_params_)
print("Best score:", grid_dt.best_score_)

Best Decision Tree: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best params: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score: 0.7603723404255319


RandomizedSearchCV

In [24]:
# Parameter distributions for Decision Tree
param_dist = {
    "max_depth": [None, 2, 4, 6, 8, 10, 20],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6, 8],
    "criterion": ["gini", "entropy"]
}

# Randomized Search with 10 random combinations
rand_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,        # number of random combinations to try
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

rand_dt.fit(X_train, y_train)

print("Best Parameters:", rand_dt.best_params_)
print("Best CV Accuracy:", rand_dt.best_score_)


Best Parameters: {'min_samples_split': 20, 'min_samples_leaf': 6, 'max_depth': None, 'criterion': 'entropy'}
Best CV Accuracy: 0.7603723404255319


# **Random Forest Classifier Model**

In [25]:
rfc = RandomForestClassifier(max_depth=4,random_state=42)

In [26]:
rfc.fit(X_train_final, y_train)
y_pred = rfc.predict(X_test_final)

print("Random Forest Results\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Random Forest Results

Accuracy: 0.9

Confusion Matrix:
 [[25  2]
 [ 4 29]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.93      0.89        27
           1       0.94      0.88      0.91        33

    accuracy                           0.90        60
   macro avg       0.90      0.90      0.90        60
weighted avg       0.90      0.90      0.90        60



GridSearchCV

In [46]:
param_grid = {
    "max_depth": [2, 4, 6, 8, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"]
}
grid_dt = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring="accuracy")
grid_dt.fit(X_train, y_train)
print("Best Random Tree:", grid_dt.best_params_)
print("Best params:", grid_dt.best_params_)
print("Best score:", grid_dt.best_score_)

Best Random Tree: {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best params: {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score: 0.836081560283688


RandomizedSearchCV

In [28]:
param_dist_rf = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [4, 6, 8, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False]
}

rand_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=15,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

rand_rf.fit(X_train, y_train)

print("Best RF Params:", rand_rf.best_params_)
print("Best RF Accuracy:", rand_rf.best_score_)

Best RF Params: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
Best RF Accuracy: 0.8317375886524824


# **Support Vector Classifier Model**

In [29]:
svm = SVC(probability=True,random_state=42)

In [30]:
svm.fit(X_train_final, y_train)
y_pred = svm.predict(X_test_final)

print("SVM Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

SVM Results
Accuracy: 0.9
Confusion Matrix:
 [[26  1]
 [ 5 28]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.96      0.90        27
           1       0.97      0.85      0.90        33

    accuracy                           0.90        60
   macro avg       0.90      0.91      0.90        60
weighted avg       0.91      0.90      0.90        60



GridSearchCV

In [43]:
param_grid = {
    "C": [0.1, 1, 10, 100],
    "gamma": [0.001, 0.01, 0.1, 1],
    "kernel": ["linear", "rbf"]
}

# Grid Search
grid_svm = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid=param_grid,
    cv=4,
    scoring="accuracy",
    n_jobs=-1
)

grid_svm.fit(X_train, y_train)

print("Best Params (GridSearchCV):", grid_svm.best_params_)
print("Best CV Accuracy (GridSearchCV):", grid_svm.best_score_)

KeyboardInterrupt: 

RandomizedSearchCV

In [None]:
param_dist = {
    "C": np.logspace(-2, 2, 10),
    "gamma": np.logspace(-3, 1, 10),
    "kernel": ["linear", "rbf"]
}

# Randomized Search
rand_svm = RandomizedSearchCV(
    SVC(probability=True, random_state=42),
    param_distributions=param_dist,
    n_iter=15,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

rand_svm.fit(X_train, y_train)

print("Best Params (RandomizedSearchCV):", rand_svm.best_params_)
print("Best CV Accuracy (RandomizedSearchCV):", rand_svm.best_score_)

# **K-Means Clustering**

In [33]:
wcss = []
K = range(1, 11)  # test k from 1 to 10
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_final)
    wcss.append(kmeans.inertia_)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_final)

# Cluster labels
labels = kmeans.labels_

RandomizedSearchCV

In [35]:
from sklearn.metrics import silhouette_score

sil_scores = {}
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_final)
    sil_scores[k] = silhouette_score(X_final, labels)

best_k = max(sil_scores, key=sil_scores.get)
print("Best k:", best_k)
print("Best Silhouette Score:", sil_scores[best_k])

Best k: 2
Best Silhouette Score: 0.16692934032341614


# **Hierarchical Clustering**

In [37]:
hc = AgglomerativeClustering(n_clusters=2, linkage="ward")
hc_labels = hc.fit_predict(X_final.toarray() if hasattr(X_final, "toarray") else X_final)

print("Cluster labels (first 10):", hc_labels[:10])

Cluster labels (first 10): [0 1 1 0 1 0 1 1 0 1]


RandomizedSearchCV

In [39]:
linkages = ["ward", "complete", "average"]
best_score, best_params = -1, {}

for link in linkages:
    hc = AgglomerativeClustering(n_clusters=2, linkage=link)
    labels = hc.fit_predict(X_final)
    score = silhouette_score(X_final, labels)
    if score > best_score:
        best_score = score
        best_params = {"linkage": link}

print("Best params:", best_params)
print("Best Silhouette Score:", best_score)

Best params: {'linkage': 'average'}
Best Silhouette Score: 0.2963335972155111


# **Creating and Saving The Pipeline**

In [49]:
# import pickle
# import pandas as pd
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.base import BaseEstimator, TransformerMixin

# # Final chosen features (must exist after preprocessing)
# top_features = ['ca', 'chol', 'thal_3', 'slope_2', 'age',
#                 'exang', 'trestbps', 'thal_2', 'oldpeak', 'thalach']

# # --- Custom transformer to select only top_features ---
# class FeatureSelector(BaseEstimator, TransformerMixin):
#     def __init__(self, feature_names):
#         self.feature_names = feature_names

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         # Ensure DataFrame for column selection
#         X_df = pd.DataFrame(X, columns=self.feature_names_all)
#         return X_df[self.feature_names]

#     def set_output_names(self, feature_names):
#         self.feature_names_all = feature_names

# # Raw data column groups
# num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']   # scale
# cat_cols = ['cp', 'restecg', 'slope', 'thal']                  # encode
# bin_cols = ['ca', 'exang']                                     # passthrough

# # Preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", StandardScaler(), num_cols),
#         ("cat", OneHotEncoder(drop='first', sparse_output=False), cat_cols),
#         ("bin", "passthrough", bin_cols)
#     ]
# )

# # Load best model
# with open("Best_Model.pkl", "rb") as f:
#     best_model = pickle.load(f)

# # Build pipeline
# pipeline = Pipeline(steps=[
#     ("preprocessor", preprocessor),
#     ("model", best_model)
# ])

# # Fit pipeline on raw dataset
# data = pd.read_csv("heart.csv")
# X = data.drop("target", axis=1)
# y = data["target"]

# pipeline.fit(X, y)

# # Save pipeline
# with open("Best_Pipeline.pkl", "wb") as f:
#     pickle.dump(pipeline, f)

# print("Pipeline trained and saved successfully!")


FileNotFoundError: [Errno 2] No such file or directory: 'heart.csv'

In [52]:
import joblib
joblib.dump(rand_logreg.best_estimator_, "final_model.pkl")
print("Best Model saved as final_model.pkl")

Best Model saved as final_model.pkl
