# This is my model building code for my end to end project. 
# I hope this code as well as the comments will help you understand the steps of model building. 
# Also, I am very open-minded, so any feedback is welcome!

In [None]:
# Turning off the Jedi-based completer for this IPython session.
# NOTE(review): presumably a workaround to get tab-completion working in this
# notebook environment - confirm it is still needed.

%config Completer.use_jedi = False

In [None]:
# Importing the necessary libraries.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import KNNImputer
from eli5.sklearn import PermutationImportance
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix ,ConfusionMatrixDisplay
import pickle

In [None]:
# Loading the framingham.csv dataset into a DataFrame.

csv_path = "../input/heart-disease-prediction-using-logistic-regression/framingham.csv"
data = pd.read_csv(csv_path)

In [None]:
# Quick overview of the data: first rows, column names, summary statistics,
# number of missing values per column, shape and dtypes.

divider = "========================================================"
overview = (
    data.head(),
    data.columns,
    data.describe(),
    data.isnull().sum(),
    data.shape,
    data.dtypes,
)
# One print call; the divider line is placed between consecutive sections.
print(*overview, sep="\n" + divider + "\n")

In [None]:
# Plotting the distribution of the target to check for class imbalance.

sns.displot(data=data["TenYearCHD"])

Our data is highly imbalanced.

# Handling missing values & Feature Engineering.

In [None]:
# Visualizing where the NaNs are with a heatmap of the missing-value mask.

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data.isna(), ax=ax)

Imputing the "education" feature.

In [None]:
# Summary statistics of the "education" column.
data.loc[:, "education"].describe()

In [None]:
# Distribution of "education" with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.displot(data.education, kde=True)

In [None]:
# Using a creative way to distinguish and treat null values specially:
# "education_nan" flags the rows that were missing, and the missing values get
# their own "Unknown" category (5) because the number of null values is high.

# The indicator must be built before the fill, while the NaNs still exist.
data["education_nan"] = data["education"].isna().astype(int)
# Assign the result back instead of a chained `inplace=True` fill: the chained
# form works on a temporary Series and is not guaranteed to update `data`.
data["education"] = data["education"].fillna(5)

Imputing the "cigsPerDay" feature.

In [None]:
# Summary statistics of the "cigsPerDay" column.
data["cigsPerDay"].describe()

In [None]:
# Distribution of "cigsPerDay" with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.displot(data.cigsPerDay, kde=True)

In [None]:
# Filling the missing "cigsPerDay" values with 9.
# NOTE(review): 9 is presumably the rounded mean of the column - confirm
# against the describe() output.
# Assign the result back instead of a chained `inplace=True` fill: the chained
# form works on a temporary Series and is not guaranteed to update `data`.
data["cigsPerDay"] = data["cigsPerDay"].fillna(9)

Imputing the "BPMeds" feature.

In [None]:
# Summary statistics of the "BPMeds" column.
data["BPMeds"].describe()

In [None]:
# Distribution of "BPMeds" with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.displot(data.BPMeds, kde=True)

In [None]:
# Flagging the rows where "BPMeds" was missing, then filling those rows with 0.
# The indicator must be built before the fill, while the NaNs still exist.
data["BPMeds_nan"] = data["BPMeds"].isna().astype(int)
# Assign the result back instead of a chained `inplace=True` fill: the chained
# form works on a temporary Series and is not guaranteed to update `data`.
data["BPMeds"] = data["BPMeds"].fillna(0)

Imputing the "totChol" features

In [None]:
# Summary statistics of the "totChol" column.
data["totChol"].describe()

In [None]:
# Distribution of "totChol" with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.displot(data.totChol, kde=True)

In [None]:
# Flagging the rows where "totChol" was missing, then filling them with the
# column median (computed on the observed values, before the fill).
data["totChol_nan"] = data["totChol"].isna().astype(int)
# Assign the result back instead of a chained `inplace=True` fill: the chained
# form works on a temporary Series and is not guaranteed to update `data`.
data["totChol"] = data["totChol"].fillna(data["totChol"].median())

Imputing the "BMI" features

In [None]:
# Summary statistics of the "BMI" column.
data["BMI"].describe()

In [None]:
# Distribution of "BMI" with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.displot(data.BMI, kde=True)

In [None]:
# Filling the missing "BMI" values with the column mean.
# Assign the result back instead of a chained `inplace=True` fill: the chained
# form works on a temporary Series and is not guaranteed to update `data`.
data["BMI"] = data["BMI"].fillna(data["BMI"].mean())

Imputing the "glucose" feature

In [None]:
# Summary statistics of the "glucose" column.
data["glucose"].describe()

In [None]:
# Distribution of "glucose" with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.displot(data.glucose, kde=True)

In [None]:
# "glucose" has a high number of null values, so the missing rows are flagged
# in "glucose_nan" before being filled.
# Using the median as big outliers are present.

data["glucose_nan"] = data["glucose"].isna().astype(int)
# Assign the result back instead of a chained `inplace=True` fill: the chained
# form works on a temporary Series and is not guaranteed to update `data`.
data["glucose"] = data["glucose"].fillna(data["glucose"].median())

Imputing the "heartRate" feature

In [None]:
# Summary statistics of the "heartRate" column.
data["heartRate"].describe()

In [None]:
# Distribution of "heartRate" with a KDE overlay.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.displot(data.heartRate, kde=True)

In [None]:
# Filling the missing "heartRate" values with the column median.
# Assign the result back instead of a chained `inplace=True` fill: the chained
# form works on a temporary Series and is not guaranteed to update `data`.
data["heartRate"] = data["heartRate"].fillna(data["heartRate"].median())

# Modelling

In [None]:
# Dividing the data into features and target variable.
x = data.drop(columns="TenYearCHD")
y = data.TenYearCHD

# Splitting the data into train and test sets BEFORE oversampling.
# Random oversampling duplicates minority rows; if it runs before the split,
# copies of the same row land in both sets and the test metrics become
# over-optimistic. `stratify=y` keeps the original class ratio in both sets
# and `random_state` makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=0
)

# Fixing the imbalance by random oversampling (our dataset is small), on the
# training portion only. The test set keeps the real class distribution.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train_x, train_y)
train_x, train_y = X_resampled, y_resampled

In [None]:
# Checking the class balance of the target after oversampling.
sns.displot(data=y_resampled)

In [None]:
# Creating pipelines: every candidate model gets the same two scaling steps
# in front of it.
# NOTE(review): StandardScaler directly after RobustScaler looks redundant,
# and the tree-based models should not need scaling at all - confirm intent.


def _scaled_pipeline(step_name, estimator):
    """Return a Pipeline of RobustScaler -> StandardScaler -> `estimator`."""
    return Pipeline([
        ("robust_scalar", RobustScaler()),
        ("std_scalar", StandardScaler()),
        (step_name, estimator),
    ])


pipe1 = _scaled_pipeline("logistic_regression", LogisticRegression())
pipe2 = _scaled_pipeline("KNN", KNeighborsClassifier())
pipe3 = _scaled_pipeline("svm", SVC())
pipe4 = _scaled_pipeline("XGboost", XGBClassifier())
pipe5 = _scaled_pipeline("decision_tree", DecisionTreeClassifier())
pipe6 = _scaled_pipeline("random_forest", RandomForestClassifier())

In [None]:
# Training every candidate pipeline on the training split.

pipelines = [
    pipe1,
    pipe2,
    pipe3,
    pipe4,
    pipe5,
    pipe6,
]

for candidate in pipelines:
    candidate.fit(train_x, train_y)

In [None]:
# Predicting on the test split with each fitted pipeline, in the order pipe1..pipe6.

pred1, pred2, pred3, pred4, pred5, pred6 = (
    candidate.predict(test_x)
    for candidate in (pipe1, pipe2, pipe3, pipe4, pipe5, pipe6)
)

In [None]:
# Comparing accuracy and recall of every pipeline to select the best one.
# Recall gets more emphasis because we want as few false negatives as possible.

divider = "==================================================================="
candidates = (
    ("Logistic_Regression", pred1),
    ("KNN", pred2),
    ("SVC", pred3),
    ("xgboost", pred4),
    ("decision_tree", pred5),
    ("Random_forest", pred6),
)

for position, (label, predicted) in enumerate(candidates):
    if position:
        # Divider goes between models, not after the last one.
        print(divider)
    print(f"Accuracy of {label}", round(accuracy_score(test_y, predicted)*100, 2), "%")
    print(f"Recall of {label}", round(recall_score(test_y, predicted), 2))

Since the Decision Tree has the highest recall, we will go with the Decision Tree.
High recall means few false negatives.
The lower the number of false negatives in this case, the better the model will perform in real life.

In [None]:
# Visualizing the confusion matrix of the decision-tree pipeline.
# (Our aim is to reduce false negatives.)

cm = confusion_matrix(test_y, pred5)
# Named `display_cm` so that IPython's built-in `display` function is not
# shadowed for the rest of the notebook session.
display_cm = ConfusionMatrixDisplay(confusion_matrix=cm)
display_cm.plot()

# Feature Selection

In [None]:
# Reducing the unnecessary features: keep the 4 features with the best
# univariate score.
# NOTE(review): f_regression is a regression scorer; f_classif is the
# classification-oriented one - consider switching.

obj = SelectKBest(f_regression, k=4)
# Fit on the training split only, so the test rows do not influence which
# features get selected.
new_data = obj.fit_transform(train_x, train_y)

# Boolean mask of the selected columns (named so the builtin `filter` is not
# shadowed).
support_mask = obj.get_support()
feature = x.columns
final_f = feature[support_mask]
print(final_f)

# Hyperparameter Tuning

In [None]:
# Listing the tunable parameter names of the decision-tree pipeline.

pipe5_params = pipe5.get_params()
pipe5_params.keys()

In [None]:
# Using randomized search CV to get the best parameter values for the
# decision-tree pipeline, trained on the selected features only.

from sklearn.model_selection import RandomizedSearchCV

params = {
    # max_leaf_nodes must be at least 2; a value of 1 is rejected by
    # DecisionTreeClassifier and would make that candidate's fits fail.
    'decision_tree__max_leaf_nodes'       : [2, 4, 6, 8, 12, 15],
    'decision_tree__random_state'         : [42],
    'decision_tree__max_depth'            : [2, 3, 5, 10, 20],
    'decision_tree__min_samples_leaf'     : [5, 10, 20, 50, 100],
    'decision_tree__criterion'            : ["gini", "entropy"],
}

# random_state makes the sampled parameter combinations reproducible.
final_model = RandomizedSearchCV(
    pipe5,
    param_distributions=params,
    n_iter=3,
    cv=3,
    random_state=42,
)

final_model.fit(train_x[final_f], train_y)

In [None]:
# The parameter combination chosen by the randomized search.
best_params = final_model.best_params_
best_params

In [None]:
# Predicting on the test split, restricted to the selected features.
test_selected = test_x[final_f]
prediction = final_model.predict(test_selected)

In [None]:
# Visualizing the predicted values.
# displot is a figure-level function: it creates its own figure and ignores a
# preceding plt.figure(figsize=...). The intended 18x10 inch size is therefore
# requested through displot's own height/aspect parameters (width = height * aspect).

sns.displot(prediction, height=10, aspect=1.8)

In [None]:
# Accuracy, recall and precision of the tuned model on the test split.

metric_table = (
    ("Accuracy is", accuracy_score),
    ("Recall is", recall_score),
    ("Precision is", precision_score),
)
for label, metric in metric_table:
    print(label, metric(test_y, prediction))

In [None]:
# Visualizing the confusion matrix of the tuned model.

cm = confusion_matrix(test_y, prediction)
# Named `display_cm` so that IPython's built-in `display` function is not
# shadowed for the rest of the notebook session.
display_cm = ConfusionMatrixDisplay(confusion_matrix=cm)
display_cm.plot()

In [None]:
# Deriving the predictions from the predicted probabilities instead of the
# default decision: a sample is labelled 1 as soon as its probability in
# column 1 reaches 0.30. This is done to reduce false negatives.
class_probabilities = final_model.predict_proba(test_x[final_f])
positive_probability = class_probabilities[:, 1]
prediction_prob = np.where(positive_probability >= 0.30, 1, 0)

# Visualizing the confusion matrix of the thresholded predictions.
cm = confusion_matrix(test_y, prediction_prob)
display_cm = ConfusionMatrixDisplay(confusion_matrix=cm)
display_cm.plot()

Now we have a very low number of false negatives: 73.

In [None]:
# Finally saving our model as a pickle file (for deployment).
# The context manager guarantees the file is flushed and closed, even if
# pickling raises. A bare open() would leave the handle open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)

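# A very important note: the probabilities of a Decision Tree should be read with care, because the model works by simple decision rules and not by estimating probabilities. What it reports is only the share of positive training samples in the leaf a sample falls into. I am able to get values other than 0 and 1 because I have assigned values to min_samples_leaf, which keeps the leaves from becoming pure.

# This is just to give you an idea of how you can reduce false negatives or false positives.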