# **Import Libraries**

In [None]:
#loading dataset
import pandas as pd
import numpy as np
#visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#EDA
from collections import Counter
# data preprocessing
from sklearn.preprocessing import StandardScaler
# data splitting
from sklearn.model_selection import train_test_split
# data modeling
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
#ensembling
import warnings
warnings.filterwarnings("ignore")

# Import data

In [None]:
# Read the heart-disease CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer=r'../input/heart-disease-uci/heart.csv')

In [None]:
# `df` is a second name for the same DataFrame object as `data` (no copy is made),
# so any change made through `df` is also visible through `data`.
df=data

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last five rows.
df.tail(n=5)

# Checking the datatypes and shape of the data:


In [None]:
# Display the dtype of every column (the frame's shape is not shown by this cell).
df.dtypes

# Checking for duplicate rows:


In [None]:
# Keep only the rows that repeat an earlier row exactly, then report (rows, columns).
is_repeat = df.duplicated()
duplicate_rows_df = df.loc[is_repeat]
print(duplicate_rows_df.shape)

In [None]:
# Checking for null values:


In [None]:
# Number of missing values in each column.
null_counts = df.isna().sum()
print(null_counts)

# Renaming columns:


In [None]:
# Assign descriptive column names by position (14 columns, label column last).
readable_names = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]
df.columns = readable_names

In [None]:
# Confirm the new column names on the first five rows.
df.head(n=5)

# Outlier detection:


In [None]:
# Box plot of age to spot outliers.
fig, axis = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df["age"], ax=axis)
plt.show()

In [None]:
# Box plot of resting blood pressure to spot outliers.
fig, axis = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df["resting_blood_pressure"], ax=axis)
plt.show()

In [None]:
# Box plot of cholesterol to spot outliers.
fig, axis = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df["cholesterol"], ax=axis)
plt.show()

In [None]:
# Box plot of maximum heart rate achieved to spot outliers.
fig, axis = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df["max_heart_rate_achieved"], ax=axis)
plt.show()

In [None]:
# Box plot of ST depression to spot outliers.
fig, axis = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df["st_depression"], ax=axis)
plt.show()

In [None]:
# How many rows fall in each target class.
df["target"].value_counts()

# Statistical details from the data:


In [None]:
# Summary statistics for the columns of the frame.
df.describe()

# Univariate Analysis:


In [None]:
# Class balance of the target column.
sns.countplot(data=df, x="target", palette="bwr")
plt.show()

In [None]:
# Histogram of age.
# DataFrame.hist draws on a figure it creates itself, so a separate plt.figure(...)
# call beforehand only leaves an empty canvas in the output and its size is never
# applied; the figure size is therefore passed to hist directly.
df[['age']].hist(bins=10, figsize=(8, 6))
plt.show()

In [None]:
# Histogram of resting blood pressure.
# DataFrame.hist draws on a figure it creates itself, so a separate plt.figure(...)
# call beforehand only leaves an empty canvas in the output and its size is never
# applied; the figure size is therefore passed to hist directly.
df[['resting_blood_pressure']].hist(bins=10, figsize=(8, 6))
plt.show()

In [None]:
# Histogram of cholesterol.
# DataFrame.hist draws on a figure it creates itself, so a separate plt.figure(...)
# call beforehand only leaves an empty canvas in the output and its size is never
# applied; the figure size is therefore passed to hist directly.
df[['cholesterol']].hist(bins=10, figsize=(8, 6))
plt.show()

In [None]:
# Histogram of maximum heart rate achieved.
# DataFrame.hist draws on a figure it creates itself, so a separate plt.figure(...)
# call beforehand only leaves an empty canvas in the output and its size is never
# applied; the figure size is therefore passed to hist directly.
df[['max_heart_rate_achieved']].hist(bins=10, figsize=(8, 6))
plt.show()

In [None]:
# Histogram of ST depression.
# DataFrame.hist draws on a figure it creates itself, so a separate plt.figure(...)
# call beforehand only leaves an empty canvas in the output and its size is never
# applied; the figure size is therefore passed to hist directly.
df[['st_depression']].hist(bins=10, figsize=(8, 6))
plt.show()

In [None]:
# Row count per chest-pain type.
fig, axis = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="chest_pain_type", ax=axis)
plt.show()

In [None]:
# Distribution of maximum heart rate achieved, in 10 bins.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept as-is here.
fig, axis = plt.subplots(figsize=(10, 6))
heart_rate = df['max_heart_rate_achieved']
sns.distplot(heart_rate, bins=10, ax=axis)
plt.show()

In [None]:
# Row count per resting-ECG category.
fig, axis = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="rest_ecg", ax=axis)
plt.show()

# Bivariate Analysis:


In [None]:
# Chest-pain type counts, split by target class.
fig, axis = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="chest_pain_type", hue="target", ax=axis)
plt.show()

In [None]:
# Maximum heart rate per target class, one point per row with slight jitter.
fig, axis = plt.subplots(figsize=(8, 6))
sns.stripplot(data=df, x="target", y="max_heart_rate_achieved", jitter=0.01, ax=axis)
plt.show()

In [None]:
# Age per target class, one point per row with slight jitter.
fig, axis = plt.subplots(figsize=(8, 6))
sns.stripplot(data=df, x="target", y="age", jitter=0.01, ax=axis)
plt.show()

In [None]:
# Scatter with fitted regression line: age vs. resting blood pressure.
fig, axis = plt.subplots(figsize=(8, 6))
sns.regplot(data=df, x="age", y="resting_blood_pressure", ax=axis)
plt.show()

In [None]:
# Scatter with fitted regression line: age vs. cholesterol.
fig, axis = plt.subplots(figsize=(8, 6))
sns.regplot(data=df, x="age", y="cholesterol", ax=axis)
plt.show()

In [None]:
# Scatter with fitted regression line: cholesterol vs. maximum heart rate achieved.
fig, axis = plt.subplots(figsize=(8, 6))
sns.regplot(data=df, x="cholesterol", y="max_heart_rate_achieved", ax=axis)
plt.show()

In [None]:
# Bar chart of target-class counts at every age; also written to a PNG file.
age_by_target = pd.crosstab(index=df["age"], columns=df["target"])
age_by_target.plot.bar(figsize=(20, 6))
plt.title('Heart Disease Frequency for Ages')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.savefig('heartDiseaseAndAges.png')
plt.show()

In [None]:
# Counts per sex, split by target class.
fig, axis = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="sex", hue="target", ax=axis)
plt.show()

In [None]:
# Counts per fasting-blood-sugar value, split by target class.
fig, axis = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="fasting_blood_sugar", hue="target", ax=axis)
plt.show()

In [None]:
# Bar chart of target-class counts at every ST-depression value; also written to a PNG file.
st_by_target = pd.crosstab(index=df["st_depression"], columns=df["target"])
st_by_target.plot.bar(figsize=(20, 6))
plt.title('Heart Disease Frequency for ST depression')
plt.xlabel('ST_Depression')
plt.ylabel('Frequency')
plt.savefig('heartDiseaseAndSTdepression.png')
plt.show()

In [None]:
# Bar chart of target-class counts at every resting-blood-pressure value; also written to a PNG file.
bp_by_target = pd.crosstab(index=df["resting_blood_pressure"], columns=df["target"])
bp_by_target.plot.bar(figsize=(20, 6))
plt.title('Heart Disease Frequency for Resting Blood Pressure')
plt.xlabel('Resting Blood Pressure')
plt.ylabel('Frequency')
plt.savefig('heartDiseaseAndRestingbps.png')
plt.show()

# Multivariate Analysis:


In [None]:
# Pairwise scatter plots (histograms on the diagonal) for the selected columns plus the target.
num_var = [
    'age',
    'resting_blood_pressure',
    'cholesterol',
    'max_heart_rate_achieved',
    'st_depression',
    'target',
]
sns.pairplot(data=df.loc[:, num_var], diag_kind='hist', kind='scatter')
plt.show()

# Correlation Matrix:


In [None]:
# Annotated heatmap of the pairwise column correlations.
correlations = df.corr()
plt.figure(figsize=(16, 12))
plt.title('Correlation Heatmap of Heart Disease Dataset')
heat_ax = sns.heatmap(correlations, annot=True, fmt='.2f', square=True, linecolor='white')
# Rotate tick labels so the long column names stay readable.
heat_ax.set_xticklabels(heat_ax.get_xticklabels(), rotation=90)
heat_ax.set_yticklabels(heat_ax.get_yticklabels(), rotation=30)
plt.show()


In [None]:
# Correlation of every column with the target, strongest positive first.
corr_matrix = df.corr()
target_corr = corr_matrix['target']
target_corr.sort_values(ascending=False)

# Data split

In [None]:
# Separate the label from the features, then hold out 20% of the rows for testing.
y = data["target"]
X = data.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.20
)

In [None]:
# Labels present in the test split, then per-class counts in the training split.
test_labels = y_test.unique()
print(test_labels)
Counter(y_train)

# Standardization

In [None]:
# Learn mean/variance on the training split only, then apply the same scaling to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest-
It is an ensemble learning method that operates by constructing a multitude of decision trees at training time and outputs the class that is the mode of the individual trees' predictions (classification) or their mean prediction (regression).


In [None]:
# Random forest: 20 trees, depth capped at 5, fixed seed.
m3 = 'Random Forest Classfier'
rf = RandomForestClassifier(max_depth=5, n_estimators=20, random_state=12)
rf.fit(X_train, y_train)

# Evaluate on the held-out split.
rf_predicted = rf.predict(X_test)
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=rf_predicted)
rf_acc_score = accuracy_score(y_true=y_test, y_pred=rf_predicted)

print("confussion matrix")
print(rf_conf_matrix)
print("\n")
print("Accuracy of Random Forest:", rf_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=rf_predicted))

# Feature importance

In [None]:
# Horizontal bar chart of the random forest's feature importances.
# Labels come from the data itself (all columns except the label, in the same order as
# the feature matrix built with data.drop('target', axis=1)) instead of a hand-typed
# list, so they match the column names used in the rest of the notebook and cannot
# drift out of step with the columns.
feature_names = list(data.columns.drop('target'))
imp_feature = pd.DataFrame({'Feature': feature_names, 'Importance': rf.feature_importances_})
plt.figure(figsize=(10,4))
plt.title("barplot Represent feature importance ")
plt.xlabel("importance ")
plt.ylabel("features")
plt.barh(imp_feature['Feature'],imp_feature['Importance'], color = 'blue')
plt.show()

# Decision Tree-
Decision Trees predict the value of a target variable by learning simple decision rules inferred from the input data features. The decision tree algorithm builds the classification model in the form of a tree structure. It utilizes the if-then rules which are equally exhaustive and mutually exclusive in classification.


In [None]:
# Decision tree: entropy criterion, depth capped at 6, fixed seed.
m6 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(max_depth=6, criterion='entropy', random_state=0)
dt.fit(X_train, y_train)

# Evaluate on the held-out split.
dt_predicted = dt.predict(X_test)
dt_conf_matrix = confusion_matrix(y_true=y_test, y_pred=dt_predicted)
dt_acc_score = accuracy_score(y_true=y_test, y_pred=dt_predicted)

print("confussion matrix")
print(dt_conf_matrix)
print("\n")
print("Accuracy of DecisionTreeClassifier:", dt_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=dt_predicted))

# Feature importance

In [None]:
# Horizontal bar chart of the decision tree's feature importances.
# Labels come from the data itself (all columns except the label, in the same order as
# the feature matrix built with data.drop('target', axis=1)) instead of a hand-typed
# list, so they match the column names used in the rest of the notebook and cannot
# drift out of step with the columns.
feature_names = list(data.columns.drop('target'))
imp_feature = pd.DataFrame({'Feature': feature_names, 'Importance': dt.feature_importances_})
plt.figure(figsize=(10,4))
plt.title("barplot Represent feature importance ")
plt.xlabel("importance ")
plt.ylabel("features")
plt.barh(imp_feature['Feature'],imp_feature['Importance'],color = 'blue')
plt.show()

# Xgboost-
XGBoost is a widely used algorithm that seeks to push the limit of computational resources used in conventional boosted trees. XGBoost is short for “Extreme Gradient Boosting” and is based on the original gradient boosting model.
The key difference between the two is that XGBoost uses a more regularized model formalization to control over-fitting, giving it better performance. XGBoost uses a combination of bagging and boosting, which can reduce variance and bias respectively.

In [None]:
# Gradient-boosted trees (DART booster) with row/column subsampling and a fixed seed.
m4 = 'Extreme Gradient Boost'
xgb = XGBClassifier(
    booster='dart',
    learning_rate=0.1,
    n_estimators=500,
    max_depth=10,
    gamma=0.6,
    reg_lambda=2,
    subsample=0.52,
    colsample_bytree=0.6,
    colsample_bylevel=0.6,
    colsample_bynode=0.5,
    seed=28,
)
xgb.fit(X_train, y_train)

# Evaluate on the held-out split.
xgb_predicted = xgb.predict(X_test)
xgb_conf_matrix = confusion_matrix(y_true=y_test, y_pred=xgb_predicted)
xgb_acc_score = accuracy_score(y_true=y_test, y_pred=xgb_predicted)

print("confussion matrix")
print(xgb_conf_matrix)
print("\n")
print("Accuracy of Extreme Gradient Boost:", xgb_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=xgb_predicted))

# Feature importance

In [None]:
# Horizontal bar chart of the XGBoost model's feature importances.
# Labels come from the data itself (all columns except the label, in the same order as
# the feature matrix built with data.drop('target', axis=1)) instead of a hand-typed
# list, so they match the column names used in the rest of the notebook and cannot
# drift out of step with the columns.
feature_names = list(data.columns.drop('target'))
imp_feature = pd.DataFrame({'Feature': feature_names, 'Importance': xgb.feature_importances_})
plt.figure(figsize=(10,4))
plt.title("barplot Represent feature importance ")
plt.xlabel("importance ")
plt.ylabel("features")
plt.barh(imp_feature['Feature'],imp_feature['Importance'],color = 'blue')
plt.show()

# Logistic Regression-
It is a classification algorithm in machine learning that uses one or more independent variables to determine an outcome. The outcome is measured with a dichotomous variable meaning it will have only two possible outcomes.


In [None]:
# Logistic regression with default settings.
m1 = 'Logistic Regression'
lr = LogisticRegression()
model = lr.fit(X_train, y_train)

# Evaluate on the held-out split.
lr_predict = lr.predict(X_test)
lr_conf_matrix = confusion_matrix(y_true=y_test, y_pred=lr_predict)
lr_acc_score = accuracy_score(y_true=y_test, y_pred=lr_predict)

print("confussion matrix")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Logistic Regression:", lr_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=lr_predict))

# Naive Bayes-
It is a classification algorithm based on Bayes’s theorem which gives an assumption of independence among predictors. In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature.


In [None]:
# Gaussian naive Bayes with default settings.
m2 = 'Naive Bayes'
nb = GaussianNB()
nb.fit(X_train, y_train)

# Evaluate on the held-out split.
nbpred = nb.predict(X_test)
nb_conf_matrix = confusion_matrix(y_true=y_test, y_pred=nbpred)
nb_acc_score = accuracy_score(y_true=y_test, y_pred=nbpred)

print("confussion matrix")
print(nb_conf_matrix)
print("\n")
print("Accuracy of Naive Bayes model:", nb_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=nbpred))

# KNN-
It is a lazy learning algorithm that stores all instances corresponding to training data in n-dimensional space. It is a lazy learning algorithm as it does not focus on constructing a general internal model, instead, it works on storing instances of training data.

In [None]:
# k-nearest neighbours with k = 10.
m5 = 'K-NeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Evaluate on the held-out split.
knn_predicted = knn.predict(X_test)
knn_conf_matrix = confusion_matrix(y_true=y_test, y_pred=knn_predicted)
knn_acc_score = accuracy_score(y_true=y_test, y_pred=knn_predicted)

print("confussion matrix")
print(knn_conf_matrix)
print("\n")
print("Accuracy of K-NeighborsClassifier:", knn_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=knn_predicted))

# SVM-
It is a supervised learning algorithm that finds the hyperplane separating the classes with the largest possible margin. With a kernel function (such as the RBF kernel used below), it can also learn non-linear decision boundaries by implicitly mapping the data into a higher-dimensional space.


In [None]:
from sklearn.svm import SVC

# Support vector classifier: RBF kernel, C = 2.
m7 = 'Support Vector Classifier'
svc = SVC(C=2, kernel='rbf')
svc.fit(X_train, y_train)

# Evaluate on the held-out split.
svc_predicted = svc.predict(X_test)
svc_conf_matrix = confusion_matrix(y_true=y_test, y_pred=svc_predicted)
svc_acc_score = accuracy_score(y_true=y_test, y_pred=svc_predicted)

print("confussion matrix")
print(svc_conf_matrix)
print("\n")
print("Accuracy of Support Vector Classifier:", svc_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=svc_predicted))

# Model Comparison

In [None]:
# Table of test-set accuracy (in percent) for every individual model trained above.
model_names = [
    'Logistic Regression',
    'Naive Bayes',
    'Random Forest',
    'Extreme Gradient Boost',
    'K-Nearest Neighbour',
    'Decision Tree',
    'Support Vector Machine',
]
# Scores are listed in the same order as the names.
accuracy_pct = [
    score*100
    for score in (lr_acc_score, nb_acc_score, rf_acc_score, xgb_acc_score,
                  knn_acc_score, dt_acc_score, svc_acc_score)
]
model_ev = pd.DataFrame({'Model': model_names, 'Accuracy': accuracy_pct})
model_ev

In [None]:
from mlxtend.classifier import StackingCVClassifier

# Ensemble technique-Model Stacking-
Stacking, also known as stacked generalization, is an ensemble method where the models are combined using another machine learning algorithm. The basic idea is to train machine learning algorithms with training dataset and then generate a new dataset with these models. Then this new dataset is used as input for the combiner machine learning algorithm.

# Random forest + KNN + SVM


In [None]:
# Stack random forest, KNN and SVC; a KNN combines their outputs.
base_learners = [rf, knn, svc]
scv = StackingCVClassifier(classifiers=base_learners, meta_classifier=knn, random_state=42)
scv.fit(X_train, y_train)

# Evaluate on the held-out split.
scv_predicted = scv.predict(X_test)
scv_conf_matrix = confusion_matrix(y_true=y_test, y_pred=scv_predicted)
scv_acc_score = accuracy_score(y_true=y_test, y_pred=scv_predicted)

print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:", scv_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=scv_predicted))

# Xgboost + KNN + SVM

In [None]:
# Stack XGBoost, KNN and SVC; an XGBoost model combines their outputs.
base_learners = [xgb, knn, svc]
scv = StackingCVClassifier(classifiers=base_learners, meta_classifier=xgb, random_state=42)
scv.fit(X_train, y_train)

# Evaluate on the held-out split.
scv_predicted = scv.predict(X_test)
scv_conf_matrix = confusion_matrix(y_true=y_test, y_pred=scv_predicted)
scv_acc_score = accuracy_score(y_true=y_test, y_pred=scv_predicted)

print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:", scv_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=scv_predicted))

# Random forest + XGB + SVM

In [None]:
# Stack random forest, XGBoost and SVC; a KNN combines their outputs.
base_learners = [rf, xgb, svc]
scv = StackingCVClassifier(classifiers=base_learners, meta_classifier=knn, random_state=42)
scv.fit(X_train, y_train)

# Evaluate on the held-out split.
scv_predicted = scv.predict(X_test)
scv_conf_matrix = confusion_matrix(y_true=y_test, y_pred=scv_predicted)
scv_acc_score = accuracy_score(y_true=y_test, y_pred=scv_predicted)

print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:", scv_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=scv_predicted))

# Xgboost + KNN + SVM

In [None]:
# Stack XGBoost, KNN and SVC; an XGBoost model combines their outputs.
# NOTE(review): same configuration as the earlier "Xgboost + KNN + SVM" cell.
base_learners = [xgb, knn, svc]
scv = StackingCVClassifier(classifiers=base_learners, meta_classifier=xgb, random_state=42)
scv.fit(X_train, y_train)

# Evaluate on the held-out split.
scv_predicted = scv.predict(X_test)
scv_conf_matrix = confusion_matrix(y_true=y_test, y_pred=scv_predicted)
scv_acc_score = accuracy_score(y_true=y_test, y_pred=scv_predicted)

print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:", scv_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=scv_predicted))

# Random forest + KNN + Xgboost

In [None]:
# Stack random forest, KNN and XGBoost; an XGBoost model combines their outputs.
base_learners = [rf, knn, xgb]
scv = StackingCVClassifier(classifiers=base_learners, meta_classifier=xgb, random_state=42)
scv.fit(X_train, y_train)

# Evaluate on the held-out split.
scv_predicted = scv.predict(X_test)
scv_conf_matrix = confusion_matrix(y_true=y_test, y_pred=scv_predicted)
scv_acc_score = accuracy_score(y_true=y_test, y_pred=scv_predicted)

print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:", scv_acc_score*100, '\n')
print(classification_report(y_true=y_test, y_pred=scv_predicted))

# GridSearchCV

In [None]:
# Baseline before tuning: XGBoost with default hyper-parameters.
model = XGBClassifier()
model.fit(X_train, y_train)

# Report precision/recall/F1 on the held-out split.
predictions = model.predict(X_test)
baseline_report = classification_report(y_true=y_test, y_pred=predictions)
print(baseline_report)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the XGBoost classifier being tuned.
# The previous grid ('C', 'gamma', 'kernel') described an SVC; 'C' and 'kernel' are not
# XGBoost parameters, so most of the search was spent on values the model never used.
param_grid = {'learning_rate': [0.01, 0.1, 0.3],
              'max_depth': [3, 5, 7],
              'n_estimators': [100, 300],
              'gamma': [0, 0.1, 1]}

# refit=True retrains the best configuration on the whole training split so that
# `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(XGBClassifier(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

In [None]:
# Winning hyper-parameter combination found by the search.
best_params = grid.best_params_
print(best_params)

# The estimator refitted with that combination.
tuned_model = grid.best_estimator_
print(tuned_model)

In [None]:
# Predict the held-out split with the tuned model and report precision/recall/F1.
grid_predictions = grid.predict(X_test)
tuned_report = classification_report(y_true=y_test, y_pred=grid_predictions)
print(tuned_report)

# ROC curve

In [None]:
import plotly.express as px
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

# Fit on the training split and score the held-out test split, so the plots describe
# the heart-disease model. Previously this cell trained and scored on a synthetic
# make_classification sample and overwrote the notebook's X, y and df.
model = XGBClassifier()
model.fit(X_train, y_train)
y_true = np.asarray(y_test)  # plain array so it lines up positionally with y_score
y_score = model.predict_proba(X_test)[:, 1]  # probability of the positive class
fpr, tpr, thresholds = roc_curve(y_true, y_score)

# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=y_score, color=y_true, nbins=50,
    labels=dict(color='True Labels', x='Score')
)

fig_hist.show()


# Evaluating model performance at various thresholds
roc_df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
roc_df.index.name = "Thresholds"
roc_df.columns.name = "Rate"

fig_thresh = px.line(
    roc_df, title='TPR and FPR at every threshold',
    width=700, height=500
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

# PR curve for Xgboost

In [None]:
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc
from sklearn.datasets import make_classification

# Fit on the training split and score the held-out test split, so the curve describes
# the heart-disease model rather than a synthetic make_classification sample scored
# on its own training data.
model = XGBClassifier()
model.fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]  # probability of the positive class

precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# The AUC in the title is the area under this precision-recall curve
# (recall on x, precision on y), not the ROC AUC from fpr/tpr.
fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc(recall, precision):.4f})',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.show()

# PR curve for SVM

# PR curve for Random Forest

In [None]:
# Fit on the training split and score the held-out test split, so the curve describes
# the heart-disease model rather than a synthetic make_classification sample scored
# on its own training data.
model = RandomForestClassifier(n_estimators=20, random_state=12,max_depth=5)
model.fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]  # probability of the positive class

precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# The AUC in the title is the area under this precision-recall curve
# (recall on x, precision on y), not the ROC AUC from fpr/tpr.
fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc(recall, precision):.4f})',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.show()

# PR curve for KNN

In [None]:
# Fit on the training split and score the held-out test split, so the curve describes
# the heart-disease model rather than a synthetic make_classification sample scored
# on its own training data.
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]  # probability of the positive class

precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# The AUC in the title is the area under this precision-recall curve
# (recall on x, precision on y), not the ROC AUC from fpr/tpr.
fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc(recall, precision):.4f})',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.show()

# PR curve for Stacking- Random forest, KNN, SVM

In [None]:
# Fit on the training split and score the held-out test split, so the curve describes
# the heart-disease model rather than a synthetic make_classification sample scored
# on its own training data.
model = StackingCVClassifier(classifiers=[rf,knn,svc],meta_classifier= xgb,random_state=42)
model.fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]  # probability of the positive class

precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# The AUC in the title is the area under this precision-recall curve
# (recall on x, precision on y), not the ROC AUC from fpr/tpr.
fig = px.area(
    x=recall, y=precision,
    title=f'Precision-Recall Curve (AUC={auc(recall, precision):.4f})',
    labels=dict(x='Recall', y='Precision'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')

fig.show()