In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## DATASET COLUMNS 
1. Age (age in years)
2. Sex (1 = male; 0 = female)
3. CP (chest pain type)
4. TRESTBPS (resting blood pressure (in mm Hg on admission to the hospital))
5. CHOL (serum cholesterol in mg/dl)
6. FBS (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. RESTECG (resting electrocardiographic results)
8. THALACH (maximum heart rate achieved)
9. EXANG (exercise induced angina (1 = yes; 0 = no))
10. OLDPEAK (ST depression induced by exercise relative to rest)
11. SLOPE (the slope of the peak exercise ST segment)
12. CA (number of major vessels (0-3) colored by fluoroscopy)
13. THAL (3 = normal; 6 = fixed defect; 7 = reversible defect)
14. TARGET (1 or 0)

# Import necessary Python modules and Read the data

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame;
# every later cell reads from this `Heart_Disease` frame.
Heart_Disease =pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')

In [None]:
# Preview the first rows of the loaded table.
Heart_Disease.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the table's columns.
Heart_Disease.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
Heart_Disease.info()

In [None]:
# List the column labels of the table.
Heart_Disease.columns

In [None]:
# (number of rows, number of columns)
Heart_Disease.shape 

In [None]:
# Number of missing values in each column.
Heart_Disease.isnull().sum()

In [None]:
# Single boolean: True if at least one cell anywhere in the table is missing.
Heart_Disease.isnull().values.any()

In [None]:
# Checking for outliers: draw one box per column of the table.
# The whole DataFrame has to be passed as `data=` (wide-form input);
# `x=` expects a single vector/column, not an entire frame.
sns.boxplot(data=Heart_Disease)

In [None]:
# Outlier screening with the Z-score.
# Rule of thumb: a value whose Z-score is above 3 or below -3 is treated as an outlier,
# so the absolute Z-score of every cell is computed and printed.
from scipy import stats
import numpy as np
z_scores = stats.zscore(Heart_Disease)
z = np.absolute(z_scores)
print(z)

In [None]:
# Annotated heatmap of the pairwise column correlations (one decimal per cell).
corr_matrix = Heart_Disease.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.1f', ax=ax)
plt.show()

In [None]:
# Grid of pairwise plots over the columns of the table.
sns.pairplot(Heart_Disease)
plt.show()

In [None]:
# Bar chart of the ten most frequent ages (value_counts sorts by frequency, descending).
top_age_counts = Heart_Disease.age.value_counts()[:10]
ax = sns.barplot(x=top_age_counts.index, y=top_age_counts.values)
ax.set_xlabel('Age')
ax.set_ylabel('Age Counter')
ax.set_title('Age Analysis')
plt.show()

In [None]:
# Basic age statistics. The mean was previously computed but never shown;
# it is now printed alongside the minimum and maximum.
minAge = Heart_Disease.age.min()
maxAge = Heart_Disease.age.max()
meanAge = Heart_Disease.age.mean()
print('Min Age :', minAge)
print('Max Age :', maxAge)
print('Mean Age :', meanAge)

In [None]:
# Split the patients into three age bands:
#   young   : 29 <= age < 40
#   middle  : 40 <= age < 55
#   elderly : age >= 55
# The elderly band uses >= so that 55-year-olds are not silently dropped
# (the middle band ends at < 55, so a strict > 55 left age 55 in no group).
young_ages = Heart_Disease[(Heart_Disease.age >= 29) & (Heart_Disease.age < 40)]
middle_ages = Heart_Disease[(Heart_Disease.age >= 40) & (Heart_Disease.age < 55)]
elderly_ages = Heart_Disease[Heart_Disease.age >= 55]
print('Young Ages :', len(young_ages))
print('Middle Ages :', len(middle_ages))
print('Elderly Ages :', len(elderly_ages))


In [None]:
# Bar chart with the number of patients in each age band.
age_band_labels = ['young ages', 'middle ages', 'elderly ages']
age_band_sizes = [len(band) for band in (young_ages, middle_ages, elderly_ages)]
ax = sns.barplot(x=age_band_labels, y=age_band_sizes)
ax.set_xlabel('Age Range')
ax.set_ylabel('Age Counts')
ax.set_title('Ages State in Dataset')
plt.show()

In [None]:
# Pie chart of the share of each age band; the elderly slice is pulled out slightly.
colors = ['blue', 'green', 'yellow']
explode = [0, 0, 0.1]
band_labels = ['young ages', 'middle ages', 'elderly ages']
band_sizes = [len(young_ages), len(middle_ages), len(elderly_ages)]

plt.figure(figsize=(5, 5))
plt.pie(
    band_sizes,
    labels=band_labels,
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
)
plt.title('Age States', color='blue', fontsize=15)
plt.show()

In [None]:
# Sex (1 = male; 0 = female)
# The column is passed explicitly as `x=`: seaborn's first positional parameter is
# `data`, so handing the Series over positionally is deprecated or an error depending
# on the installed version.
sns.countplot(x=Heart_Disease.sex)
plt.show()

In [None]:
# Number of patients per chest-pain type.
# The column is passed explicitly as `x=` because seaborn's first positional
# parameter is `data`, not the variable to count.
sns.countplot(x=Heart_Disease.cp)
plt.xlabel('Chest Type')
plt.ylabel('Count')
plt.title('Chest Type vs Count State')
plt.show()
#0 status at least
#1 condition slightly distressed
#2 condition medium problem
#3 condition too bad

In [None]:
# Show the results of a linear regression within each dataset:
# scatter of `chol` against `trestbps` with one fitted line per chest-pain type (`hue="cp"`).
sns.lmplot(x="trestbps", y="chol",data=Heart_Disease,hue="cp")
plt.show()

In [None]:
# Bar chart of the twenty most frequent `thalach` values.
top_thalach_counts = Heart_Disease.thalach.value_counts()[:20]
ax = sns.barplot(x=top_thalach_counts.index, y=top_thalach_counts.values)
ax.set_xlabel('Thalach')
ax.set_ylabel('Count')
ax.set_title('Thalach Counts')
# Tilt the tick labels so the many distinct values stay readable.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of patients per `thal` category.
# Passed as `x=` because seaborn's first positional parameter is `data`.
sns.countplot(x=Heart_Disease.thal)
plt.show()

In [None]:
#Let's see the correlation values between them
# (the numeric table behind the heatmap drawn earlier with Heart_Disease.corr()).
Heart_Disease.corr()

# Splitting Data into train and test with 80% and 20% respectively 


In [None]:
# Feature matrix: every column except the label ('target') and 'slope'.
X=Heart_Disease.drop(['target','slope'],axis=1)
#removing 'slope' to reduce the strong negative multicollinearity between 'slope' and 'oldpeak'
# Label vector: the 'target' column.
Y=Heart_Disease['target']

# All Classification Algorithms with Default Parameters


In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
# Fit a statsmodels logistic regression on the training rows and print its coefficient table.
# NOTE(review): X_train is passed as-is, with no constant column added here, so the model
# appears to be fitted without an intercept - confirm whether sm.add_constant is wanted.
logit = sm.Logit(y_train, X_train).fit()
print(logit.summary())
# attributes with p value less than 0.05 are statistically significant

In [None]:
# Drop five predictors from the feature matrix before refitting the model.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are already
# gone is a no-op instead of a KeyError.
X = X.drop(columns=['restecg', 'fbs', 'chol', 'trestbps', 'age'], errors='ignore')

In [None]:
# Re-split with the reduced feature matrix (same 20% hold-out, same seed) and refit the Logit model.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
logit_spec = sm.Logit(y_train, X_train)
logit = logit_spec.fit()
print(logit.summary())

In [None]:
# Predicted probabilities for the test rows, then rounded to hard 0/1 labels.
y_pred = logit.predict(X_test)
prediction = [round(probability) for probability in y_pred]

In [None]:
# Confusion matrix, accuracy and per-class report for the statsmodels Logit predictions.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

cm1 = confusion_matrix(y_test, prediction)
print('Confusion Matrix : ')
print(cm1)

logit_accuracy = accuracy_score(y_test, prediction)
print("Accuracy Score : ", logit_accuracy)

print('Report : ')
print(classification_report(y_test, prediction))

In [None]:
#Sensitivity and Specificity
#(A test with a sensitivity and specificity of around 90% would be considered to have good diagnostic performance)
#
# sklearn's confusion matrix has true labels on the rows and predictions on the columns,
# with classes in sorted order, so for 0/1 labels it is laid out as
#     [[TN, FP],
#      [FN, TP]]
# Sensitivity (true-positive rate) therefore comes from row 1 and specificity
# (true-negative rate) from row 0; the two were previously swapped.
tn, fp, fn, tp = cm1.ravel()

sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity )

specificity = tn / (tn + fp)
print('Specificity : ', specificity)

In [None]:
from sklearn.model_selection import train_test_split
# Same split as before (20% test, random_state=42) on the current X and Y; the models below
# are trained and evaluated on these four variables.
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

# 1. Logistic Regression Algorithm


In [None]:
import warnings
# Suppress every warning from this point on in the kernel (including convergence warnings).
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
# scikit-learn logistic regression with default hyper-parameters, fitted on the training split.
model = LogisticRegression()
model.fit(X_train,y_train)

In [None]:
# Hold-out metrics for logistic regression, all expressed in percent.
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_curve, auc

y_pred = model.predict(X_test)
acclog = accuracy_score(y_test, y_pred)*100
reclog = recall_score(y_test, y_pred)*100
preclog = precision_score(y_test, y_pred)*100

# The ROC curve must be built from continuous scores. Feeding it hard 0/1 predictions
# collapses the curve to a single operating point and understates the AUC, so the
# predicted probability of the positive class (column 1) is used instead.
y_score_log = model.predict_proba(X_test)[:, 1]
fprlog, tprlog, _ = roc_curve(y_test, y_score_log)
auclog=auc(fprlog, tprlog)*100

In [None]:
# Confusion matrix, accuracy and per-class report for the scikit-learn logistic regression.
from sklearn.metrics import accuracy_score, confusion_matrix

cm1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :' )
print(cm1)

logreg_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: ", logreg_accuracy)

print('Report : ')
print(classification_report(y_test, y_pred))

In [None]:
#Sensitivity and Specificity
#(A test with a sensitivity and specificity of around 90% would be considered to have good diagnostic performance)
#
# sklearn's confusion matrix has true labels on the rows and predictions on the columns,
# with classes in sorted order, so for 0/1 labels it is laid out as
#     [[TN, FP],
#      [FN, TP]]
# Sensitivity (true-positive rate) therefore comes from row 1 and specificity
# (true-negative rate) from row 0; the two were previously swapped.
tn, fp, fn, tp = cm1.ravel()

sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity )

specificity = tn / (tn + fp)
print('Specificity : ', specificity)

In [None]:
# Class-probability estimates of the logistic regression for the test rows (one column per class).
yl = model.predict_proba(X_test)

# 2. K Nearest Neighbor Algorithm


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    roc_curve,
    auc,
    confusion_matrix,
    classification_report,
)

# Feature Scaling
# The scaler is fitted on the training rows only and then applied to the test rows.
# NOTE: X_train / X_test are overwritten with the standardized arrays; the SVM and
# Naive Bayes cells further down train on these scaled versions.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting K-NN to the Training set
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
knn.fit(X_train, y_train)

# Predicting the Test set results: hard labels and class probabilities
y_pred = knn.predict(X_test)
yk = knn.predict_proba(X_test)

# Hold-out metrics in percent
accknn = accuracy_score(y_test, y_pred)*100
recknn = recall_score(y_test, y_pred)*100
precknn = precision_score(y_test, y_pred)*100

# ROC/AUC are computed from the positive-class probability, not from the hard labels,
# which would reduce the curve to a single operating point.
fprknn, tprknn, _ = roc_curve(y_test, yk[:, 1])
aucknn=auc(fprknn, tprknn)*100

# Making the Confusion Matrix (computed once and reused for the printed report)
cm2 = confusion_matrix(y_test, y_pred)
results = cm2

print ('Confusion Matrix :')
print(results)
print ('Accuracy Score :',accuracy_score(y_test, y_pred))
print ('Report : ')
print (classification_report(y_test, y_pred))

In [None]:
# Sensitivity and specificity of the K-NN classifier.
# sklearn's confusion matrix is laid out as [[TN, FP], [FN, TP]] for 0/1 labels
# (rows = true class, columns = predicted class), so sensitivity comes from row 1
# and specificity from row 0; the two were previously swapped.
tn, fp, fn, tp = cm2.ravel()

sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity )

specificity = tn / (tn + fp)
print('Specificity : ', specificity)

# 3.Support Vector Machine Algorithm


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    roc_curve,
    auc,
    confusion_matrix,
    classification_report,
)

# Fitting Kernel SVM to the Training set
# probability=True is required so that predict_proba is available below.
svm = SVC(kernel = 'rbf', random_state = 0, probability=True)
svm.fit(X_train, y_train)

# Predicting the Test set results: hard labels and class probabilities
y_pred = svm.predict(X_test)
ys = svm.predict_proba(X_test)

# Hold-out metrics in percent
accsvm = accuracy_score(y_test, y_pred)*100
recsvm = recall_score(y_test, y_pred)*100
precsvm = precision_score(y_test, y_pred)*100

# ROC/AUC are computed from the positive-class probability, not from the hard labels,
# which would reduce the curve to a single operating point.
fprsvm, tprsvm, _ = roc_curve(y_test, ys[:, 1])
aucsvm=auc(fprsvm, tprsvm)*100

# Making the Confusion Matrix (computed once and reused for the printed report)
cm3 = confusion_matrix(y_test, y_pred)
results = cm3

print ('Confusion Matrix :')
print(results)
print ('Accuracy Score :',accuracy_score(y_test, y_pred))
print ('Report : ')
print (classification_report(y_test, y_pred))

In [None]:
# Sensitivity and specificity of the SVM classifier.
# sklearn's confusion matrix is laid out as [[TN, FP], [FN, TP]] for 0/1 labels
# (rows = true class, columns = predicted class), so sensitivity comes from row 1
# and specificity from row 0; the two were previously swapped.
tn, fp, fn, tp = cm3.ravel()

sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity )

specificity = tn / (tn + fp)
print('Specificity : ', specificity)

# 4. Gaussian Naive Bayes Algorithm


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    roc_curve,
    auc,
    confusion_matrix,
    classification_report,
)

# Fitting Gaussian Naive Bayes to the Training set
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predicting the Test set results: hard labels and class probabilities
y_pred = gnb.predict(X_test)
yg = gnb.predict_proba(X_test)

# Hold-out metrics in percent
accgnb = accuracy_score(y_test, y_pred)*100
recgnb = recall_score(y_test, y_pred)*100
precgnb = precision_score(y_test, y_pred)*100

# ROC/AUC are computed from the positive-class probability, not from the hard labels,
# which would reduce the curve to a single operating point.
fprgnb, tprgnb, _ = roc_curve(y_test, yg[:, 1])
aucgnb=auc(fprgnb, tprgnb)*100

# Making the Confusion Matrix (computed once and reused for the printed report)
cm4 = confusion_matrix(y_test, y_pred)
results = cm4

print ('Confusion Matrix :')
print(results)
print ('Accuracy Score :',accuracy_score(y_test, y_pred))
print ('Report : ')
print (classification_report(y_test, y_pred))

In [None]:
# Sensitivity and specificity of the Gaussian Naive Bayes classifier.
# sklearn's confusion matrix is laid out as [[TN, FP], [FN, TP]] for 0/1 labels
# (rows = true class, columns = predicted class), so sensitivity comes from row 1
# and specificity from row 0; the two were previously swapped.
tn, fp, fn, tp = cm4.ravel()

sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity )

specificity = tn / (tn + fp)
print('Specificity : ', specificity)

# Comparison of all the Machine Learning Algorithms by Comparing some Evaluation Metrics


In [None]:
# Side-by-side table of the hold-out metrics (all in percent) for the four classifiers.
algos=["Logistic Regression","K Nearest Neighbor","Support Vector Machine","Gaussian Naive Bayes"]
acc=[acclog,accknn,accsvm,accgnb]
# Named auc_scores rather than `auc`: binding a list to `auc` would shadow the
# sklearn.metrics.auc function imported by the model cells, so that re-running any of
# them without its import line would fail with "'list' object is not callable".
auc_scores=[auclog,aucknn,aucsvm,aucgnb]
recall=[reclog,recknn,recsvm,recgnb]
prec=[preclog,precknn,precsvm,precgnb]
comp={"Algorithms":algos,"Accuracies":acc,"Area Under the Curve":auc_scores,"Recall":recall,"Precision":prec}
compdf=pd.DataFrame(comp)
display(compdf)

# ROC of all the Machine Learning Algorithms on default parameters

In [None]:
import sklearn.metrics as metrics
roc_auc1=metrics.auc(fprlog,tprlog)
roc_auc2=metrics.auc(fprknn,tprknn)
roc_auc3=metrics.auc(fprsvm,tprsvm)
roc_auc4=metrics.auc(fprgnb,tprgnb)

import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(20,10))
plt.title("Performance Comparison of Classification Models (ROC Curve)", fontsize=25)
plt.plot(fprlog,tprlog,"b",label="AUC of Logistic Regression = %0.2f" % roc_auc1)
plt.plot(fprknn,tprknn,"g",label="AUC of K Nearest Neighbor = %0.2f" % roc_auc2)
plt.plot(fprsvm,tprsvm,"m",label="AUC of Support Vector Machine = %0.2f" % roc_auc3)
plt.plot(fprgnb,tprgnb,"c",label="AUC of Gaussian Naive Bayes = %0.2f" % roc_auc4)
plt.rcParams.update({'font.size': 16})
plt.legend(loc="lower right")
plt.plot([0, 1],[0, 1],"r--")
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel("True Positive Rate", fontsize = 18)
plt.xlabel("False Positive Rate", fontsize = 18)

plt.rc('axes', labelsize=15)
plt.rc('axes', titlesize=22)

# Cross Validation Score 

In [None]:
from sklearn.model_selection import cross_val_score
import warnings
# Suppress all warnings for the cross-validation runs below.
warnings.filterwarnings("ignore")

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 10-fold cross-validated accuracy of each classifier on the full feature table X.
#
# K-NN, SVM and Naive Bayes were trained on standardized features in their hold-out
# sections, so they are cross-validated inside a StandardScaler pipeline: the scaler is
# refitted on the training part of every fold (no leakage) and the numbers stay comparable
# with the hold-out accuracies. Logistic regression was fitted on the unscaled features and
# is therefore evaluated on X as-is. cross_val_score clones the estimators, so the already
# fitted models are not modified.
accuracy_log1 = cross_val_score(model, X, Y, scoring='accuracy', cv = 10)
print("Accuracy of LOG with Cross Validation is:",accuracy_log1.mean() * 100)

accuracy_svc = cross_val_score(make_pipeline(StandardScaler(), svm), X, Y, scoring='accuracy', cv = 10)
print("Accuracy of SVC with Cross Validation is:",accuracy_svc.mean() * 100)

accuracy_gnb = cross_val_score(make_pipeline(StandardScaler(), gnb), X, Y, scoring='accuracy', cv = 10)
print("Accuracy of GNB with Cross Validation is:",accuracy_gnb.mean() * 100)

accuracy_knn = cross_val_score(make_pipeline(StandardScaler(), knn), X, Y, scoring='accuracy', cv = 10)
print("Accuracy of KNN with Cross Validation is:",accuracy_knn.mean() * 100)

In [None]:
# Table comparing each model's single hold-out accuracy with its mean 10-fold CV accuracy (percent).
algos = [
    "Logistic Regression",
    "K Nearest Neighbor",
    "Support Vector Machine",
    "Gaussian Naive Bayes",
]
acc1 = [acclog, accknn, accsvm, accgnb]
# Same model order as `algos`.
fold_scores_per_model = (accuracy_log1, accuracy_knn, accuracy_svc, accuracy_gnb)
acc2 = [fold_scores.mean() * 100 for fold_scores in fold_scores_per_model]
comp = {
    "Algorithms": algos,
    "Accuracies without Cross Validation": acc1,
    "Accuracies with Cross Validation": acc2,
}
compdf = pd.DataFrame(comp)
display(compdf)