In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score,roc_curve,confusion_matrix,accuracy_score

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-disease-uci/heart.csv',
)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

**The dataset contains:**
1. age(in years)
2. sex: (1 = male; 0 = female)
3. cp: chest pain type
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholesterol in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak: ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
12. ca: number of major vessels (0-3) colored by fluoroscopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
14. target: 1 or 0 

In [None]:
# Summary statistics for each column of the frame.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

In [None]:
# Bar chart of how many records fall into each target class, followed by the exact counts.
plt.figure(figsize=(7,5))
# The column must be passed as `x=`: in seaborn >= 0.12 the first positional
# slot of countplot is `data`, so a positional column name raises TypeError.
sns.countplot(x='target',data=df,palette='viridis')
df['target'].value_counts()

# **EDA**

In [None]:
# Record count per age value, split by target class.
plt.figure(figsize=(14,6))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
sns.countplot(x='age',hue='target',data=df,palette='viridis')

In [None]:
# Annotated heatmap of the pairwise column correlations of df.
plt.figure(figsize=(14, 6))
sns.heatmap(
    df.corr(),
    annot=True,
    linewidths=0.01,
    linecolor='black',
)

In [None]:
# Left: count of each 'sex' value split by target. Right: share of each 'sex' value.
fig, axis=plt.subplots(1,2,figsize=(14,6))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='sex',hue='target',data=df,palette='viridis',ax=axis[0])
_=df['sex'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%')

In [None]:
# Left: count of each 'cp' (chest pain type) value split by target. Right: share of each value.
fig, axis=plt.subplots(1,2,figsize=(14,6))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='cp',hue='target',data=df,palette='viridis',ax=axis[0])
_=df['cp'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%')

In [None]:
# Left: count of each 'fbs' (fasting blood sugar flag) value split by target. Right: share of each value.
fig, axis=plt.subplots(1,2,figsize=(14,6))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='fbs',hue='target',data=df,palette='viridis',ax=axis[0])
_=df['fbs'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%')

In [None]:
# Left: count of each 'restecg' value split by target. Right: share of each value.
# Offsets that pull the pie slices apart; pandas/matplotlib expect one entry per slice.
explode=[0.03,0.03,0.03]
fig, axis=plt.subplots(1,2,figsize=(14,7))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='restecg',hue='target',data=df,palette='viridis',ax=axis[0])
_=df['restecg'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%',explode=explode)

In [None]:
# Left: count of each 'exang' (exercise induced angina) value split by target. Right: share of each value.
fig, axis=plt.subplots(1,2,figsize=(14,7))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='exang',hue='target',data=df,palette='viridis',ax=axis[0])
_=df['exang'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%')

In [None]:
# Left: count of each 'slope' value split by target. Right: share of each value.
fig, axis=plt.subplots(1,2,figsize=(14,7))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='slope',hue='target',data=df,palette='viridis',ax=axis[0])
# Reuses the `explode` offsets assigned in the restecg cell; it needs one entry per pie slice.
_=df['slope'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%',explode=explode)

In [None]:
# Left: count of each 'ca' (number of major vessels) value split by target. Right: share of each value.
fig, axis=plt.subplots(1,2,figsize=(14,7))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='ca',hue='target',data=df,palette='viridis',ax=axis[0])
_=df['ca'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%')

In [None]:
# Left: count of each 'thal' value split by target. Right: share of each value.
fig, axis=plt.subplots(1,2,figsize=(14,7))
# `x=` keyword is required: seaborn >= 0.12 reserves the first positional slot for `data`.
_=sns.countplot(x='thal',hue='target',data=df,palette='viridis',ax=axis[0])
_=df['thal'].value_counts().plot.pie(ax=axis[1],autopct='%.2f%%')

In [None]:
# Age vs. maximum heart rate, drawn as one scatter series per target value
# (target == 0 first, then target == 1, so the default colours follow that order).
plt.figure(figsize=(10, 6))
plt.scatter(df.loc[df['target'] == 0, 'age'], df.loc[df['target'] == 0, 'thalach'])
plt.scatter(df.loc[df['target'] == 1, 'age'], df.loc[df['target'] == 1, 'thalach'])

In [None]:
# Cholesterol vs. maximum heart rate, coloured by target.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='chol',
    y='thalach',
    hue='target',
)

In [None]:
# Resting blood pressure vs. maximum heart rate, coloured by target.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='trestbps',
    y='thalach',
    hue='target',
)

# Creating Dummy variables

In [None]:
# One-hot encode the categorical columns; all other columns pass through unchanged.
data = pd.get_dummies(
    df,
    columns=[
        'sex', 'cp', 'fbs', 'restecg',
        'exang', 'slope', 'ca', 'thal',
    ],
)

# Scaling the data

In [None]:
# Rescale every column of `data` with MinMaxScaler and keep the column names.
# NOTE(review): the scaler is fitted on the full frame (including 'target')
# before the train/test split below, so test-set min/max values influence the
# training features. Consider fitting on the training rows only.
scaler = MinMaxScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

In [None]:
# Separate the label column from the feature columns.
y = data_scaled['target']
X = data_scaled.drop(columns='target')

# Splitting the data

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# K-Nearest Neighbors 

Applying GridSearch for hyperparameter tuning

In [None]:
# Base KNN estimator and the hyperparameter grid searched for it.
knn=KNeighborsClassifier()
# `leaf_size` is intentionally not searched: it only tunes the speed/memory of
# the BallTree/KDTree lookup, not which neighbours are found, so including it
# multiplied the number of fits by 20 without changing any score.
params={'n_neighbors':range(1,21),'weights':['uniform','distance'],'p':[1,2,3,4,5,6,7,8,9,10]}

In [None]:
# Cross-validated grid search over `params` for the KNN estimator, using all CPU cores.
gs_knn = GridSearchCV(
    estimator=knn,
    param_grid=params,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split, then display the winning parameter combination.
gs_knn.fit(X=X_train, y=y_train)
gs_knn.best_params_

In [None]:
# Class predictions of the tuned KNN search object on the held-out split.
prediction = gs_knn.predict(X=X_test)

In [None]:
# Confusion matrix and accuracy of the KNN predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
acc_knn = accuracy_score(y_true=y_test, y_pred=prediction)
print(acc_knn)

In [None]:
# Heatmap of the KNN confusion matrix `cm`.
class_names = [0,1]
fig,ax = plt.subplots()

# The class labels go on the DataFrame so heatmap uses them as tick labels;
# ticks set on the axes beforehand are overwritten by sns.heatmap.
sns.heatmap(pd.DataFrame(cm,index=class_names,columns=class_names),annot=True,cmap='GnBu',fmt='g',ax=ax)
ax.xaxis.set_label_position('top')
plt.tight_layout()
ax.set_title('Confusion matrix for K-Nearest Neighbors Model',y=1.1)
# sklearn's confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
# Keep only the second column of predict_proba (the score for the second class).
probability = gs_knn.predict_proba(X_test)[:, 1]

In [None]:
# False-positive rates, true-positive rates and thresholds for the KNN ROC curve.
fpr_knn, tpr_knn, thresh = roc_curve(y_test, probability)

In [None]:
# ROC curve for KNN, with a dashed diagonal reference and grey left/top frame lines.
plt.figure(figsize=(10, 6))
plt.title('Receiver Operating Characteristic Curve')
plt.plot(fpr_knn, tpr_knn)
plt.plot([0, 1], [0, 1], linestyle='--')   # diagonal from (0,0) to (1,1)
plt.plot([0, 0], [1, 0], color='0.5')      # vertical line at x = 0
plt.plot([0, 1], [1, 1], color='0.5')      # horizontal line at y = 1

In [None]:
# Area under the ROC curve for the KNN scores.
roc_auc_score(y_true=y_test, y_score=probability)

# Logistic Regression

In [None]:
# Base logistic-regression estimator and the hyperparameter grid searched for it.
# The default 'lbfgs' solver does not support the 'l1' penalty, so every 'l1'
# candidate in the grid would fail to fit; 'liblinear' handles both 'l1' and 'l2'.
lr=LogisticRegression(solver='liblinear')
params={'penalty':['l1','l2'],'C':[0.01,0.1,1,10,100],'class_weight':['balanced',None]}

In [None]:
# Cross-validated grid search over `params` for logistic regression, using all CPU cores.
gs_r = GridSearchCV(
    estimator=lr,
    param_grid=params,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split, then display the winning parameter combination.
gs_r.fit(X=X_train, y=y_train)
gs_r.best_params_

In [None]:
# Class predictions of the tuned logistic-regression search object on the held-out split.
prediction = gs_r.predict(X=X_test)

In [None]:
# Confusion matrix and accuracy of the logistic-regression predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
acc_lr = accuracy_score(y_true=y_test, y_pred=prediction)
print(acc_lr)

In [None]:
# Heatmap of the logistic-regression confusion matrix `cm`.
class_names = [0,1]
fig,ax = plt.subplots()

# The class labels go on the DataFrame so heatmap uses them as tick labels;
# ticks set on the axes beforehand are overwritten by sns.heatmap.
sns.heatmap(pd.DataFrame(cm,index=class_names,columns=class_names),annot=True,cmap='GnBu',fmt='g',ax=ax)
ax.xaxis.set_label_position('top')
plt.tight_layout()
ax.set_title('Confusion matrix for Logistic Regression',y=1.1)
# sklearn's confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
# Keep only the second column of predict_proba (the score for the second class).
probability = gs_r.predict_proba(X_test)[:, 1]

In [None]:
# False-positive rates, true-positive rates and thresholds for the logistic-regression ROC curve.
fpr_lr, tpr_lr, thresh = roc_curve(y_test, probability)

In [None]:
# ROC curve for logistic regression, with a dashed diagonal reference and grey left/top frame lines.
plt.figure(figsize=(10, 6))
plt.title('Receiver Operating Characteristic Curve')
plt.plot(fpr_lr, tpr_lr)
plt.plot([0, 1], [0, 1], linestyle='--')   # diagonal from (0,0) to (1,1)
plt.plot([0, 0], [1, 0], color='0.5')      # vertical line at x = 0
plt.plot([0, 1], [1, 1], color='0.5')      # horizontal line at y = 1

In [None]:
# Area under the ROC curve for the logistic-regression scores.
roc_auc_score(y_true=y_test, y_score=probability)

# Random Forest Classifier

In [None]:
# Base random-forest estimator and the hyperparameter grid searched for it.
# A fixed seed makes the forest (and therefore the search result) repeatable.
rfc=RandomForestClassifier(random_state=42)
# 'auto' is omitted: for classifiers it was an alias of 'sqrt' and it was
# removed in scikit-learn 1.3, where it makes those candidates fail to fit.
params={'max_features':['sqrt','log2'],'min_samples_split':[2,3,4,5,6,7,8,9,10],
        'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10]}

In [None]:
# Cross-validated grid search over `params` for the random forest, using all CPU cores.
gs_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=params,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split, then display the winning parameter combination.
gs_rfc.fit(X=X_train, y=y_train)
gs_rfc.best_params_

In [None]:
# Class predictions of the tuned random-forest search object on the held-out split.
prediction = gs_rfc.predict(X=X_test)

In [None]:
# Accuracy and confusion matrix of the random-forest predictions on the test split.
acc_rfc=accuracy_score(y_test,prediction)
# Print the random-forest score (previously this printed the KNN accuracy by mistake).
print(acc_rfc)
cm=confusion_matrix(y_test,prediction)

In [None]:
# Heatmap of the random-forest confusion matrix `cm`.
class_names = [0,1]
fig,ax = plt.subplots()

# The class labels go on the DataFrame so heatmap uses them as tick labels;
# ticks set on the axes beforehand are overwritten by sns.heatmap.
sns.heatmap(pd.DataFrame(cm,index=class_names,columns=class_names),annot=True,cmap='GnBu',fmt='g',ax=ax)
ax.xaxis.set_label_position('top')
plt.tight_layout()
ax.set_title('Confusion matrix for Random Forest Classifier',y=1.1)
# sklearn's confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
# Keep only the second column of predict_proba (the score for the second class).
probability = gs_rfc.predict_proba(X_test)[:, 1]

In [None]:
# False-positive rates, true-positive rates and thresholds for the random-forest ROC curve.
fpr_rfc, tpr_rfc, thresh = roc_curve(y_test, probability)

In [None]:
# ROC curve for the random forest, with a dashed diagonal reference and grey left/top frame lines.
plt.figure(figsize=(10, 6))
plt.title('Receiver Operating Characteristic Curve')
plt.plot(fpr_rfc, tpr_rfc)
plt.plot([0, 1], [0, 1], linestyle='--')   # diagonal from (0,0) to (1,1)
plt.plot([0, 0], [1, 0], color='0.5')      # vertical line at x = 0
plt.plot([0, 1], [1, 1], color='0.5')      # horizontal line at y = 1

In [None]:
# Area under the ROC curve for the random-forest scores.
roc_auc_score(y_true=y_test, y_score=probability)

# Decision Tree Classifier

In [None]:
# Base decision-tree estimator and the hyperparameter grid searched for it.
# With max_features below the feature count the split search is randomised,
# so a fixed seed keeps the search result repeatable.
tree=DecisionTreeClassifier(random_state=42)
# 'auto' is omitted: for classifiers it was an alias of 'sqrt' and it was
# removed in scikit-learn 1.3, where it makes those candidates fail to fit.
params={'max_features':['sqrt','log2'],'min_samples_split':[2,3,4,5,6,7,8,9,10],
        'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10]}

In [None]:
# Cross-validated grid search over `params` for the decision tree, using all CPU cores.
gs_tree = GridSearchCV(
    estimator=tree,
    param_grid=params,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split, then display the winning parameter combination.
gs_tree.fit(X=X_train, y=y_train)
gs_tree.best_params_

In [None]:
# Class predictions of the tuned decision-tree search object on the held-out split.
prediction = gs_tree.predict(X=X_test)

In [None]:
# Confusion matrix and accuracy of the decision-tree predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
acc_tree = accuracy_score(y_true=y_test, y_pred=prediction)
print(acc_tree)

In [None]:
# Heatmap of the decision-tree confusion matrix `cm`.
class_names = [0,1]
fig,ax = plt.subplots()

# The class labels go on the DataFrame so heatmap uses them as tick labels;
# ticks set on the axes beforehand are overwritten by sns.heatmap.
sns.heatmap(pd.DataFrame(cm,index=class_names,columns=class_names),annot=True,cmap='GnBu',fmt='g',ax=ax)
ax.xaxis.set_label_position('top')
plt.tight_layout()
ax.set_title('Confusion matrix for Decision Tree Classifier',y=1.1)
# sklearn's confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
# Keep only the second column of predict_proba (the score for the second class).
probability = gs_tree.predict_proba(X_test)[:, 1]

In [None]:
# False-positive rates, true-positive rates and thresholds for the decision-tree ROC curve.
fpr_tree, tpr_tree, thresh = roc_curve(y_test, probability)

In [None]:
# ROC curve for the decision tree, with a dashed diagonal reference and grey left/top frame lines.
plt.figure(figsize=(10, 6))
plt.title('Receiver Operating Characteristic Curve')
plt.plot(fpr_tree, tpr_tree)
plt.plot([0, 1], [0, 1], linestyle='--')   # diagonal from (0,0) to (1,1)
plt.plot([0, 0], [1, 0], color='0.5')      # vertical line at x = 0
plt.plot([0, 1], [1, 1], color='0.5')      # horizontal line at y = 1

In [None]:
# Area under the ROC curve for the decision-tree scores.
roc_auc_score(y_true=y_test, y_score=probability)

In [None]:
# Collect the test accuracy of each model into one table and show it best-first.
result = pd.DataFrame(
    [
        ('K-Neighbors Classifiers', acc_knn),
        ('Logistic Regression', acc_lr),
        ('Random Forest Classifier', acc_rfc),
        ('Decision Tree Classifier', acc_tree),
    ],
    columns=['Models', 'Score'],
)
result.sort_values(by='Score', ascending=False)

I am new to Data Science. Please leave your valuable feedback in the comment section.