In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the heart-disease CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('../input/heart-disease-uci/heart.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Per-column count of missing entries (isna is the same check as isnull).
data.isna().sum()

In [None]:
# dtype of every column as inferred by read_csv.
data.dtypes

In [None]:
# Replace the integer codes of the categorical columns with readable labels.
#
# Each column is rewritten with a single Series.replace call and assigned back
# to the frame. The previous form, data[col][mask] = value, is chained
# indexing: it triggers SettingWithCopyWarning and, with pandas Copy-on-Write
# enabled, does not modify `data` at all.
#
# NOTE(review): codes that have no entry below are left untouched as raw
# integers (e.g. a 0 in cp, slope or thal, if the CSV uses 0-based codes) --
# confirm these mappings against the dataset's data dictionary.
CATEGORY_LABELS = {
    'sex': {0: 'female', 1: 'male'},
    'cp': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    'fbs': {0: 'lower than 120mg/ml', 1: 'greater than 120mg/ml'},
    'restecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exang': {0: 'no', 1: 'yes'},
    'slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thal': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column, code_to_name in CATEGORY_LABELS.items():
    data[column] = data[column].replace(code_to_name)

In [None]:
# Force the recoded categorical columns to object dtype, one column at a time.
for categorical_column in ('sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'):
    data[categorical_column] = data[categorical_column].astype('object')

In [None]:
# Preview the frame again after the recoding and dtype cells above.
data.head()

In [None]:
# Column dtypes after the astype('object') casts above.
data.dtypes

In [None]:
# Number of rows per target value.
# x is passed by keyword: seaborn >= 0.12 accepts only `data` positionally, so
# the positional form raises TypeError there.
sns.countplot(x='target', data=data)

In [None]:
# Number of rows per target value, split into one bar per sex.
# x is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='target', hue='sex', data=data, palette="Set1")

In [None]:
# Mean of target within each sex group, listed in descending order of the sex label.
data.groupby('sex', as_index=False)['target'].mean().sort_values('sex', ascending=False)

In [None]:
# Histogram of the age column using pandas' default binning.
data['age'].hist()

In [None]:
# Density-normalised histogram of age with a KDE curve on top.
# histplot(kde=True, stat='density') is the replacement for sns.distplot,
# which is deprecated and removed in newer seaborn releases.
sns.histplot(data['age'], kde=True, stat='density', color='red')

In [None]:
# Wide figure: there is one group of bars per distinct age value.
plt.figure(figsize=(20,10))
# x is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='age', hue='target', data=data)

In [None]:
# Individual age values plotted per target class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.swarmplot(x='target', y='age', data=data)

In [None]:
# Individual chol values plotted per target class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.swarmplot(x='target', y='chol', data=data)

In [None]:
# Number of rows per target value, split by the ca column.
# x is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='target', hue='ca', data=data)

In [None]:
# Count rows for every (target, ca) pair and pivot ca into columns, then draw
# one stacked bar per target value.
ca_by_target = data.groupby(['target', 'ca']).size().unstack()
ca_by_target.plot.bar(stacked=True, figsize=(10, 8))
plt.show()

In [None]:
# Number of rows per target value, split by the thal column.
# x is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='target', hue='thal', data=data)

In [None]:
# chol against target on a wide canvas; each marker's area is thalach * 100.
plt.figure(figsize=(20, 8))
plt.scatter(data['target'], data['chol'], s=data['thalach'] * 100, color='red')

In [None]:
# Keep the target column as the label Series before it is dropped from the features.
label = data['target']

In [None]:
# Distinct values present in the label.
label.unique()

In [None]:
# Number of rows for each label value (class balance).
label.value_counts()

In [None]:
# Remove the target column so `data` holds only the feature columns.
data = data.drop(columns=['target'])

In [None]:
# Preview the feature frame now that target has been dropped.
data.head()

In [None]:
# Length of the label Series.
label.shape

In [None]:
# One-hot encode the non-numeric columns; drop_first=True omits the first
# level of each encoded column.
data = pd.get_dummies(data, drop_first=True)

In [None]:
# Preview the frame after one-hot encoding.
data.head()

In [None]:
# Feature matrix for modelling; x is another name for the same DataFrame
# object (plain assignment, no copy is made).
x = data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split -- and every score, matrix and
# importance computed from it below -- is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=0)

In [None]:
# (rows, columns) of the training features.
x_train.shape

In [None]:
# (rows, columns) of the test features.
x_test.shape

In [None]:
# Length of the training labels.
y_train.shape

In [None]:
# Length of the test labels.
y_test.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest fitted on the training split.
# random_state fixes the forest's internal randomness so refitting gives the
# same model; 0 matches the seed used for the permutation importances.
mod1 = RandomForestClassifier(random_state=0)
mod1.fit(x_train, y_train)

In [None]:
# Single decision tree fitted on the training split.
# random_state fixes the tree's internal randomness so refitting gives the
# same model; 0 matches the seed used for the permutation importances.
mod2 = DecisionTreeClassifier(random_state=0)
mod2.fit(x_train, y_train)

In [None]:
# Random forest: class predictions and the index-1 column of predict_proba on
# the test split.
pred_1 = mod1.predict(x_test)
pred_quant1 = mod1.predict_proba(x_test)[:, 1]
# Same predictions under the name the confusion-matrix cells use; copied
# instead of running the model over x_test a second time.
pred1 = pred_1.copy()

# Decision tree: same three outputs.
pred_2 = mod2.predict(x_test)
pred_quant2 = mod2.predict_proba(x_test)[:, 1]
pred2 = pred_2.copy()



In [None]:
# Score of the random forest on the rows it was fitted on and on the held-out rows.
score1_train, score1_test = mod1.score(x_train, y_train), mod1.score(x_test, y_test)

print(f'Training Random Forest: {round(score1_train*100,2)}%')

print(f'Testing Random Forest: {round(score1_test*100,2)}%')

In [None]:
# Score of the decision tree on the rows it was fitted on and on the held-out rows.
score2_train, score2_test = mod2.score(x_train, y_train), mod2.score(x_test, y_test)

print(f'Training Decision Tree: {round(score2_train*100,2)}%')

print(f'Testing Decision Tree: {round(score2_test*100,2)}%')

In [None]:
from sklearn.metrics  import confusion_matrix

In [None]:
# Confusion matrix of the random forest's test predictions
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_test, pred1)

In [None]:
# Random forest confusion matrix as an annotated heatmap.
# fmt='d' prints each cell as a plain integer; the default annotation format
# ('.2g') would render counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, pred1), annot=True, fmt='d')

In [None]:
# Confusion matrix of the decision tree's test predictions
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_test, pred2)

In [None]:
# Decision tree confusion matrix as an annotated heatmap.
# fmt='d' prints each cell as a plain integer; the default annotation format
# ('.2g') would render counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, pred2), annot=True, fmt='d')

In [None]:
# Index-1 column of the random forest's predict_proba output on the test
# split (the same expression that produced pred_quant1); the ROC cell reads
# this name.
y_pred_quant1 = mod1.predict_proba(x_test)[:, 1]

In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates of the random forest across all score thresholds.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant1)

# The size is given when the figure is created: assigning
# rcParams['figure.figsize'] afterwards does not resize an existing figure.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(fpr, tpr)
# Diagonal reference line, drawn corner to corner in axes coordinates.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="-", c=".3")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])

# Kept so that figures created after this cell get the same default size as before.
plt.rcParams['figure.figsize'] = (15, 5)
# The data is the heart-disease set, so the title names that classifier.
ax.set_title('ROC curve for heart disease classifier', fontweight = 30)
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
plt.show()

In [None]:
import eli5 
from eli5.sklearn import PermutationImportance
# Permutation importance of the random forest, evaluated on the test split,
# then rendered with the encoded column names.
perm1 = PermutationImportance(mod1, random_state = 0)
perm1.fit(x_test, y_test)
eli5.show_weights(perm1, feature_names = list(x_test.columns))

In [None]:
# Permutation importance of the decision tree, evaluated on the test split,
# then rendered with the encoded column names.
perm2 = PermutationImportance(mod2, random_state = 0)
perm2.fit(x_test, y_test)
eli5.show_weights(perm2, feature_names = list(x_test.columns))