In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the heart-disease CSV into a DataFrame; the path is relative to the notebook's working directory.
data=pd.read_csv('../input/heart-disease-uci/heart.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# How often each distinct `thal` value occurs.
data['thal'].value_counts()

In [None]:
# How often each `sex` code occurs.
data['sex'].value_counts() # 1 = male, 0 = female

### Information about each feature:

age = age in years

sex =(1 = male; 0 = female)

cp = chest pain type

trestbps = resting blood pressure (in mm Hg on admission to the hospital)

chol = serum cholesterol in mg/dl

fbs = (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

restecg = resting electrocardiographic results

thalach = maximum heart rate achieved

exang = exercise induced angina (1 = yes; 0 = no)

oldpeak = ST depression induced by exercise relative to rest

slope = the slope of the peak exercise ST segment

ca = number of major vessels (0-3) colored by fluoroscopy

thal = (3 = normal; 6 = fixed defect; 7 = reversible defect)

target = 1 or 0

In [None]:
# Summary statistics for the columns of the frame.
data.describe()

In [None]:
# Histogram of the `age` column; bars are drawn at half the bin width.
age_ax = data['age'].hist(color='red', rwidth=0.5)
age_ax.set_xlabel('age')
age_ax.set_ylabel('population')

In [None]:
# Average serum cholesterol for each target class.
# Passing every row to plt.bar stacks all bars of a class on top of each other,
# so only the tallest one (the per-class maximum) stays visible. Aggregate to
# one value per class first so the chart shows a meaningful comparison.
mean_chol = data.groupby('target')['chol'].mean()
plt.bar(mean_chol.index, mean_chol.values)
plt.xticks([0,1])
plt.xlabel('target')
plt.ylabel('Mean cholesterol (mg/dl)')
plt.show()

In [None]:
# Correlation of every column with `target`, largest value first.
# Author's reading of the result:
#   - `cp` (chest pain) is positively correlated with the target.
#   - `exang` is negatively correlated with the target.
corr_matrix = data.corr()
target_corr = corr_matrix['target']
target_corr.sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# create a scatter matrix to see the relationship between all the below mentioned features against each other
att = ['target','exang','cp','thalach','oldpeak']
scatter_matrix(data[att],figsize=(10,7))
plt.show()

In [None]:
# 2x2 grid of histograms: target, exang, cp and oldpeak.
# Columns are addressed by name so each panel's data always matches its title;
# the positional lookup used before plotted `thalach` under the 'oldpeak' title.
fig,ax = plt.subplots(2,2,figsize=(10,8))

ax[0,0].hist(data['target'])
ax[0,0].set_title('target')
ax[0,0].set_xticks([0,1])

ax[0,1].hist(data['exang'])
ax[0,1].set_title('exang (exercise induced angina)')
ax[0,1].set_xticks([0,1])

ax[1,0].hist(data['cp'])
ax[1,0].set_title('cp (chest pain level)')
ax[1,0].set_xticks([0,1,2,3])

ax[1,1].hist(data['oldpeak'])
ax[1,1].set_title('oldpeak')

# plt.show() renders under the inline backend; Figure.show() expects a GUI backend.
plt.show()

### Standard Scaler:
##### The feature values are skewed and all have different ranges (left image: before standard scaling)
![](https://i.stack.imgur.com/PZgJ2.png) 
##### Equation for the standard scaler:
![](https://media.geeksforgeeks.org/wp-content/uploads/standardisation.jpg) 


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler instance; it is fitted in the next cell.
s = StandardScaler()

In [None]:
# Standardise the feature columns only. `target` is the class label, so it is
# left out: rescaling a 0/1 label to zero mean / unit variance has no meaning.
new_data = s.fit_transform(data.drop('target', axis=1))

In [None]:
# First row of the scaled array.
new_data[0,:]

# 1) Random Forest Classifier:

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Separate the label column from the remaining feature columns.
y = data['target']
X = data.drop(columns='target')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# (rows, columns) of the held-out feature set.
X_test.shape

In [None]:
# Seed the forest so that the fitted model and every score derived from it are
# the same on each Restart & Run All; without a seed the bootstrap samples and
# feature sub-sampling differ from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)

In [None]:
# Predicted class labels of the random forest for the held-out rows.
y_pred= rf.predict(X_test)

In [None]:
 from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score,f1_score

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of true vs. predicted labels for the current `y_pred`.
confusion_matrix(y_test, y_pred)

In [None]:
# Print precision, then recall, for the current predictions (one bare number per line).
for scorer in (precision_score, recall_score):
    print(scorer(y_test, y_pred))

In [None]:
# F1 score for the current predictions.
f1_score(y_test, y_pred)

In [None]:
# Fraction of held-out rows whose label was predicted correctly.
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import roc_curve,auc

# A ROC curve needs a continuous score per sample. Hard 0/1 predictions give a
# single operating point, so the "curve" collapses to three points and the AUC
# is understated. Use the forest's predicted probability of the positive class.
y_score = rf.predict_proba(X_test)[:, 1]
fpr,tpr,threshold = roc_curve(y_test,y_score)
auc(fpr,tpr)

In [None]:
# ROC curve from the fpr/tpr arrays computed in the previous cell.
# The y-axis label typo ('postive') is corrected so this plot matches the other ROC plots.
plt.plot(fpr,tpr,marker='.')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()

In [None]:
import pickle

# Persist the fitted random forest to model.pkl. The context manager closes the
# file handle even if serialisation raises; a bare open() left that to the
# garbage collector.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Results collector: one dict per model, starting with the random forest.
sol = [
    {
        'model': 'RandomForest',
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
        'roc_auc score': auc(fpr, tpr),
    }
]

# 2) Logistic Regression: 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a default-configured logistic regression on the training split;
# fit() returns the estimator itself, which is then shown as the cell output.
log_reg = LogisticRegression().fit(X_train, y_train)
log_reg

In [None]:
# Predicted class labels of the logistic regression for the held-out rows (overwrites the previous `y_pred`).
y_pred=log_reg.predict(X_test)

In [None]:
# Confusion matrix first, then the three label-based scores for the current predictions.
print(confusion_matrix(y_test, y_pred))
for label, scorer in (('precision:', precision_score),
                      ('recall:', recall_score),
                      ('f1_score:', f1_score)):
    print(label, scorer(y_test, y_pred))

In [None]:
# Fraction of held-out rows whose label was predicted correctly.
print('accuracy_score:',accuracy_score(y_test,y_pred))

In [None]:
# ROC/AUC from the predicted probability of the positive class. Hard 0/1
# predictions would reduce the curve to a single operating point.
y_score = log_reg.predict_proba(X_test)[:, 1]
fpr,tpr,threshold = roc_curve(y_test,y_score)
print('AUC score:',auc(fpr,tpr))

In [None]:
# Draw the ROC curve from the most recently computed fpr/tpr arrays.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, marker='.')
roc_ax.set_xlabel('false positive rate')
roc_ax.set_ylabel('true positive rate')
plt.show()

In [None]:
# Record the logistic regression scores in the results collector.
sol.append(
    {
        'model': 'Logistic Regression',
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
        'roc_auc score': auc(fpr, tpr),
    }
)

# 3) Perceptron: 

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron with an L2 penalty, fitted on the training split;
# the fitted estimator is shown as the cell output.
perc = Perceptron(penalty='l2').fit(X_train, y_train)
perc

In [None]:
# Predicted class labels of the perceptron for the held-out rows (overwrites the previous `y_pred`).
y_pred = perc.predict(X_test)

In [None]:
# Confusion matrix first, then the three label-based scores for the current predictions.
print(confusion_matrix(y_test, y_pred))
for label, scorer in (('precision:', precision_score),
                      ('recall:', recall_score),
                      ('f1_score:', f1_score)):
    print(label, scorer(y_test, y_pred))

In [None]:
# Fraction of held-out rows whose label was predicted correctly.
print('accuracy_score:',accuracy_score(y_test,y_pred))

In [None]:
# ROC/AUC from the perceptron's signed distance to the decision boundary.
# The perceptron has no predict_proba, and hard 0/1 predictions would reduce
# the curve to a single operating point.
y_score = perc.decision_function(X_test)
fpr,tpr,threshold = roc_curve(y_test,y_score)
print('AUC score:',auc(fpr,tpr))

In [None]:
# Draw the ROC curve from the most recently computed fpr/tpr arrays.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, marker='.')
roc_ax.set_xlabel('false positive rate')
roc_ax.set_ylabel('true positive rate')
plt.show()

In [None]:
# Record the perceptron scores in the results collector.
sol.append(
    {
        'model': 'Perceptron',
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
        'roc_auc score': auc(fpr, tpr),
    }
)

# 4) Decision Tree Classifier:

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the fitted model and its scores are the same on every
# Restart & Run All; without a seed the tree can differ between runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train,y_train)
y_pred = decision_tree.predict(X_test)

# Confusion matrix followed by the label-based scores for the tree's predictions.
print(confusion_matrix(y_test,y_pred))
print('precision:', precision_score(y_test,y_pred))
print('recall:',recall_score(y_test,y_pred))
print('f1_score:',f1_score(y_test,y_pred))

In [None]:
# Fraction of held-out rows whose label was predicted correctly.
print('accuracy_score:',accuracy_score(y_test,y_pred))

In [None]:
# ROC/AUC from the tree's predicted probability of the positive class. Hard
# 0/1 predictions would reduce the curve to a single operating point.
y_score = decision_tree.predict_proba(X_test)[:, 1]
fpr,tpr,threshold = roc_curve(y_test,y_score)
print('AUC score:',auc(fpr,tpr))

In [None]:
# Draw the ROC curve from the most recently computed fpr/tpr arrays.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, marker='.')
roc_ax.set_xlabel('false positive rate')
roc_ax.set_ylabel('true positive rate')
plt.show()

In [None]:
# Record the decision tree scores in the results collector.
sol.append(
    {
        'model': 'Decision Tree Classifier',
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
        'roc_auc score': auc(fpr, tpr),
    }
)

# 5) SVM (RBF Kernel)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, trained on the training split
# and used to predict labels for the held-out rows.
# NOTE(review): X_train is passed in unscaled here; RBF kernels are usually
# sensitive to feature scale — confirm this is intended.
clf = SVC(kernel='rbf')
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Fraction of held-out rows whose label was predicted correctly.
print('accuracy_score:',accuracy_score(y_test,y_pred))

In [None]:
# Confusion matrix first, then the three label-based scores for the current predictions.
print(confusion_matrix(y_test, y_pred))
for label, scorer in (('precision:', precision_score),
                      ('recall:', recall_score),
                      ('f1_score:', f1_score)):
    print(label, scorer(y_test, y_pred))

In [None]:
# ROC/AUC from the SVM's signed distance to the separating surface. `clf` is
# the classifier fitted in this section; hard 0/1 predictions would reduce the
# curve to a single operating point.
y_score = clf.decision_function(X_test)
fpr,tpr,threshold = roc_curve(y_test,y_score)
print('AUC score:',auc(fpr,tpr))

In [None]:
# Draw the ROC curve from the most recently computed fpr/tpr arrays.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, marker='.')
roc_ax.set_xlabel('false positive rate')
roc_ax.set_ylabel('true positive rate')
plt.show()

In [None]:
# Record the RBF-kernel SVM scores in the results collector.
sol.append(
    {
        'model': 'SVM (RBF Kernel)',
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
        'roc_auc score': auc(fpr, tpr),
    }
)

# 6) SVM ( Linear Kernel):

In [None]:
# Support vector classifier with a linear kernel; rebinds `clf` (previously the
# RBF model) and overwrites `y_pred` with this model's predictions.
clf = SVC(kernel='linear')
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Confusion matrix first, then the three label-based scores for the current predictions.
print(confusion_matrix(y_test, y_pred))
for label, scorer in (('precision:', precision_score),
                      ('recall:', recall_score),
                      ('f1_score:', f1_score)):
    print(label, scorer(y_test, y_pred))

In [None]:
# Fraction of held-out rows whose label was predicted correctly.
print('accuracy_score:',accuracy_score(y_test,y_pred))

In [None]:
# ROC/AUC from the SVM's signed distance to the separating hyperplane. `clf` is
# the classifier fitted in this section; hard 0/1 predictions would reduce the
# curve to a single operating point.
y_score = clf.decision_function(X_test)
fpr,tpr,threshold = roc_curve(y_test,y_score)
print('AUC score:',auc(fpr,tpr))

In [None]:
# Draw the ROC curve from the most recently computed fpr/tpr arrays.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, marker='.')
roc_ax.set_xlabel('false positive rate')
roc_ax.set_ylabel('true positive rate')
plt.show()

In [None]:
# Record the linear-kernel SVM scores in the results collector.
sol.append(
    {
        'model': 'SVM (Linear Kernel)',
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
        'roc_auc score': auc(fpr, tpr),
    }
)

# END

In [None]:
# One Series per collected result, keyed by its position in `sol`.
df = {position: pd.Series(entry) for position, entry in enumerate(sol)}

In [None]:
# Place the per-model Series side by side: one column per entry of `df`, one
# row per recorded field. A single concat is enough; the surrounding loop
# rebuilt the identical frame on every pass and left `solution` undefined when
# `df` was empty (pd.concat now raises a clear ValueError in that case).
solution = pd.concat(df, axis=1)

### From the table below (precision, recall and ROC AUC per model), we can see that Random Forest performs best among them all.

In [None]:
# Display the combined comparison table.
solution