In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('heart_disease.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2.0,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1.0,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1.0,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0.0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df.isnull().sum()

age          0
sex          0
cp          10
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
target       0
dtype: int64

As the coulmn cp (chest pain) has missing values, we need to impute the data.
The data is numeric and hence mean stratergy will be a suitable choice.

In [5]:
from sklearn.preprocessing import Imputer
imput = Imputer(missing_values='NaN',strategy='mean')
df = list(imput.fit_transform(df))

for i in range(303):
    for j in range(14):
        df[i][j] = math.ceil(df[i][j])



In [6]:
df = pd.DataFrame(df)

In [7]:
df=df.rename(columns={0: 'age', 1:'sex', 2:'cp', 3:'trestbps',4: 'chol',5: 'fbs',6: 'restecg',7: 'thalach',8: 'exang',9: 'oldpeak',10: 'slope',11: 'ca',12: 'thal',13:'target'})

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
x = df.iloc[:,:-1]

In [10]:
y = df.iloc[:,-1]

In [None]:
#  to find the most important features in the dataset

In [11]:
model= RandomForestClassifier(n_estimators=100,random_state=0)
model.fit(x,y)
pd.Series(model.feature_importances_,index=x.columns).sort_values(ascending=False)

thal        0.127732
ca          0.117530
thalach     0.115284
cp          0.114049
age         0.092933
chol        0.086188
trestbps    0.079302
oldpeak     0.072846
exang       0.065039
slope       0.062542
sex         0.036971
restecg     0.019255
fbs         0.010329
dtype: float64

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [None]:
# For 'cp' column, 
#it records chest pain type. 
#Number 3 mean no chest pain, number 0-2 means different tyoe of angina. 

# To simplify it, 
# group the number0-2 together as disease positive, 
#number 3 as disease negative

In [13]:
number=[0,1,2]
for col in df.itertuples():

    if col.cp in number:
        df['cp'].replace(to_replace=col.cp, value=1, inplace=True)

In [14]:
df_top8 = df.loc[:,['cp','oldpeak','thal','ca','thalach','age','chol','trestbps','exang']]

In [15]:
x_train,x_test,y_train,y_test = train_test_split(df_top8,y,test_size=0.25,random_state=0)
clf = RandomForestClassifier()
clf.fit(x_train,y_train)
prediction = clf.predict(x_test)
accuracy = accuracy_score(prediction,y_test)
cm = confusion_matrix(prediction,y_test)
prfs = precision_recall_fscore_support(prediction,y_test)
print('Accuracy: ',accuracy)
print('\n')
print('Confusion Matrix: ',cm)
print('\n')
print('Precision: ', prfs[0])
print('Recall:    ', prfs[1])
print('Fscore:    ', prfs[2])
print('Support:   ', prfs[3])

Accuracy:  0.7763157894736842


Confusion Matrix:  [[26 10]
 [ 7 33]]


Precision:  [0.78787879 0.76744186]
Recall:     [0.72222222 0.825     ]
Fscore:     [0.75362319 0.79518072]
Support:    [36 40]




In [16]:
maxim = 0
n_estimators=0
max_depth=0
max_cm=0
max_prfs=0
max_features=0
for i in range(5,15):
    for j in range(5,15):
        for k in range(5,13):
            x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=0)
            clf = RandomForestClassifier(n_estimators=i,max_depth=j,max_features=k)
            clf.fit(x_train,y_train)
            prediction = clf.predict(x_test)
            accuracy = accuracy_score(prediction,y_test)
            cm = confusion_matrix(prediction,y_test)
            prfs = precision_recall_fscore_support(prediction,y_test)
            if accuracy > maxim:
                maxim = accuracy
                n_estimators = i
                max_depth = j
                max_features = k
                max_cm = cm
                max_prfs=prfs
                
print(str(i)+" "+str(j)+" "+str(k)+" "+str(maxim))
print('\n')
print('Confusion Matrix: ',cm)
print('\n')
print('Precision: ', prfs[0])
print('Recall:    ', prfs[1])
print('Fscore:    ', prfs[2])
print('Support:   ', prfs[3])

14 14 12 0.881578947368421


Confusion Matrix:  [[28 11]
 [ 5 32]]


Precision:  [0.84848485 0.74418605]
Recall:     [0.71794872 0.86486486]
Fscore:     [0.77777778 0.8       ]
Support:    [39 37]


In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
x = df.iloc[:,:-1]
x_std = StandardScaler().fit_transform(x)

In [19]:
maxim = 0
n_estimators=0
max_depth=0
max_features=0
for i in range(5,15):
    for j in range(5,15):
        for k in range(5,13):
            x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=0)
            clf = RandomForestClassifier(n_estimators=i,max_depth=j,max_features=k)
            clf.fit(x_train,y_train)
            prediction = clf.predict(x_test)
            accuracy = accuracy_score(prediction,y_test)
            if accuracy > maxim:
                maxim = accuracy
                n_estimators = i
                max_depth = j
                max_features = k
print(str(i)+" "+str(j)+" "+str(k)+" "+str(maxim))

14 14 12 0.881578947368421


In [None]:
#Random forest classifier helped in achieving accuracy upto 88.15 % 
#which is very good cosidering the size and quality of data