In [1]:
import pandas as pd
import numpy as np

In [2]:
hd = pd.read_csv('HeartDisease.csv')

In [3]:
hd.head()

Unnamed: 0,age,gender,chest_pain,rest_bps,cholestrol,fasting_blood_sugar,rest_ecg,thalach,exer_angina,old_peak,slope,ca,thalassemia,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
hd.isnull().sum()

age                    0
gender                 0
chest_pain             0
rest_bps               0
cholestrol             0
fasting_blood_sugar    0
rest_ecg               0
thalach                0
exer_angina            0
old_peak               0
slope                  0
ca                     0
thalassemia            0
target                 0
dtype: int64

In [5]:
hd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  303 non-null    int64  
 1   gender               303 non-null    int64  
 2   chest_pain           303 non-null    int64  
 3   rest_bps             303 non-null    int64  
 4   cholestrol           303 non-null    int64  
 5   fasting_blood_sugar  303 non-null    int64  
 6   rest_ecg             303 non-null    int64  
 7   thalach              303 non-null    int64  
 8   exer_angina          303 non-null    int64  
 9   old_peak             303 non-null    float64
 10  slope                303 non-null    int64  
 11  ca                   303 non-null    int64  
 12  thalassemia          303 non-null    int64  
 13  target               303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [6]:
from sklearn.model_selection import train_test_split
hd_train, hd_test = train_test_split(hd, test_size = .2)

In [9]:
hd_x_train = hd_train.iloc[:,0:-1]
hd_y_train = hd_train.iloc[:,-1]

hd_x_test = hd_test.iloc[:,0:-1]
hd_y_test = hd_test.iloc[:,-1]

In [11]:
print(hd_x_train.shape)
print(hd_y_train.shape)
print('----------------')
print(hd_x_test.shape)
print(hd_y_test.shape)

(242, 13)
(242,)
----------------
(61, 13)
(61,)


# Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier
dtc_hd = DecisionTreeClassifier(class_weight='balanced')

In [26]:
dtc_hd.fit(hd_x_train, hd_y_train)

DecisionTreeClassifier(class_weight='balanced')

In [27]:
pred_hd = dtc_hd.predict(hd_x_test)

In [28]:
from sklearn.metrics import confusion_matrix
tab_hd = confusion_matrix(pred_hd, hd_y_test)
tab_hd

array([[16,  6],
       [ 9, 30]], dtype=int64)

In [29]:
Acc = tab_hd.diagonal().sum()*100/ tab_hd.sum()
Acc

0.7540983606557377

# Feature Imortance

In [30]:
feature_imp = pd.DataFrame({'Importance':dtc_hd.feature_importances_,'columns':hd_x_train.columns})
feature_imp.sort_values('Importance', ascending=False)

Unnamed: 0,Importance,columns
2,0.254014,chest_pain
12,0.157977,thalassemia
11,0.130603,ca
4,0.125696,cholestrol
0,0.096808,age
9,0.069145,old_peak
7,0.066825,thalach
1,0.042127,gender
3,0.027114,rest_bps
10,0.013162,slope


# Grid Search

In [24]:
from sklearn.model_selection import GridSearchCV

In [31]:
search_dict = {'criterion' : ('entropy','gini'),
               'max_depth' : (4,5,6,7),
               'min_samples_split' : (50,75,100,150,200)}

In [32]:
grid = GridSearchCV(dtc_hd, param_grid=search_dict)

In [34]:
grid.fit(hd_x_train, hd_y_train)

GridSearchCV(estimator=DecisionTreeClassifier(class_weight='balanced'),
             param_grid={'criterion': ('entropy', 'gini'),
                         'max_depth': (4, 5, 6, 7),
                         'min_samples_split': (50, 75, 100, 150, 200)})

In [35]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 50}

In [36]:
pred_grid = grid.predict(hd_x_test)

In [39]:
tab_hd_grid = confusion_matrix(pred_grid, hd_y_test)
tab_hd_grid

array([[18,  8],
       [ 7, 28]], dtype=int64)

In [50]:
tab_hd_grid.diagonal().sum() * 100/tab_hd_grid.sum()

75.40983606557377

# Random Forest 

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rfc = RandomForestClassifier(n_estimators = 100)

In [43]:
rfc.fit(hd_x_train, hd_y_train)

RandomForestClassifier()

In [44]:
pred_rfc = rfc.predict(hd_x_test)

In [46]:
abc = confusion_matrix(pred_rfc, hd_y_test)

In [48]:
abc.diagonal().sum() * 100/abc.sum()

81.9672131147541