In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
df = pd.read_csv('online_heart_dataset.csv')
df.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


In [5]:
df.shape

(270, 5)

In [6]:
df.isnull().sum()

age              0
sex              0
BP               0
cholestrol       0
heart disease    0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            270 non-null    int64
 1   sex            270 non-null    int64
 2   BP             270 non-null    int64
 3   cholestrol     270 non-null    int64
 4   heart disease  270 non-null    int64
dtypes: int64(5)
memory usage: 10.7 KB


In [9]:
df['heart disease'].nunique()

2

In [12]:
X = df.drop('heart disease', axis=1)
X

Unnamed: 0,age,sex,BP,cholestrol
0,70,1,130,322
1,67,0,115,564
2,57,1,124,261
3,64,1,128,263
4,74,0,120,269
...,...,...,...,...
265,52,1,172,199
266,44,1,120,263
267,56,0,140,294
268,57,1,140,192


In [13]:
y = df['heart disease']
y

0      1
1      0
2      1
3      0
4      0
      ..
265    0
266    0
267    0
268    0
269    1
Name: heart disease, Length: 270, dtype: int64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Model Building

### Without using Hyperparameter

In [15]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [16]:
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
accuracy_score(y_train, y_train_pred)

1.0

In [20]:
accuracy_score(y_test, y_test_pred)

0.6419753086419753

## Hyperparameter Tuning

In [21]:
rf = RandomForestClassifier(n_estimators=10, max_depth=3, min_samples_split=50, verbose=1, random_state=42, oob_score=True)
rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [22]:
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [24]:
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

0.7142857142857143
0.6790123456790124


### Using Gridsearch for Hypertuning

In [25]:
 from sklearn.model_selection import GridSearchCV

In [28]:
rf = RandomForestClassifier(random_state=42)

In [35]:
params = {
    'max_depth': [2,3,5,7],
    'min_samples_split':[5, 10, 20, 50],
    'max_features': [2,3,4],
    'n_estimators': [10,20,30,40,50]   
}

In [36]:
grid_search = GridSearchCV(estimator=rf,
                          param_grid=params,
                          cv=5, verbose=1)

In [37]:
%%time
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
CPU times: total: 39 s
Wall time: 39.3 s


In [38]:
rf_best = grid_search.best_estimator_
rf_best

In [39]:
rf_best.feature_importances_

array([0.39689435, 0.14743529, 0.20906564, 0.24660473])

In [40]:
imp_df = pd.DataFrame({
    'variable': X_train.columns,
    'Imp': rf_best.feature_importances_
})

In [41]:
imp_df.sort_values(by='Imp', ascending=False)

Unnamed: 0,variable,Imp
0,age,0.396894
3,cholestrol,0.246605
2,BP,0.209066
1,sex,0.147435


In [42]:
rf_best.oob_score

False