# Importing packages 

In [1]:
# Data wrangling 
import pandas as pd 

# Array math
import numpy as np 

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# List iteration tracking
from tqdm import tqdm

# Importing the custom written class 
from DecisionTree import Node 

# Importing the custom random forest classifier
from RandomForest import RandomForestClassifier, RandomForestTree

# Time tracking
import time

# Precision metrics 
from sklearn.metrics import precision_score, recall_score

# Reading data 

The data regards heart disease (`heart.csv`). 

The objective is to create a model that predicts the binary `target` column (the heart-disease label) using the features available.

In [2]:
# Load the dataset from heart.csv (path is relative to the notebook's working directory)
d = pd.read_csv('heart.csv')

In [3]:
# Report the (rows, columns) shape of the loaded frame
print(f"Data shape: {d.shape}")

Data shape: (303, 14)


In [4]:
# head(-10) returns every row except the last 10; the rendered output
# truncates the middle rows.
# NOTE(review): if a short preview was intended, d.head(10) would show the first 10 rows — confirm
d.head(-10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,57,1,0,110,335,0,1,143,1,3.0,1,1,3,0
289,55,0,0,128,205,0,2,130,1,2.0,1,1,3,0
290,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
291,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0


In [5]:
# Data type of every column in the loaded frame
d.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [6]:
# Distribution of target in data: number of rows per target class
d.groupby('target').size()

target
0    138
1    165
dtype: int64

# Random forest - quick theory review

The classifier which will be created is a random forest classifier. 

Let's denote it as **rf()**.  

Given an input matrix $\mathbb{X}_{n \times p}$, the classifier **rf()** outputs either 1 or 0.

$$rf: \mathbb{X} \rightarrow \{1, 0\}$$

The algorithm of the random forest grows **k** decision trees. 

The final prediction of the **rf()** classifier is a majority vote: the input matrix $\mathbb{X}$ is passed to each of the **k** trees, and the class predicted by the most trees wins. 

In the notebook about decision trees it is clear that with the same input and the same hyperparameters, the same output and the same rules will be learnt by a decision tree. So why grow **k** of them? 

## Data bootstrapping

The "random" in random forest starts at the creation of the data sample for each of the decision trees. The technique used to create the **k** data samples is bootstrapping.

Given a dataset of n rows and p features, we sample the rows from the original dataset with replacement. For every new decision tree *i*, a new bootstrapped dataset is created: $\mathbb{X}_{b}^{i}$.

For example, let's assume that the whole dataset has 10 rows of data:

In [7]:
# Let's imagine this is the whole dataset
subset_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

# Draw 10 random rows, keep the listed columns and renumber the rows 0..9
dsubset = d.sample(10)[subset_columns].reset_index(drop=True)

print(dsubset)

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       128   255    0        1      161      1      0.0      2   
1   53    0   0       130   264    0        0      143      0      0.4      1   
2   60    0   0       150   258    0        0      157      0      2.6      1   
3   59    1   0       170   326    0        0      140      1      3.4      0   
4   50    1   2       129   196    0        1      163      0      0.0      2   
5   53    1   0       123   282    0        1       95      1      2.0      1   
6   54    1   1       192   283    0        0      195      0      0.0      2   
7   58    0   1       136   319    1        0      152      0      0.0      2   
8   60    1   0       140   293    0        0      170      0      1.2      1   
9   47    1   0       110   275    0        0      118      1      1.0      1   

   ca  thal  target  
0   1     3       0  
1   0     2       1  
2   2     3       0  
3   0     3       0 

To create 3 random bootstrapped samples we use the pandas function **sample(replace=True)**. The key concept is that the sampling is done *with replacement*: the same rows might appear several times in our sample. 

In [8]:
# Draw and print three bootstrapped samples: same size as dsubset,
# rows drawn with replacement, so a row may appear more than once
n_bootstraps = 3
for sample_number in range(1, n_bootstraps + 1):
    print("----- \n")
    print(f"Boostrapped sample: {sample_number} \n")
    print(dsubset.sample(frac=1.0, replace=True))
    print("----- \n")

----- 

Boostrapped sample: 1 

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
3   59    1   0       170   326    0        0      140      1      3.4      0   
0   52    1   0       128   255    0        1      161      1      0.0      2   
4   50    1   2       129   196    0        1      163      0      0.0      2   
9   47    1   0       110   275    0        0      118      1      1.0      1   
8   60    1   0       140   293    0        0      170      0      1.2      1   
2   60    0   0       150   258    0        0      157      0      2.6      1   
6   54    1   1       192   283    0        0      195      0      0.0      2   
5   53    1   0       123   282    0        1       95      1      2.0      1   
8   60    1   0       140   293    0        0      170      0      1.2      1   
5   53    1   0       123   282    0        1       95      1      2.0      1   

   ca  thal  target  
3   0     3       0  
0   1     3       0  
4   0     

For each of the **k** trees grown in the random forest, we create one bootstrapped data sample, giving **k** bootstrapped samples in total. 

## Feature selection at each split 

Now that we have a dataset $\mathbb{X}_{b}^{i}$ for each of the **k** trees, the final part is to determine the splitting criterion for the creation of the nodes. 

In the classification case, the Gini gain criterion is the same as in the simple decision tree case. The difference is that at each node split, a random subsample of columns is selected to find the "best split". 

For example, if we have 10 columns as features and we select the hyperparameter **X_features_fraction = 0.8**, then at each node where the best split is being calculated, we would select 8 random features (10 * 0.8 = 8).  

# Features to use 

The feature list below will be used in the creation of the random forest. 

In [9]:
# Feature columns used when growing the trees (all loaded columns except 'target')
features = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]

In [10]:
# Random 10-row preview of the modelling columns (features plus the target)
d[features + ['target']].sample(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
224,54,1,0,110,239,0,1,126,1,2.8,1,1,3,0
256,58,1,0,128,259,0,0,130,1,3.0,1,2,3,0
222,65,1,3,138,282,1,0,174,0,1.4,1,1,2,0
294,44,1,0,120,169,0,1,144,1,2.8,0,0,1,0
76,51,1,2,125,245,1,0,166,0,2.4,1,0,2,1
153,66,0,2,146,278,0,0,152,0,0.0,1,1,2,1
138,57,1,0,110,201,0,1,126,1,1.5,1,0,1,1
175,40,1,0,110,167,0,0,114,1,2.0,1,0,3,0
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3,0
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1


# Creating the train and test sets 

In [11]:
# Fraction of rows in the training set 
train_share = 0.75

# Fixed seed so the split (and every metric computed from it) is the same
# on each Restart & Run All
split_seed = 42

# Creating the train and test sets: sample the training rows, then keep
# every remaining row (matched on index label) as the test set
train = d.sample(frac=train_share, random_state=split_seed)
test = d[~d.index.isin(train.index)].copy()

# Sanity check: the two sets must partition the full dataset
assert train.shape[0] + test.shape[0] == d.shape[0]

print(f"Total rows in the dataset: {d.shape[0]}")
print(f"Rows in training set: {train.shape[0]}")
print(f"Rows in test set: {test.shape[0]}")

Total rows in the dataset: 303
Rows in training set: 227
Rows in test set: 76


# Training the random forest 

In [12]:
# Hyperparameters of the custom random forest
rf_params = dict(
    min_samples_split=5,
    max_depth=3,
    n_trees=30,  # Number of trees grown
    X_features_fraction=0.75,  # Share of features drawn at random for each split
)

# Initiating the random forest object on the training data
rf = RandomForestClassifier(Y=train['target'], X=train[features], **rf_params)

# Growing the random forest 
rf.grow_random_forest()

100%|██████████| 30/30 [00:21<00:00,  1.39it/s]


In [13]:
# Printing out the trees, but only for forests below this size
# (presumably to keep the cell output short)
max_trees_to_print = 10
if rf.n_trees < max_trees_to_print:
    rf.print_trees()

# Predictions

In [14]:
# Predict on the held-out test set and store the predictions next to the labels
yhat = rf.predict(test[features])
test['yhat'] = yhat

print(f"Total target in test set: {test['target'].sum()}")
print(f"Total predicted target in test set: {test['yhat'].sum()}")

precision_rf = precision_score(test['target'], test['yhat'])
recall_rf = recall_score(test['target'], test['yhat'])

# Still rounded to whole percent, but formatted with one fixed decimal:
# printing round(x, 2) * 100 directly can show float noise such as
# 56.99999999999999 instead of 57.0
print(f"Precision: {round(precision_rf, 2) * 100:.1f} %")
print(f"Recall: {round(recall_rf, 2) * 100:.1f} %")

Total target in test set: 49
Total predicted target in test set: 50
Precision: 86.0 %
Recall: 88.0 %


# Sklearn implementation 

We can compare the custom implementation of RF to that of scikit-learn. 

In [15]:
# Scikit-learn implementation
from sklearn.ensemble import RandomForestClassifier as RandomForestClassifierScikit

# Initiating with the same hyperparameter values as the custom forest;
# a fixed random_state makes the fitted forest reproducible between runs
rf_scikit = RandomForestClassifierScikit(
    n_estimators=30,
    max_features=0.75,
    max_depth=3,
    min_samples_split=5,
    random_state=42,
)

# Fitting 
start = time.time()
rf_scikit.fit(X=train[features], y=train['target'])
print(f"Time took for scikit learn: {round(time.time() - start, 2)} seconds")

# Forecasting 
yhatsc = rf_scikit.predict(test[features])
test['yhatsc'] = yhatsc

print(f"Total target in test set: {test['target'].sum()}")
# Report the scikit-learn predictions here ('yhatsc'), not the custom
# forest's 'yhat' column
print(f"Total predicted target in test set: {test['yhatsc'].sum()}")

precision_scikit = precision_score(test['target'], test['yhatsc'])
recall_scikit = recall_score(test['target'], test['yhatsc'])

# Still rounded to whole percent, but formatted with one fixed decimal:
# printing round(x, 2) * 100 directly can show float noise such as
# 56.99999999999999 instead of 57.0
print(f"Precision: {round(precision_scikit, 2) * 100:.1f} %")
print(f"Recall: {round(recall_scikit, 2) * 100:.1f} %")

Time took for scikit learn: 0.05 seconds
Total target in test set: 49
Total predicted target in test set: 50
Precision: 86.0 %
Recall: 88.0 %
