## Random Forests

A comparison as to how random forests fare against the bagging. From a theoretical point of view, random forests should outperform bagging. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pydot

from IPython.display import Image
from six import StringIO  
from sklearn.tree import export_graphviz, DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

sns.set_style('whitegrid')

# This function creates images of tree models using pydot
def print_tree(estimator, features, class_names=None, filled=True):
    dot_data = StringIO()
    export_graphviz(estimator, 
                    out_file=dot_data, 
                    feature_names=features, 
                    class_names=class_names, 
                    filled=filled)
    
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    
    return graph 

In [2]:
HEART_DATA = '../datasets/Heart.csv'
df = pd.read_csv(HEART_DATA)

df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [3]:
df.shape

(303, 15)

In [4]:
df = df.dropna()

## Factorize Categorical Variables

In [5]:
CATEGORICAL = ['ChestPain', 'Thal', 'AHD']

# pd.factorize returns the code, uniques (the unique values of the categorical variable)
for cat in CATEGORICAL:
    df[cat] = pd.factorize(df[cat])[0]

## Test-Train Split

In [6]:
df_train = df.sample(200)
df_test = df[~df.index.isin(df_train.index)]

X_train, y_train = df_train.drop('AHD', axis=1), df_train['AHD']
X_test, y_test = df_test.drop('AHD', axis=1), df_test['AHD']

In [7]:
X_train.shape, y_train.shape

((200, 14), (200,))

## Bagging

In [8]:
clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), 
                        n_estimators=1000, 
                        max_samples=len(X_train), 
                        max_features=len(X_train.columns), 
                        oob_score=True, 
                        n_jobs=4)

clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=14,
                  max_samples=200, n_estimators=1000, n_jobs=4, oob_score=True)

In [9]:
clf.score(X_test, y_test)

0.7938144329896907

## Random Forests

In [10]:
clf_rf = RandomForestClassifier(n_estimators=1000, 
                                max_features='log2', 
                                oob_score=True,
                                bootstrap=True,
                                n_jobs=4)

clf_rf.fit(X_train, y_train)

RandomForestClassifier(max_features='log2', n_estimators=1000, n_jobs=4,
                       oob_score=True)

In [11]:
clf_rf.score(X_test, y_test)

0.845360824742268