# Tree-based Models

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 6)

In [2]:
from sklearn import preprocessing

In [3]:
df = pd.read_csv("../data/bank.csv")

In [4]:
df_encoded = df.apply(preprocessing.LabelEncoder().fit_transform)

In [5]:
#Let's use duration, age to build the first model
X = df_encoded[["duration", "age"]]
y = df_encoded.deposit

# Decision Tree

In [6]:
import pydotplus 
from IPython.display import Image

In [7]:
from sklearn import tree

In [9]:
#build decision tree
clf = tree.DecisionTreeClassifier(max_depth=2)

In [10]:
clf = clf.fit(X, y)

In [11]:
dot_data = tree.export_graphviz(clf, out_file='tree.dot', 
feature_names=X.columns,
class_names=['No', 'Yes'], filled=True, 
rounded=True, special_characters=True)

In [12]:
graph = pydotplus.graph_from_dot_file('tree.dot')  

In [None]:
Image(graph.create_png()) 

# Exercise

In [21]:
#1. Find accuracy metrics
#2. Run with max_depth = 4, and max_depth=None.
#3. Report accuracy metrics on them

# Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
clf = RandomForestClassifier()

In [24]:
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [25]:
clf.feature_importances_

array([ 0.79550621,  0.20449379])

### out-of-bag error

The out-of-bag (OOB) error is the average error for each training observation calculated using predictions from the trees that do not contain it in their respective bootstrap sample. This allows the RandomForest to be fit and validated whilst being trained.

In [28]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
clf2 = RandomForestClassifier(warm_start=True, oob_score=True, max_features=None)

In [30]:
clf2.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=True)

In [31]:
clf2.oob_score_

0.67299767066833904

# Exercise

In [32]:
#1. Find accuracy metrics
#2. Run with max_depth = 4, and max_depth=None.
#3. Report accuracy metrics on them

# Train-Test Split

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
test_size=0.33, random_state=42)

In [35]:
clf3 = RandomForestClassifier(warm_start=True, oob_score=True, max_features=None)

In [36]:
clf3.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=True)

In [37]:
prediction = clf3.predict_proba(X_test)

In [38]:
from sklearn.metrics import roc_auc_score

In [40]:
roc_auc_score(y_test, prediction[:,1])

0.74860945420579406

In [41]:
from sklearn.metrics import confusion_matrix

In [43]:
prediction_binary = clf3.predict(X_test)

In [45]:
confusion_matrix(y_test, prediction_binary)

array([[1382,  535],
       [ 621, 1146]])

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
accuracy_score(y_test, prediction_binary)

0.68621064060803472

# Exercise

#1. Build Random Forest model with all features 
#2. Report accuracy metrics on test