# Random Forest & Ensembles

In [2]:
import numpy as np
import pandas as pd
import xlrd
import os
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,\
ExtraTreesClassifier, VotingClassifier, StackingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Objectives

- Use `sklearn` to build voting models
- Describe the algorithm of bagging
- Describe the differences among simple bagging, random forest, and extra trees algorithms
- Implement bagging models in `sklearn`

# Ensemble Methods

Because many heads are better than one!

<img width=50% src='images/captain_planet.jpg'/>

> "With our powers combined..."

These models tend to perform very well and generalize well!

## Advantages &  Disadvantages

- Decreases variance → Less overfitting!
- More complexity (you have to train each model or part of model)
- Tends to take up more space (have to keep each model)
- More complexity can lead to less interpretation
- Needs more computational power and time

## Bagging 

![](images/bag_of_marbles.jpg)

- Many models naturally overfit
- Randomize data/features → New models
- New models overfit in different ways
- Aggregation → Smooth over different ways of overfitting to reduce variance

> Low variance since it averages out quirks individual models might've learned

#### Aggregation

- **B**ootstrap **AGG**regating
- Algorithm to repeat many times:
    + Create multiple samples from your data
    + Train models (e.g. a decision tree) on those samples
- Final model predicts by averaging or voting across those many models

#### Three Varieties, Three Levels of Randomization

1. **Simple Bag**: Train each model on random sample
2. **Random Forest**: Choose a random set of features at each decision point
3. **Extra Trees**: Choose split path at random instead of based on gini or entropy (still uses random set of features)

And of course, we have a bit of control over these things via hyperparameter tuning, so there are lots more knobs to turn!
- n_estimators: number of sub-models to train
- max_features: feature subsetting
- bootstrap: random sampling or not
    - max_samples: size of samples

## Data Preparation for Examples

> Let's prepare some data to do some examples

In [3]:
# Load the cars dataset. Every column name except `mpg` carries a leading
# space (e.g. ' brand', ' cubicinches'); later cells index with those exact names.
df = pd.read_csv('data/cars.csv')
df.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           261 non-null    float64
 1    cylinders    261 non-null    int64  
 2    cubicinches  261 non-null    object 
 3    hp           261 non-null    int64  
 4    weightlbs    261 non-null    object 
 5    time-to-60   261 non-null    int64  
 6    year         261 non-null    int64  
 7    brand        261 non-null    object 
dtypes: float64(1), int64(4), object(3)
memory usage: 16.4+ KB


In [5]:
df.isna().sum().sum()

0

### Defining Our Problem

Let's see if we can predict whether a car is American or not.

In [6]:
df[' brand'].value_counts()

 US.        162
 Japan.      51
 Europe.     48
Name:  brand, dtype: int64

In [7]:
df['target'] = df[' brand'] == ' US.'

In [8]:
df.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand,target
0,14.0,8,350,165,4209,12,1972,US.,True
1,31.9,4,89,71,1925,14,1980,Europe.,False
2,17.0,8,302,140,3449,11,1971,US.,True
3,15.0,8,400,150,3761,10,1971,US.,True
4,30.5,4,98,63,2051,17,1978,US.,True


### Fix Columns with Missing Values

In [9]:
# cubicinches and weightlbs were read as strings (object dtype) and need to be
# converted to numbers. First, list the rows whose cubicinches value is a
# single space rather than a number.
df[df[' cubicinches'] == ' ']

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand,target
40,16.0,6,,105,3897,19,1976,US.,True
180,19.8,6,,85,2990,18,1980,US.,True


In [10]:
# Parse the column to numbers; blank (or otherwise unparseable) entries become NaN.
# pd.to_numeric makes this cell safe to re-run: the previous int(x) mapping
# raised ValueError on the NaN values it had itself produced on the first run.
df[' cubicinches'] = pd.to_numeric(df[' cubicinches'], errors='coerce')

In [11]:
df[df[' weightlbs'] == ' ']

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand,target
14,19.1,6,225.0,90,,19,1981,US.,True
33,21.0,6,199.0,90,,15,1971,US.,True
172,29.0,4,68.0,49,,20,1974,Europe.,False


In [12]:
# Parse the column to numbers; blank (or otherwise unparseable) entries become NaN.
# pd.to_numeric makes this cell safe to re-run: the previous int(x) mapping
# raised ValueError on the NaN values it had itself produced on the first run.
df[' weightlbs'] = pd.to_numeric(df[' weightlbs'], errors='coerce')

In [13]:
df.isna().sum()

mpg             0
 cylinders      0
 cubicinches    2
 hp             0
 weightlbs      3
 time-to-60     0
 year           0
 brand          0
target          0
dtype: int64

In [14]:
# Features are every column except the boolean target and the raw brand
# label it was derived from; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['target', ' brand']),
    df['target'],
    random_state=42,
)

In [15]:
# Learn the fill values from the training split only, then apply the same
# fitted imputer to both splits so nothing from the test set leaks in.
si = SimpleImputer()
X_tr_im = si.fit_transform(X_train)
X_te_im = si.transform(X_test)

## Bagging

A single decision tree will often overfit your training data. Let's see if we have evidence of that in the current case:

In [16]:
# Fit one seeded decision tree and report its accuracy on the very data it
# was trained on.
dt = DecisionTreeClassifier(random_state=42).fit(X_tr_im, y_train)
dt.score(X_tr_im, y_train)

1.0

In [17]:
cross_val_score(dt, X_tr_im, y_train).mean()

0.8564102564102564

<details>
    <summary><b>🧠 Knowledge Check</b>: What is this score? And why is it equal to 1?</summary>
    <br/>
    <quote>
    This perfect score on the training data is already evidence of model overfitting. There are steps one can take to help with this, like limiting the "depth" of the nodes. And of course we can use cross-validation to get a more honest estimate of model quality.
    </quote>
</details>




In [18]:
# Five-fold cross-validation of the single tree on the imputed training data
scores = cross_val_score(dt, X_tr_im, y_train, cv=5)
scores

array([0.8974359 , 0.87179487, 0.82051282, 0.8974359 , 0.79487179])

In [19]:
np.mean(scores)

0.8564102564102564

But it's often better to do something else: Plant another tree!

Of course, if a second tree is going to be of any value, it has to be *different from* the first. Here's a good algorithm for achieving that:

### Bagging Algorithm

- Take a sample of your X_train and fit a decision tree to it.
- Replace the first batch of data and repeat.
- When you've got as many trees as you like, make use of all your individual trees' predictions to come up with some holistic prediction. 
    - (Most obviously, we could take the average of our predictions, but there are other methods we might try.)
* Because we're resampling our data with replacement, we're *bootstrapping*.
* Because we're making use of our many samples' predictions, we're *aggregating*.
* Because we're bootstrapping and aggregating all in the same algorithm, we're *bagging*.

### Bagging with `sklearn`

In [20]:
# Instantiate a BaggingClassifier
# Note the base estimator is by default a decision tree
bag = BaggingClassifier(n_estimators=100, random_state=42)

In [21]:
# Fit it

bag.fit(X_tr_im, y_train)

BaggingClassifier(n_estimators=100, random_state=42)

In [22]:
# Cross-validation

scores = cross_val_score(estimator=bag, X=X_tr_im,
               y=y_train, cv=5)
scores

array([0.8974359 , 0.92307692, 0.84615385, 0.92307692, 0.82051282])

In [23]:
np.mean(scores)

0.8820512820512821

### Fitting a Random Forest

### An Aside Story - Bananas 🍌

Banana trees can be susceptible to [Panama disease](https://en.wikipedia.org/wiki/Panama_disease).

![Many individual yellow bananas](images/bananas.jpg)

They're all clones!

Similarly, all the Decision Trees will be the same if given the same data! (A clone!!!)

### The Goods & The Bads

**The Goods**

- Super friend! 
- High performance 
    + low variance
- Transparent
    + inherited from Decision Trees
    

**The Bads**

- We got so many trees to plant...
- Computationally expensive
- Memory
    + all trees stored in memory
    + think back to k-Nearest Neighbors

### Breed a Variety of Trees

Let's add an extra layer of randomization: Instead of using *all* the features of my model to optimize a branch at each node, I'll just choose a subset of my features.

That's the essence of a random forest model. Note that there are now **two** levels of random sampling happening: To build a new tree, I'll be taking only some of my data points; and at any branching point in a tree, I'll be using only some of my features to determine the split.

### Random Forest with `sklearn`

> Here's the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier) on `RandomForestClassifier`

In [24]:
X_tr_im.shape

(195, 7)

In [25]:
np.sqrt(7)

2.6457513110645907

In [26]:
# Instantiate a RandomForestClassifier

rfc = RandomForestClassifier(random_state=1)

In [27]:
# Fit it

rfc.fit(X_tr_im, y_train)

RandomForestClassifier(random_state=1)

In [28]:
# Cross-validation

scores = cross_val_score(estimator=rfc, X=X_tr_im,
               y=y_train, cv=5)
scores

array([0.84615385, 0.94871795, 0.87179487, 0.8974359 , 0.82051282])

In [29]:
np.mean(scores)

0.8769230769230768

In [30]:
rfc.score(X_tr_im, y_train)

1.0

### Cool Features of Random Forests

There are some extra investigations we can do with random forests since they're built of decision trees.

> **NOTE**
>
> Not all of these are _specific_ to random forests; some can also be applied to other (ensemble) models

#### Investigate Your Forest 🌲🌲👀🌲🌲

We can check out our trained estimators after training the ensemble. This isn't necessarily unique to random forests, but since the base model is always a decision tree we can really investigate how the model is working!

In [31]:
model_estimators = rfc.estimators_ 
print(len(model_estimators))
model_estimators

100


[DecisionTreeClassifier(max_features='auto', random_state=1791095845),
 DecisionTreeClassifier(max_features='auto', random_state=2135392491),
 DecisionTreeClassifier(max_features='auto', random_state=946286476),
 DecisionTreeClassifier(max_features='auto', random_state=1857819720),
 DecisionTreeClassifier(max_features='auto', random_state=491263),
 DecisionTreeClassifier(max_features='auto', random_state=550290313),
 DecisionTreeClassifier(max_features='auto', random_state=1298508491),
 DecisionTreeClassifier(max_features='auto', random_state=2143362693),
 DecisionTreeClassifier(max_features='auto', random_state=630311759),
 DecisionTreeClassifier(max_features='auto', random_state=1013994432),
 DecisionTreeClassifier(max_features='auto', random_state=396591248),
 DecisionTreeClassifier(max_features='auto', random_state=1703301249),
 DecisionTreeClassifier(max_features='auto', random_state=799981516),
 DecisionTreeClassifier(max_features='auto', random_state=1666063943),
 DecisionTreeCl

In [32]:
# Compare the whole forest against a few of its individual trees, all scored
# on the held-out (imputed) test split.
print(f'Overall model\'s score was {rfc.score(X_te_im, y_test):.3f}')
print('='*70)

# Only the last five fitted trees are shown to keep the output short.
for model in model_estimators[-5:]:
    display(model)
    model_score = model.score(X_te_im, y_test)
    print(f'\tModel gave score of {model_score:.3f}')

Overall model's score was 0.864


DecisionTreeClassifier(max_features='auto', random_state=1732461694)

	Model gave score of 0.773


DecisionTreeClassifier(max_features='auto', random_state=1235985687)

	Model gave score of 0.818


DecisionTreeClassifier(max_features='auto', random_state=513207677)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='auto', random_state=558468452)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='auto', random_state=106512539)

	Model gave score of 0.682


#### Feature Importance

We can use the [`.feature_importances_`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.feature_importances_) property of the trained model to get an idea of which features mattered the most.

In [33]:
# Pair each training column name with the random forest's importance score
# for that column (both sequences are in the same column order).
feat_import = dict(zip(X_train.columns, rfc.feature_importances_))
feat_import

{'mpg': 0.09155699733175204,
 ' cylinders': 0.12287901002646268,
 ' cubicinches': 0.3309205653474897,
 ' hp': 0.1307102663191352,
 ' weightlbs': 0.18587455226810898,
 ' time-to-60': 0.05895102923174991,
 ' year': 0.07910757947530153}

### Extremely Randomized Trees (Extra Trees)

Sometimes we might want even one more bit of randomization. Instead of always choosing the *optimal* branching path, we might just choose a branching path at random. If we're doing that, then we've got extremely randomized trees.

There are now **three** levels of randomization: sampling of data, sampling of features, and random selection of branching paths.

In [None]:
# Instantiate an ExtraTreesClassifier
# bootstrap=True is passed explicitly so that each tree is fit on a random
# resample of the rows (NOTE(review): presumably the default is False, so
# confirm against the ExtraTreesClassifier docs)
etc = ExtraTreesClassifier(bootstrap=True, random_state=1)

In [None]:
# Fit it

etc.fit(X_tr_im, y_train)

In [None]:
# Cross-validation

scores = cross_val_score(estimator=etc, X=X_tr_im,
               y=y_train, cv=5)
scores

In [None]:
np.mean(scores)

In [None]:
# Pair each training column name with the extra-trees model's importance
# score for that column (both sequences are in the same column order).
feat_import = dict(zip(X_train.columns, etc.feature_importances_))
feat_import

## Averaging different model types

> Each model uses the same data to train and then we "vote" to make a prediction

> Benefit here is you can 'ensemble' multiple algorithm types together

### Model 1 - Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=1000, random_state=42)

lr.fit(X_tr_im, y_train)

In [None]:
scores = cross_val_score(estimator=lr, X=X_tr_im,
                        y=y_train, cv=5)
scores

In [None]:
np.mean(scores)

In [None]:
lr.score(X_tr_im, y_train)

### Model 2 - KNN

In [None]:
# k-nearest-neighbours classifier that votes among the 3 closest training points
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_tr_im, y_train)

In [None]:
scores = cross_val_score(estimator=knn, X=X_tr_im,
                y=y_train, cv=5)
np.mean(scores)

In [None]:
knn.score(X_tr_im, y_train)

### Model 3 - Decision Tree

In [None]:
ct = DecisionTreeClassifier(random_state=42)

ct.fit(X_tr_im, y_train)

In [None]:
scores = cross_val_score(estimator=ct, X=X_tr_im,
               y=y_train, cv=5)
scores

In [None]:
np.mean(scores)

In [None]:
ct.score(X_tr_im, y_train)

### Averaging the Models

#### Building a `VotingClassifier`

> Of course there's a Scikit-Learn class for that!

In [None]:
# Combine the logistic regression, KNN and decision tree models into one
# voting ensemble. No `voting` or `weights` arguments are given, so
# scikit-learn's defaults apply and every model counts equally.
avg = VotingClassifier(estimators=[
    ('lr', lr),
    ('knn', knn),
    ('ct', ct)])

avg.fit(X_tr_im, y_train)

In [None]:
scores = cross_val_score(estimator=avg, X=X_tr_im,
               y=y_train, cv=5)
scores

In [None]:
np.mean(scores)

In [None]:
avg.score(X_te_im, y_test)

#### Weighted Averaging with the `VotingClassifier`

> Even if the vote is 50-50, you'd probably side with the "smart" ones more

This meta-estimator is not as good as one of our base estimators, so in this case the averaging did not work very well. Realizing that the logistic regression is performing better than the decision tree and the k-nearest-neighbors model, however, we might decide to build a meta-estimator by calculating a **weighted average** of the base estimators' predictions. And we can weight, or bias, this estimator in favor of the best-performing base estimator. Suppose we weight the logistic regression 50%, the knn model 25%, and the decision tree 25%:

In [None]:
# Same three base models as the unweighted ensemble, but with explicit
# weights listed in the same order as `estimators`: logistic regression
# gets 0.5, KNN and the decision tree get 0.25 each.
w_avg = VotingClassifier(estimators=[
    ('lr', lr),
    ('knn', knn),
    ('ct', ct)],
    weights=[0.5, 0.25, 0.25])
w_avg.fit(X_tr_im, y_train)

In [None]:
scores = cross_val_score(estimator=w_avg, X=X_tr_im,
                        y=y_train, cv=5)
scores

In [None]:
# Summarise the folds with the mean, the same statistic used for every other
# model in this notebook, so the weighted and unweighted ensembles are
# compared like for like (this cell previously reported the median).
np.mean(scores)

In [None]:
w_avg.score(X_te_im, y_test)

In [None]:
avg.estimators_

In [None]:
avg.estimators_[0].coef_

# Level Up: Stacking

#### Meta-Classifier/Meta-Regressor

- First, we ask several different models to make predictions about the target
- Rather than taking a simple average or vote to determine the outcome, feed these results into a final model that makes the prediction based on the other models’ predictions
- If it seems like we are approaching a neural network...you are correct!

Remember weighted averaging? Stacking is about using DS models to estimate those weights for us. This means we'll have one layer of base estimators and another layer that is "**trained to optimally combine the model predictions to form a new set of predictions**". See [this short blog post](https://blogs.sas.com/content/subconsciousmusings/2017/05/18/stacked-ensemble-models-win-data-science-competitions/) for more.

## Initial Data Prep

In [None]:
# Open the legacy .xls workbook with xlrd, sending xlrd's log output to the
# null device. The context manager closes that handle once the sheet has been
# read; previously the handle stayed open for the life of the kernel.
with open(os.devnull, 'w') as devnull:
    wb = xlrd.open_workbook('data/Sales Report.xls', logfile=devnull)
    sales = pd.read_excel(wb)

# Keep only the rows that have no missing values.
sales = sales.dropna()

In [None]:
sales.head()

In [None]:
sales.dtypes

In [None]:
sales['Category'].value_counts()

In [None]:
sales['Sub-Category'].value_counts()

In [None]:
# Column labels for the numeric and categorical features; these are handed to
# the ColumnTransformer below to route each group to its own preprocessing.
X_num = sales[['Discount', 'Profit']].columns
X_cat = sales[['Category', 'Sub-Category']].columns

In [None]:
X = sales[['Discount', 'Profit',
          'Category', 'Sub-Category']]
y = sales['Sales']

## Splitting

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Setting Up a Pipeline

In [None]:
# Numeric branch: standardize the columns.
numTrans = Pipeline(steps=[
    ('scaler', StandardScaler())
])
# Categorical branch: one-hot encode, dropping the first level of each
# feature and returning a dense array.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 and
# later removed; confirm against the installed version before upgrading.
catTrans = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first',
                          sparse=False))
])

In [None]:
pp = ColumnTransformer(transformers=[
    ('num', numTrans, X_num),
    ('cat', catTrans, X_cat)
])

In [None]:
pp.fit(X_train)

In [None]:
X_tr_pp = pp.transform(X_train)

## Setting Up a Stack

In [None]:
# Base layer of the stack: three different regressors whose predictions are
# fed to the stack's final estimator. The tree is seeded because decision
# trees break ties between equally good splits at random, so an unseeded tree
# can give different scores from run to run.
estimators = [('lr', LinearRegression()),
             ('knn', KNeighborsRegressor()),
             ('rt', DecisionTreeRegressor(random_state=42))]

# No final_estimator is passed, so scikit-learn's default is used.
sr = StackingRegressor(estimators=estimators)

In [None]:
sr_pipe = Pipeline(steps=[('tr', pp), ('sr', sr)])

In [None]:
sr_pipe

In [None]:
# Final Ridge: the betas are the weights for each base model (think voting
# classifier). Presumably x1..x3 are the predictions of the three base models.
#
#     y = b0 + b1*x1 + b2*x2 + b3*x3 + some penalty due to l2
#
# (Kept as a comment: written as code, the line above is a SyntaxError and
# stops "Run All" at this cell.)

In [None]:
sr.fit(X_tr_pp, y_train)

In [None]:
cross_val_score(sr, X_tr_pp, y_train)

In [None]:
X_test_pp = pp.transform(X_test)

In [None]:
sr.score(X_test_pp, y_test)

In [None]:
# Weights
sr.final_estimator_.coef_

In [None]:
sr.get_params()

In [None]:
sr.named_estimators_

## Comparison with Base Estimators

In [None]:
lr = LinearRegression().fit(X_tr_pp, y_train)
lr.score(X_test_pp, y_test)

In [None]:
knn = KNeighborsRegressor().fit(X_tr_pp, y_train)
knn.score(X_test_pp, y_test)

In [None]:
# Stand-alone decision tree for comparison with the stack. It is seeded (as
# the classifiers earlier in this notebook are) so the reported test score is
# reproducible; an unseeded tree breaks ties between equal splits at random.
rt = DecisionTreeRegressor(random_state=42).fit(X_tr_pp, y_train)
rt.score(X_test_pp, y_test)