# Introduction to Random Forests

The data is from the 1994 census, and contains information on an individual's marital status, age, type of work, and more. The target column, or what we want to predict, is whether individuals make less than or equal to 50k a year, or more than 50k a year.

In [75]:
import math
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [24]:
income = pd.read_csv('income.csv')

In [25]:
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Converting the categorical variables to numeric variables

In [26]:
categorical_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'high_income']
for cat in categorical_cols:
    income[cat] = pd.Categorical(income[cat]).codes

### Splitting our data into training and testing sets

In [27]:
np.random.seed(1)
income = income.reindex(np.random.permutation(income.index))

In [28]:
split_rows = math.floor(0.8 * income.shape[0])

train = income.iloc[:split_rows]
test = income.iloc[split_rows:]

### Using multiple classifiers to make predictions

In [29]:
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

In [30]:
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf = 2)
clf.fit(train[columns], train["high_income"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [31]:
clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf2.fit(train[columns], train["high_income"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [32]:
predictions = clf.predict(test[columns])
roc_auc_score(test["high_income"], predictions)

0.6878964226062301

In [33]:
predictions = clf2.predict(test[columns])
roc_auc_score(test["high_income"], predictions)

0.6759853906508785

### Combining model predictions

In [34]:
predictions = clf.predict_proba(test[columns])[:,1]
predictions2 = clf2.predict_proba(test[columns])[:,1]

In [36]:
combined = np.round((predictions + predictions2) / 2)

In [37]:
roc_auc_score(test["high_income"], combined)

0.7150846804038882

### Introducing variation in a random forest with Bagging

In [45]:
predictions = []

for i in range(10):
    tree = train.sample(frac = 0.6, replace=True, random_state = i)
    
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
    clf.fit(tree[columns], tree["high_income"])
    
    predictions.append(clf.predict_proba(test[columns])[:,1])

predictions    

[array([0.25      , 0.33333333, 0.        , ..., 0.        , 0.66666667,
        0.33333333]),
 array([0.25, 0.  , 0.  , ..., 0.  , 0.  , 0.  ]),
 array([0.25, 0.  , 0.  , ..., 0.  , 1.  , 1.  ]),
 array([0. , 0. , 0. , ..., 0. , 0. , 0.5]),
 array([0.28571429, 0.        , 0.66666667, ..., 0.        , 0.        ,
        0.25      ]),
 array([0.        , 0.        , 0.        , ..., 0.66666667, 1.        ,
        0.5       ]),
 array([0.25      , 0.        , 0.        , ..., 1.        , 0.66666667,
        0.66666667]),
 array([0. , 0. , 0. , ..., 0. , 0. , 0.5]),
 array([0.25, 0.  , 0.  , ..., 0.  , 1.  , 1.  ]),
 array([0.33333333, 0.        , 0.66666667, ..., 1.        , 0.        ,
        0.5       ])]

In [46]:
combined = np.round(np.sum(predictions, axis=0) / 10)
combined

array([0., 0., 0., ..., 0., 0., 1.])

In [48]:
roc_auc_score(test["high_income"], combined)

0.7329963297474371

### Implementing ID3 algorithm with a subset of the features

In [64]:
def calc_entropy(column):
    probabilities = np.bincount(column) / len(column)
    
    entropy = 0
    for prob in probabilities:
        if prob > 0:
            entropy += prob * math.log(prob, 2)
    
    return -entropy

In [65]:
def calc_information_gain(data, split_name, target_name):
    original_entropy = calc_entropy(data[target_name])
    
    column = data[split_name]
    
    left = data[column <= column.median()]
    right = data[column > column.median()]
    
    to_subtract = 0
    for subset in [left, right]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    
    return original_entropy - to_subtract

In [66]:
tree = {}
nodes = []
np.random.seed(1)

def find_best_column(data, target_name, columns):
    information_gains = []
    
    cols = np.random.choice(columns, 2)
    
    for col in columns:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)
        
    highest_gain_index = information_gains.index(max(information_gains))
    highest_gain = columns[highest_gain_index]
    return highest_gain

In [67]:
def id3(data, target, columns, tree):
    unique_targets = pd.unique(data[target])
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree["label"] = 0
        elif 1 in unique_targets:
            tree["label"] = 1
        return
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    tree["column"] = best_column
    tree["median"] = column_median
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    split_dict = [["left", left_split], ["right", right_split]]
    
    for name, split in split_dict:
        tree[name] = {}
        id3(split, target, columns, tree[name])

In [68]:
data = pd.DataFrame([
    [0,4,20,0],
    [0,4,60,2],
    [0,5,40,1],
    [1,4,25,1],
    [1,5,35,2],
    [1,5,55,1]
    ])

data.columns = ["high_income", "employment", "age", "marital_status"]

In [69]:
id3(data, "high_income", ["employment", "age", "marital_status"], tree)

In [72]:
tree

{'column': 'employment',
 'left': {'column': 'age',
  'left': {'column': 'age',
   'left': {'label': 0, 'number': 4},
   'median': 22.5,
   'number': 3,
   'right': {'label': 1, 'number': 5}},
  'median': 25.0,
  'number': 2,
  'right': {'label': 0, 'number': 6}},
 'median': 4.5,
 'number': 1,
 'right': {'column': 'age',
  'left': {'column': 'age',
   'left': {'label': 1, 'number': 9},
   'median': 37.5,
   'number': 8,
   'right': {'label': 0, 'number': 10}},
  'median': 40.0,
  'number': 7,
  'right': {'label': 1, 'number': 11}}}

### Random subset selection in scikit-learn

In [74]:
predictions = []

for i in range(10):
    tree = train.sample(frac = 0.6, replace=True, random_state=i)
    
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2, splitter="random", max_features="auto")
    clf.fit(tree[columns], tree["high_income"])
    
    predictions.append(clf.predict_proba(test[columns])[:,1])

combined = np.round(np.sum(predictions, axis=0) / 10)

roc_auc_score(test["high_income"], combined)

0.7345958637997538

### Using scikit-learn to train and test random forests

In [76]:
clf = RandomForestClassifier(n_estimators=5, random_state=1, min_samples_leaf=2)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])

roc_auc_score(test["high_income"], predictions)

0.7347461391939776

### Tweaking the parameters for random forests

In [78]:
clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=2)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])

roc_auc_score(test["high_income"], predictions)

0.7379403213124711

In [79]:
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=5)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(train[columns])

roc_auc_score(train["high_income"], predictions)

0.8192570489534683


In [80]:
clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5)

clf.fit(train[columns], train["high_income"])

predictions = clf.predict(train[columns])

roc_auc_score(train["high_income"], predictions)

0.7917047295143252