In [346]:
''' 
Random Forests

    Take a bunch of decision trees trained on portions of the training set, 
    and overfit to their portions. Then random forests are the technique to
    combine these decision trees in order to reduce variance (overfitting).
    
    ALGORITHM:
       -take decision tree and sample from the dataset b times for b decision trees.
        -then train them all
        -then choose output by majority voting for discrete classes, mean for regression problems


    Bagging (Bootstrap aggregating):
    randomly sample from the trainining set and sample some number b of subsets with replacement.
    then train decision trees on each subset B (typically 100 to 1000 trees)
    Predictions can be made to rediuce variance without adding bias by:
        -continuous: average the outputs
        -discrete: take majority vote
    Note: bagging assumes no correlation between models. But there is some!!!


    Stacking:
    take output of each model in a bucker of models ensemble, 
    and pass it as input to a perceptron that combines the outputs to find a best result.
    This is demonstrably better than cross-validation selection.


ENSEMBLES

    Technically, ensemble means generating and averaging different hypotheses from the same model
    and multiple classifier systems covers hybrid systems that use diffrent learners.

    Idea:
    You can overfit on a bunch of models, and then ensemble them.
        This is better than dumbing down models in an effort to reduce overfitting

    -How many hypotheses should we combine in the ensemble?
        There is a theorem, "the law of diminishing returns in ensemble contruction" says 
        that number of classifiers = number of classes is optimal!


    -Cross-validation selection (uses cross validation to select)
        "Bucket of models" method where not sure which model works best.


    ALGORITHM
    Pseudocode:
        write a bunch of models
        break training data into two A and B
        for each model:
            train on A test on B
            record accuracy on B
        Once done, find model with best accuracy on B and use that one!

'''


' \nRandom Forests\n\n    Take a bunch of decision trees trained on portions of the training set, \n    and overfit to their portions. Then random forests are the technique to\n    combine these decision trees in order to reduce variance (overfitting).\n\n\n    Bagging (Bootstrap aggregating):\n    randomly sample from the trainining set and sample some number b of subsets with replacement.\n    then train decision trees on each subset B (typically 100 to 1000 trees)\n    Predictions can be made to rediuce variance without adding bias by:\n        -continuous: average the outputs\n        -discrete: take majority vote\n    Note: bagging assumes no correlation between models. But there is some!!!\n\n\n    Stacking:\n    take output of each model in a bucker of models ensemble, \n    and pass it as input to a perceptron that combines the outputs to find a best result.\n    This is demonstrably better than cross-validation selection.\n\n\nENSEMBLES\n\n    Technically, ensemble means gener

In [348]:
#DECISION TREE CODE

from __future__ import print_function

import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz

def get_iris_data():
    """Get the iris data, from local csv or pandas repo."""
    if os.path.exists("iris.csv"):
        print("-- iris.csv found locally")
        df = pd.read_csv("iris.csv", index_col=0)
    else:
        print("-- trying to download from github")
        fn = "https://raw.githubusercontent.com/pydata/pandas/" + \
             "master/pandas/tests/data/iris.csv"
        try:
            df = pd.read_csv(fn)
        except:
            exit("-- Unable to download iris.csv")

        with open("iris.csv", 'w') as f:
            print("-- writing to local iris.csv file")
            df.to_csv(f)

    return df

df = get_iris_data()

#encoding names to integers
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)


#print out data
df2, targets = encode_target(df, "Name")

#print feature column names
features = list(df2.columns[:4])

#writing model
y = df2["Target"]
X = df2[features]
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99) #abstracted away
dt.fit(X, y)

#output matrix
dt.apply(df2[features])

-- iris.csv found locally


array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  9,  5,  5,  5,  5,  5,  5,  7,  5,  5,  5,  5,  5,  7,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10, 10,
       10, 10, 10, 10,  6, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        7, 10, 10, 10, 10, 10, 10,  9, 10, 10,  7, 10, 10, 10,  7,  7, 10,
       10, 10,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])

In [11]:
#######################
#### RANDOM FOREST ####
#######################


In [349]:
#Building off decision tree!

import numpy as np
from scipy import stats

#running discrete random forests on iris dataset (majority voting) with 100 trees

def sample(num_sample):
    '''gathers n samples (data and label) from the dataset with replacement'''
    sampled_data = data.sample(n=num_sample, replace=True)
    sampled_label = label.ix[sampled_data.index.values.tolist()]
    return sampled_data, sampled_label

def train_tree(data, label):
    '''trains a decision tree on some data'''
    #writing model
    y = label_1
    X = data_1
    dt = DecisionTreeClassifier(min_samples_split=5, random_state=99) #abstracted away
    dt.fit(X, y)
    return dt

def eval_tree(tree, data):
    '''runs the tree on some data and returns the output'''
    result = tree.apply(data)
    return result
    
def combine_discrete(outputs):
    '''returns majority vote of outputs (array of results)'''
    mode = stats.mode(outputs)[0]
    return mode

In [350]:
#data is found in df2["Target"] and df2[features]

label = df2["Target"]
data = df2[features]

In [351]:
#some constants
num_trees = 100
num_samples_per_tree = 15

#creating decision trees for each sample and storing them in an array of classifiers
classifiers = []

#creating one dt and test it, then store it in the array.
data_1, label_1 = sample(15)
tree_1 = train_tree(data_1, label_1)

classifiers += [tree_1]

In [352]:
#working for one. now for all the trees!
classifiers = []

#train num_trees decision trees!
for i in range(num_trees):
    data_i, label_i = sample(15)
    tree_i = train_tree(data_i, label_i)
    classifiers += [tree_i]
    tree_i = 0

In [353]:
#combining outputs from decision trees evaluated on all data, tested on two
results = []

for tree in classifiers:
    results += [np.array(eval_tree(tree, data))]

In [354]:
#result from majority vote
combine_discrete(outputs=results)

array([[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4,
        3, 4, 4, 3, 3, 4, 4, 4, 4, 3, 4, 3, 4, 3, 4, 4, 3, 3, 4, 4, 4, 4,
        4, 3, 4, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 3, 3, 3, 4, 3]])