In [42]:
#decision trees:
'''
They are constructed with 'training data' (supervised), then they can be evaluated on new data. 

"The preferred sequence of attributes to investigate to most rapidly narrow down the state of X"

Decision trees are prone to overfitting, so we prune them!
    (this involves replacing some next-to-leaf nodes and their children with a single leaf node)
Deals with continuous/discrete split by thresholding continuous data into discrete bins
Uses information gain to construct tree 
    (KL divergence between prior and posterior when splitting).
    IOW finds highest KL divergence between distributions split
    between branches of the tree to decide where to split the tree!
    
    
    
Names: 
CART Classification and Regression Trees 
    These have binary splits at each node, not multiway splits
Regression Tree: decision tree where the target variable can take on continuous values




Algorithm implementation:
1. Define some cost function
    -Variance reduction
    -Information gain (KL divergence)
    -Gini impurity
2. At each step in the tree, grid search all splits to find the one that best optimizes the cost function (lowest impurity / highest gain)
3. Continue until hit some threshold number of splits, or reduced tree to leaves (single nodes at ends)
4. If necessary, prune to reduce overfitting



#They can be inspected! Not a black box model!

Note bias-variance tradeoff:

Underfitting: High bias, low variance (captures wrong 'location' of data)
Overfitting: Low bias, high variance (captures noise in data)

'''

'\nThey are constructed with \'training data\' (supervised), then they can be evaluated on new data. \n\n"The preferred sequence of attributes to investigate to most rapidly narrow down the state of X"\n\nDecision trees prone to overfitting: so we prune them!\n    (this involves replacing some next-to-leave nodes and their children with a single node)\nDeals with continuous/discrete split by thresholding continuous data into discrete bins\nUses information gain to construct tree \n    (KL divergence between prior and posterior when splitting).\n    IOW finds highest KL divergence between distributions split\n    between branches of the tree to decide where to split the tree!\n    \n    \n    \nNames: \nCART Classification and Regression Trees \n    These have binary splits on the nodes, not multiclass\nRegression Tree: decision tree where target variables can take on continuous variables\n\n\n\n\nAlgorithm implementation:\n1. Define some cost function\n    -Variance reduction\n    -Inf

In [43]:
#Implementation w/ sklearn

from __future__ import print_function

import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [44]:
#pulling sample data
def get_iris_data():
    """Get the iris data, from local csv or pandas repo.

    If ``iris.csv`` exists in the current working directory it is read from
    there, using its first column as the index. Otherwise the CSV is
    downloaded from GitHub and a copy is written to ``iris.csv``.

    Returns
    -------
    df -- pandas DataFrame parsed from the CSV.

    Raises
    ------
    SystemExit -- if the download fails.
    """
    if os.path.exists("iris.csv"):
        print("-- iris.csv found locally")
        return pd.read_csv("iris.csv", index_col=0)

    print("-- trying to download from github")
    fn = "https://raw.githubusercontent.com/pydata/pandas/" + \
         "master/pandas/tests/data/iris.csv"
    try:
        df = pd.read_csv(fn)
    except Exception:
        # Raise explicitly rather than calling the interactive ``exit``
        # helper: it is not guaranteed to exist or to raise inside a
        # notebook kernel, and if it does not, the code below would run
        # with ``df`` unbound and leave an empty iris.csv behind that
        # breaks every later run.
        raise SystemExit("-- Unable to download iris.csv")

    # Give pandas the path instead of a text-mode file handle so it controls
    # newline handling itself; the file is only created once the download
    # has succeeded.
    print("-- writing to local iris.csv file")
    df.to_csv("iris.csv")

    return df

# Load the data once here; the cells below reuse this DataFrame as `df`.
df = get_iris_data()

In [45]:
#printing some of the data
# Show the first and last rows of the frame, each under its own banner line.
for banner, rows in (("* df.head()", df.head()), ("* df.tail()", df.tail())):
    print(banner, rows, sep="\n", end="\n\n")

* df.head()
   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
3          4.6         3.1          1.5         0.2  Iris-setosa
4          5.0         3.6          1.4         0.2  Iris-setosa

* df.tail()
     SepalLength  SepalWidth  PetalLength  PetalWidth            Name
145          6.7         3.0          5.2         2.3  Iris-virginica
146          6.3         2.5          5.0         1.9  Iris-virginica
147          6.5         3.0          5.2         2.0  Iris-virginica
148          6.2         3.4          5.4         2.3  Iris-virginica
149          5.9         3.0          5.1         1.8  Iris-virginica



In [48]:
#classes
# Distinct species labels found in the Name column, in order of appearance.
species_labels = df["Name"].unique()
print("* iris types:", species_labels, sep="\n")

* iris types:
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [51]:
#encoding 
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Each distinct value of ``target_column`` gets an integer code in order
    of first appearance (0, 1, 2, ...). The input frame is not modified;
    the codes are stored in a ``Target`` column on a copy.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame (copy of df plus the Target column).
    targets -- array of unique target names; a name's position in the
               array is its integer code.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    # Use ``map`` rather than ``replace``: it builds a new integer Series
    # from the lookup table, whereas ``replace`` on a string column only
    # yields integers through implicit downcasting, which newer pandas
    # versions deprecate.
    df_mod["Target"] = df_mod[target_column].map(map_to_int)

    return (df_mod, targets)

In [52]:
#print out data
df2, targets = encode_target(df, "Name")

# Preview the integer code next to the original species name.
label_view = df2[["Target", "Name"]]
print("* df2.head()", label_view.head(), sep="\n", end="\n\n")
print("* df2.tail()", label_view.tail(), sep="\n", end="\n\n")
print("* targets", targets, sep="\n", end="\n\n")


#print feature column names
# The first four columns are taken as the model inputs.
features = df2.columns[:4].tolist()
print("* features:", features, sep="\n")

* df2.head()
   Target         Name
0       0  Iris-setosa
1       0  Iris-setosa
2       0  Iris-setosa
3       0  Iris-setosa
4       0  Iris-setosa

* df2.tail()
     Target            Name
145       2  Iris-virginica
146       2  Iris-virginica
147       2  Iris-virginica
148       2  Iris-virginica
149       2  Iris-virginica

* targets
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

* features:
['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']


In [None]:
###############
#### MODEL ####
###############

In [53]:
#writing model
# Inputs are the feature columns; labels are the integer-encoded Target
# column. The tree is fitted on every row (no held-out split here).
X, y = df2[features], df2["Target"]
dt = DecisionTreeClassifier(random_state=99, min_samples_split=20) #abstracted away
dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best')

In [54]:
#saving visualization
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Writes the tree in DOT format to ``dt.dot`` and then runs the graphviz
    ``dot`` executable to render it to ``dt.png``; both paths are relative
    to the current working directory.

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.

    Raises
    ------
    SystemExit -- if ``dot`` cannot be started or exits with an error.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError):
        # OSError: the executable could not be launched (e.g. graphviz is
        # not installed); CalledProcessError: dot returned non-zero.
        # Raise explicitly instead of calling the interactive ``exit``
        # helper, and let unrelated exceptions such as KeyboardInterrupt
        # propagate untouched.
        raise SystemExit("Could not run dot, ie graphviz, to "
                         "produce visualization")
        
        
# Render the fitted tree; this writes dt.dot and dt.png to the working directory.
visualize_tree(dt, features)

In [55]:
#leaf assignments: apply() returns, for each row, the index of the leaf node
#that sample ends up in (see DecisionTreeClassifier.apply) -- not class predictions
dt.apply(df2[features])

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  9,  5,  5,  5,  5,  5,  5,  7,  5,  5,  5,  5,  5,  7,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10, 10,
       10, 10, 10, 10,  6, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        7, 10, 10, 10, 10, 10, 10,  9, 10, 10,  7, 10, 10, 10,  7,  7, 10,
       10, 10,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])