In [1]:
# See: http://scikit-learn.org/stable/modules/tree.html

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris # test dataset to run classification
from sklearn import tree               # sklearn tree classifier


In [2]:
# Load the iris dataset. The returned object is dict-like; the cells below
# read its data, target, target_names and feature_names entries.
iris = load_iris()

In [3]:
# Echo the loaded object to inspect its raw contents
iris

 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
      

In [4]:
## Define sklearn decision tree
# Parameters (defaults shown; the list reflects the scikit-learn version this
# notebook was written with -- check the DecisionTreeClassifier docs for yours):
# criterion='gini' (Gini impurity) or 'entropy' (information gain)
# splitter='best' or 'random'
# max_depth=None, (max depth of the tree--num of steps from root to leaf node)
# min_samples_split=2, (min num. of samples required to do a split)
# min_samples_leaf=1, (min. num. of samples required to make a leaf node)
# min_weight_fraction_leaf=0.0,
# max_features=None,
# random_state=None, (used to seed random number generator)
# max_leaf_nodes=None,
# min_impurity_decrease=0.0,
# min_impurity_split=None,
# class_weight=None,
# presort=False

# random_state is fixed so that refitting the tree gives the same result on
# every run; without it scikit-learn's internal feature shuffling can pick a
# different split whenever two candidate splits are equally good.
clf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=5, random_state=42)

In [5]:
import random

In [6]:
# Look up the documentation of random.sample, the routine that draws the training indices below
help(random.sample)

Help on method sample in module random:

sample(self, population, k) method of random.Random instance
    Chooses k unique random elements from a population sequence.
    
    Returns a new list containing elements from the population while
    leaving the original population unchanged.  The resulting list is
    in selection order so that all sub-slices will also be valid random
    samples.  This allows raffle winners (the sample) to be partitioned
    into grand prize and second place winners (the subslices).
    
    Members of the population need not be hashable or unique.  If the
    population contains repeats, then each occurrence is a possible
    selection in the sample.
    
    To choose a sample in a range of integers, use xrange as an argument.
    This is especially fast and space efficient for sampling from a
    large population:   sample(xrange(10000000), 60)



In [7]:
# Split the iris dataset into training (80%) and testing (20%) data.

n_samples = len(iris.data)
n_train = int(n_samples * 0.8)

# A dedicated, seeded generator makes the split identical on every run and
# leaves the global `random` state untouched.
rng = random.Random(42)
tr_idx = rng.sample(range(n_samples), n_train)

# Every row index that was not drawn for training (computed once, reused below)
te_idx = np.setdiff1d(np.arange(n_samples), tr_idx)

tr_data = iris.data[tr_idx]     # training data
tr_target = iris.target[tr_idx] # training prediction target

te_data = iris.data[te_idx]     # test data
te_target = iris.target[te_idx] # test prediction target

# The two index sets must partition the dataset
assert len(tr_idx) + len(te_idx) == n_samples

In [8]:
# Train the classifier on the training split only
# tr_data   = feature values of the training rows
# tr_target = class labels of the training rows

clf = clf.fit(X=tr_data, y=tr_target) # both arguments are passed as arrays

In [9]:
# Predict the species of every held-out test sample and compare with the true label
names = np.array(iris.target_names)  # lookup: integer class label -> species name

results = pd.DataFrame({"Prediction" : names[clf.predict(te_data)], "Truth" : names[te_target]})
# Vectorized column comparison (no row-by-row apply needed)
results['Correct'] = results.Prediction == results.Truth
results

Unnamed: 0,Prediction,Truth,Correct
0,setosa,setosa,True
1,setosa,setosa,True
2,setosa,setosa,True
3,setosa,setosa,True
4,setosa,setosa,True
5,setosa,setosa,True
6,setosa,setosa,True
7,setosa,setosa,True
8,versicolor,versicolor,True
9,versicolor,versicolor,True


In [10]:
# overall misclassification rate: share of test samples whose prediction was wrong

n_wrong = int((~results["Correct"].astype(bool)).sum())
# print(...) with a single argument behaves the same on Python 2 and Python 3
print("Misclassification rate: %3.2f%%" % (float(n_wrong) / float(len(results)) * 100))

Misclassification rate: 10.00%


In [11]:
# Accuracy for each iris species: the mean of the boolean Correct column is the
# fraction of correct predictions within each true class

results.groupby('Truth')['Correct'].mean()

Truth
setosa        1.000000
versicolor    0.909091
virginica     0.818182
dtype: float64

In [12]:
# Write the fitted tree to 'iris.dat' as a Graphviz graph description,
# labelling nodes with the feature / species names and colour-filling them

tree.export_graphviz(
    decision_tree=clf,
    filled=True,
    class_names=iris.target_names,
    feature_names=iris.feature_names,
    out_file='iris.dat',
)

In [13]:
%%sh

# Print the exported graph description; copy and paste the text into http://www.webgraphviz.com/
# cat is used instead of a pager so only the file contents are printed
# (no ":::: iris.dat ::::" banner to strip before pasting)
cat iris.dat

::::::::::::::
iris.dat
::::::::::::::
digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="petal width (cm) <= 0.8\nentropy = 1.5841\nsamples = 120\nvalue = [42, 39, 39]\nclass = setosa", fillcolor="#e5813909"] ;
1 [label="entropy = 0.0\nsamples = 42\nvalue = [42, 0, 0]\nclass = setosa", fillcolor="#e58139ff"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="petal width (cm) <= 1.75\nentropy = 1.0\nsamples = 78\nvalue = [0, 39, 39]\nclass = versicolor", fillcolor="#39e58100"] ;
0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
3 [label="petal width (cm) <= 1.45\nentropy = 0.3712\nsamples = 42\nvalue = [0, 39, 3]\nclass = versicolor", fillcolor="#39e581eb"] ;
2 -> 3 ;
4 [label="entropy = 0.0\nsamples = 28\nvalue = [0, 28, 0]\nclass = versicolor", fillcolor="#39e581ff"] ;
3 -> 4 ;
5 [label="sepal length (cm) <= 5.15\nentropy = 0.7496\nsamples = 14\nvalue = [0, 11, 3]\nclass = versicolor", fillcolor="#39e581b9"] ;
3 -> 5 ;
6 [l