#  Supervised Learning Assignment

####  S. Parker 2016

This notebook will examine the phishing dataset with decision trees.  It will look at the effectiveness of the decision trees relative to:

*  Levels in the decision tree
*  Number of training samples
*  Bias vs variance calculations
*  Performance metrics (memory used, time to fit, time to predict)

Also explored will be the effect of pre-pruning the data set and how this affects performance.

In [4]:
from sklearn import tree
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn.externals.six import StringIO  
import pandas as pd
import pydotplus
import numpy as np
import os, sys
import timeit
from IPython.display import display, HTML
from IPython.display import Image 
import matplotlib.pyplot as plt
%matplotlib inline
import arff

ImportError: No module named 'pydotplus'

####  Control variables for simulation

In [None]:
# Fraction of the rows intended for the training split (the remainder is for testing)
training_test_split = 0.65

####  Helper function to allow us to get the size of the learning model

In [None]:
import sys
from numbers import Number
from collections import deque

try:  # Python 3.3+: the container ABCs live in collections.abc
    from collections.abc import Set, Mapping
except ImportError:  # Python 2
    from collections import Set, Mapping

try: # Python 2
    zero_depth_bases = (basestring, Number, xrange, bytearray)
    iteritems = 'iteritems'
except NameError: # Python 3
    zero_depth_bases = (str, bytes, Number, range, bytearray)
    iteritems = 'items'

def getsize(obj_0):
    """Estimate the in-memory size, in bytes, of an object and its members.

    Sums ``sys.getsizeof`` over ``obj_0`` and everything reachable from it
    through tuples/lists/sets/deques, mappings (or anything exposing an
    ``items``/``iteritems`` method), ``__dict__`` and ``__slots__``.  Every
    distinct object (by ``id``) is counted once, so shared or cyclic
    references are not double counted.

    Only what ``sys.getsizeof`` reports is included, so memory owned by
    extension objects that do not report it is not visible here.

    Args:
        obj_0: the root object to measure.

    Returns:
        int: the summed size in bytes.
    """
    # ids already counted during this call; a fresh set per getsize() call
    seen_ids = set()

    def inner(obj):
        obj_id = id(obj)
        if obj_id in seen_ids:
            return 0
        seen_ids.add(obj_id)
        size = sys.getsizeof(obj)
        if isinstance(obj, zero_depth_bases):
            pass # leaf types: strings, numbers, ranges are not descended into
        elif isinstance(obj, (tuple, list, Set, deque)):
            size += sum(inner(i) for i in obj)
        elif isinstance(obj, Mapping) or hasattr(obj, iteritems):
            size += sum(inner(k) + inner(v) for k, v in getattr(obj, iteritems)())
        # Check for custom object instances - may subclass above too
        if hasattr(obj, '__dict__'):
            size += inner(vars(obj))
        if hasattr(obj, '__slots__'): # can have __slots__ with __dict__
            slots = obj.__slots__
            if isinstance(slots, str):
                # a single slot may be declared as a bare string; iterating it
                # directly would walk its characters instead of the slot name
                slots = (slots,)
            size += sum(inner(getattr(obj, s)) for s in slots if hasattr(obj, s))
        return size

    return inner(obj_0)

##  Load and prepare training set #1

For this assignment I choose the following two data sets:
    
*  Phishing data set - looks at various attributes of a website address and computes whether the site is a phishing site or not

In [None]:
#  Load ARFF file
dataset_1_name = "Phishing"

# Use a context manager so the file handle is closed once parsing is done.
# NOTE(review): assumes `arff` is liac-arff, whose load() takes encode_nominal
# as its second parameter.  The previous call passed a stray, truthy 'rb' in
# that position (it was meant for open()), so encode_nominal=True keeps the
# data encoded the same way while saying so explicitly.
with open('datasets/phishing/Training Dataset.arff.txt') as arff_file:
    arff_all = arff.load(arff_file, encode_nominal=True)

#  Put data into dataframe; each attribute entry starts with the attribute name
attribute_names = [attribute[0] for attribute in arff_all["attributes"]]
df_1_all = pd.DataFrame(arff_all["data"], columns=attribute_names)

#  Split into training and testing sets (rows are split in file order, no shuffling)
split_ratio = training_test_split  # defined in the control-variables cell
split_point = int(len(df_1_all) * split_ratio)

df_training = df_1_all.iloc[:split_point]
df_testing  = df_1_all.iloc[split_point:]

In [None]:
# Preview the first rows of the loaded data set
df_1_all.head()

##  Vary Decision Tree Depth to see effect on training and testing sets

In [None]:
# Entropy-criterion decision tree with its depth capped at 3
clf = tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
)

In [None]:
# Features are every column except the last; the last column is the label.
# (A 0:-2 slice would also drop the final feature column.)
clf.fit(df_training.values[:, :-1], df_training.values[:, -1])

In [None]:
dot_data = StringIO()
# Pass only the names of the leading columns the tree was actually fitted on,
# so the name list lines up with the tree's feature indices (the full column
# list also contains the label column).
fitted_feature_names = df_training.columns.values[:clf.tree_.n_features]
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=fitted_feature_names,
                     class_names=["False", "True"],
                     filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
training_accuracy = []
testing_accuracy = []
fit_run_time = []
predict_run_time = []
memory_usage = []

depth_range = range(1, 25)
timing_repeats = 5

# Features are every column except the last; the last column is the label.
# Slicing once up front also keeps DataFrame -> array conversion out of the timings.
X_train, y_train = df_training.values[:, :-1], df_training.values[:, -1]
X_test, y_test = df_testing.values[:, :-1], df_testing.values[:, -1]

for depth in depth_range:
    clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)

    # timeit returns the total over all repeats; store the average per run (seconds)
    fit_run_time.append(
        timeit.timeit(lambda: clf.fit(X_train, y_train), number=timing_repeats) / timing_repeats)

    memory_usage.append(getsize(clf))

    training_accuracy.append(clf.score(X_train, y_train))
    testing_accuracy.append(clf.score(X_test, y_test))

    predict_run_time.append(
        timeit.timeit(lambda: clf.score(X_test, y_test), number=timing_repeats) / timing_repeats)

plt.figure(figsize=(20, 10))

plt.subplot(221)
plt.plot(depth_range, training_accuracy)
plt.plot(depth_range, testing_accuracy)
plt.title("Phishing Dataset - Depth vs. Accuracy (Entropy)")
plt.xlabel("Decision Tree Depth")
plt.ylabel("Accuracy")
plt.legend(["Training Set", "Testing Set"], loc=2)

plt.subplot(222)
plt.title("Phishing Dataset - Depth vs. Fit Runtime")
plt.plot(depth_range, np.array(fit_run_time) * 1000)  # seconds -> milliseconds
plt.ylabel("Run Time (milliseconds)")
plt.xlabel("Decision Tree Depth")

plt.subplot(223)
plt.title("Phishing Dataset - Depth vs. Predict Runtime On Test Set")
plt.plot(depth_range, np.array(predict_run_time) * 1000)  # seconds -> milliseconds
plt.ylabel("Run Time (milliseconds)")
plt.xlabel("Decision Tree Depth")

# Fourth panel shows the getsize() estimate of the fitted model, not a runtime
plt.subplot(224)
plt.title("Phishing Dataset - Depth vs. Model Memory Usage")
plt.plot(depth_range, memory_usage)
plt.ylabel("Model Size (bytes)")
plt.xlabel("Decision Tree Depth")

In [None]:
training_accuracy = []
testing_accuracy = []

percent_of_training_data = np.arange(.05,1.0,.05)

# Features are every column except the last; the last column is the label.
X_train, y_train = df_training.values[:, :-1], df_training.values[:, -1]
X_test, y_test = df_testing.values[:, :-1], df_testing.values[:, -1]

for percent in percent_of_training_data:
    # Other classifiers (e.g. ensemble.AdaBoostClassifier(), svm.SVC()) can be swapped in here
    clf = tree.DecisionTreeClassifier(criterion="entropy")

    # Fit on the leading fraction of the training rows only
    n_samples = int(percent * len(df_training))
    clf.fit(X_train[:n_samples], y_train[:n_samples])

    # Accuracy is always scored on the full training and testing sets
    training_accuracy.append(clf.score(X_train, y_train))
    testing_accuracy.append(clf.score(X_test, y_test))

plt.plot(percent_of_training_data, training_accuracy)
plt.plot(percent_of_training_data, testing_accuracy)
plt.title("Phishing Dataset - Training Data Samples vs. Accuracy (Entropy)")
plt.xlabel("Fraction of Training Data Sample Set")
plt.ylabel("Accuracy")
plt.legend(["Training Set", "Testing Set"], loc=2)

In [None]:
training_accuracy = []
testing_accuracy = []

percent_of_training_data = np.arange(.05,1.0,.05)

# Features are every column except the last; the last column is the label.
X_train, y_train = df_training.values[:, :-1], df_training.values[:, -1]
X_test, y_test = df_testing.values[:, :-1], df_testing.values[:, -1]

for percent in percent_of_training_data:
    clf = tree.DecisionTreeClassifier(criterion="gini")

    # Fit on the leading fraction of the training rows only
    n_samples = int(percent * len(df_training))
    clf.fit(X_train[:n_samples], y_train[:n_samples])

    # Accuracy is always scored on the full training and testing sets
    training_accuracy.append(clf.score(X_train, y_train))
    testing_accuracy.append(clf.score(X_test, y_test))

plt.plot(percent_of_training_data, training_accuracy)
plt.plot(percent_of_training_data, testing_accuracy)
# Title names the criterion actually used in this cell
plt.title("Phishing Dataset - Training Data Samples vs. Accuracy (Gini)")
plt.xlabel("Fraction of Training Data Sample Set")
plt.ylabel("Accuracy")
plt.legend(["Training Set", "Testing Set"], loc=2)

In [None]:
# Estimated size in bytes of whichever classifier `clf` currently refers to
getsize(clf)