# Decision Tree - Oversampling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier #Import scikit-Tree For Decision Tree
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report,confusion_matrix #import Confusion Matrix
from sklearn.model_selection import train_test_split # Splitting the data
from sklearn import preprocessing # Normalizing

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import time




In [2]:
df_o = pd.read_csv("../data/processed/data_oversampled.csv",index_col=0)

In [3]:
# Normalizing the data
x = df_o.values #returns a numpy array
col = df_o.columns
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_n = pd.DataFrame(x_scaled, columns = col)

# df_n

In [4]:
X = df_o.drop("isFirstDown",1)   #Feature Matrix
y = df_o["isFirstDown"]          #Target Variable

In [5]:
# from sklearn.model_selection import train_test_split # Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=72)

numDimensions = X_test.shape[1]

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(31544, 27)
(13520, 27)
(31544,)
(13520,)


In [6]:
# Create Decision Tree classifer object
dtree = DecisionTreeClassifier(criterion="entropy")

In [7]:
# Train Decision Tree Classifer
dtree.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [8]:
#Predict the response for test dataset
y_pred = dtree.predict(X_test)

In [9]:
accText = 'Decision Tree, Oversampled'

In [10]:
# Model Accuracy, how often is the classifier correct?

# accuracy: (tp + tn) / (p + n)
accuracy = (accuracy_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Accuracy',accuracy))
# precision tp / (tp + fp)
precision = (precision_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Precision',precision))
# recall: tp / (tp + fn)
recall = (recall_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Recall',recall))
# f1: 2 tp / (2 tp + fp + fn)
f1 = (f1_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('F1 score',f1))
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = (auc(false_positive_rate, true_positive_rate)).astype('float64')
print('{:>10}: {:0.2%}'.format('ROC score',roc_auc))
# Root Mean Square Error
from sklearn.metrics import mean_squared_error
rmse = (mean_squared_error(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2}'.format('RMSE',rmse))

acc = pd.read_csv("../data/external/accuracies.csv", index_col=0)
acc.at[accText, 'Accuracy'] = (accuracy)
acc.at[accText, 'Precision'] = (precision)
acc.at[accText, 'Recall'] = (recall)
acc.at[accText, 'F1'] = (f1)
acc.at[accText, 'ROC'] = (roc_auc)
acc.at[accText, 'RMSE'] = (rmse)
acc.to_csv("../data/external/accuracies.csv")

  Accuracy: 88.19%
 Precision: 83.21%
    Recall: 95.96%
  F1 score: 89.13%
 ROC score: 88.11%
      RMSE: 0.12


In [11]:
#Present Confusion Matrix to show accuracy
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred, labels=[1,0]))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Confusion Matrix:

[[6549  276]
 [1321 5374]]

Classification Report:

              precision    recall  f1-score   support

           0       0.95      0.80      0.87      6695
           1       0.83      0.96      0.89      6825

    accuracy                           0.88     13520
   macro avg       0.89      0.88      0.88     13520
weighted avg       0.89      0.88      0.88     13520



In [12]:
features = list(X)
#features

In [20]:
dot_data = StringIO()  
export_graphviz(dtree, out_file=dot_data, feature_names=features, filled=True, \
                rounded=True, rotate=True, leaves_parallel=True, impurity=True, precision=3, max_depth=3)

import pydot
graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph[0].create_png(), width=3600, height=7200)
graph[0].write_png('../reports/figures/DecisionTreeOversampled.png')

In [14]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [ \
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" \
        for i in tree_.feature \
    ]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)
    
tree_to_code(dtree, features)

def tree(LineBackers, A, isHomeTeam, Distance, PlayerHeightInches, TimeSnapDiff, S, TightEnds, YardsToTouchdown, Season, TurfCode, Safeties, StadiumTypeCode, Age, IsPossTeamInLead, NoseTackles, isPlayDirectionLeft, Centers, Cornerbacks, DefensiveEnds, OffensiveGuards, HomeScoreBeforePlay, Y, Down, DefensiveTackles, Quarter, DefendersInTheBox):
  if Distance <= 5.5:
    if Distance <= 2.5:
      if A <= 1.8550000190734863:
        if S <= 4.120000123977661:
          if A <= 1.5149999856948853:
            if Distance <= 1.5:
              if Y <= 22.445000648498535:
                return [[9. 0.]]
              else:  # if Y > 22.445000648498535
                if Season <= 2017.5:
                  if HomeScoreBeforePlay <= 32.0:
                    if NoseTackles <= 0.5:
                      if HomeScoreBeforePlay <= 18.5:
                        if Y <= 30.914999961853027:
                          if isPlayDirectionLeft <= 0.5:
                            if HomeScoreBeforePlay <

                                                                return [[ 0. 10.]]
                                                          else:  # if HomeScoreBeforePlay > 8.5
                                                            return [[ 0. 50.]]
                                                      else:  # if Centers > 2.5
                                                        return [[1. 0.]]
                                          else:  # if S > 5.015000104904175
                                            return [[2. 0.]]
                                        else:  # if Y > 30.414999961853027
                                          if OffensiveGuards <= 2.5:
                                            if DefendersInTheBox <= 6.5:
                                              return [[0. 1.]]
                                            else:  # if DefendersInTheBox > 6.5
                                              return [[5. 0.]]
                             

                                    else:  # if HomeScoreBeforePlay > 29.0
                                      return [[1. 0.]]
                              else:  # if A > 3.6399999856948853
                                return [[ 0. 25.]]
                            else:  # if Safeties > 2.5
                              return [[3. 0.]]
                          else:  # if Y > 25.395000457763672
                            if Y <= 26.600000381469727:
                              return [[5. 0.]]
                            else:  # if Y > 26.600000381469727
                              if YardsToTouchdown <= 79.5:
                                if A <= 4.194999933242798:
                                  if Season <= 2018.5:
                                    if A <= 3.774999976158142:
                                      if DefensiveEnds <= 1.5:
                                        return [[6. 0.]]
                                      else:  # if DefensiveEnds > 1.5

                        if S <= 3.7450000047683716:
                          if A <= 2.625:
                            if YardsToTouchdown <= 70.5:
                              if S <= 3.334999918937683:
                                if Age <= 25.5:
                                  if S <= 3.3249999284744263:
                                    if PlayerHeightInches <= 71.5:
                                      return [[23.  0.]]
                                    else:  # if PlayerHeightInches > 71.5
                                      if DefensiveTackles <= 1.5:
                                        if YardsToTouchdown <= 61.5:
                                          return [[9. 0.]]
                                        else:  # if YardsToTouchdown > 61.5
                                          if isHomeTeam <= 0.5:
                                            return [[1. 0.]]
                                          else:  # if isHomeTeam > 0.5
                   

                                              else:  # if S > 5.570000171661377
                                                if DefendersInTheBox <= 6.5:
                                                  return [[0. 2.]]
                                                else:  # if DefendersInTheBox > 6.5
                                                  return [[6. 0.]]
                                            else:  # if Y > 27.605000495910645
                                              return [[0. 4.]]
                                          else:  # if Y > 27.859999656677246
                                            return [[22.  0.]]
                                      else:  # if Age > 21.5
                                        if DefendersInTheBox <= 6.5:
                                          if YardsToTouchdown <= 19.5:
                                            if PlayerHeightInches <= 70.5:
                                              return [[6. 0.]]
    

                                                        if A <= 3.4350000619888306:
                                                          if Age <= 25.5:
                                                            return [[21.  0.]]
                                                          else:  # if Age > 25.5
                                                            if Cornerbacks <= 2.5:
                                                              return [[11.  0.]]
                                                            else:  # if Cornerbacks > 2.5
                                                              if isHomeTeam <= 0.5:
                                                                if Y <= 30.710000038146973:
                                                                  return [[6. 0.]]
                                                                else:  # if Y > 30.710000038146973
                                                                  retu

                                                  if A <= 3.4600000381469727:
                                                    return [[4. 0.]]
                                                  else:  # if A > 3.4600000381469727
                                                    if Quarter <= 2.5:
                                                      return [[0. 9.]]
                                                    else:  # if Quarter > 2.5
                                                      if isHomeTeam <= 0.5:
                                                        return [[0. 1.]]
                                                      else:  # if isHomeTeam > 0.5
                                                        return [[2. 0.]]
                                                else:  # if Age > 28.5
                                                  return [[7. 0.]]
                                              else:  # if S > 4.235000133514404
                         

In [15]:
# The score method returns the accuracy of the model
score = dtree.score(X_test, y_test)
print(score)

0.8818786982248521


In [16]:
# List of values to try for max_depth:
max_depth_range = list(range(1, numDimensions + 1))# List to store the average RMSE for each value of max_depth:
accuracy = []
for depth in max_depth_range:
    
    clf = DecisionTreeClassifier(max_depth = depth, random_state = 0)
    clf.fit(X_train, y_train)    
    score = clf.score(X_test, y_test)
    accuracy.append(score)
    
pd.DataFrame(accuracy).head(10)

Unnamed: 0,0
0,0.721746
1,0.721746
2,0.723669
3,0.730695
4,0.734246
5,0.738462
6,0.741642
7,0.747189
8,0.751553
9,0.760947


In [17]:
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(clf.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False)
importances.to_csv("../data/external/importances.csv")
importances.head(5)

Unnamed: 0,feature,importance
3,Distance,0.248
1,A,0.11
6,S,0.108
22,Y,0.086
8,YardsToTouchdown,0.071
