# Decision Tree - PCA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier #Import scikit-Tree For Decision Tree
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report,confusion_matrix #import Confusion Matrix
from sklearn.model_selection import train_test_split # Splitting the data
from sklearn import preprocessing # Normalizing

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import time




In [2]:
df_o = pd.read_csv("../data/processed/data_allcolumns.csv",index_col=0)

In [3]:
# Normalizing the data
x = df_o.values #returns a numpy array
col = df_o.columns
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_n = pd.DataFrame(x_scaled, columns = col)

# df_n

In [4]:
# Separate majority and minority classes
df_majority = df_o[df_o['isFirstDown']==0]
df_minority = df_o[df_o['isFirstDown']==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123)              # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['isFirstDown'].value_counts()

1    22532
0    22532
Name: isFirstDown, dtype: int64

In [5]:
X = df_o.drop("isFirstDown",1)   #Feature Matrix
y = df_o["isFirstDown"]          #Target Variable

In [6]:
# from sklearn.model_selection import train_test_split # Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=72)

numDimensions = X_test.shape[1]

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(19957, 32)
(8554, 32)
(19957,)
(8554,)


In [7]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [8]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 18) # Where to change the number of PCA variables
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [9]:
explained_variance = pca.explained_variance_ratio_
print(explained_variance)
print (np.count_nonzero(explained_variance))

[0.07692646 0.07434848 0.0682184  0.05718669 0.05362752 0.05189172
 0.04725129 0.04384966 0.0383857  0.03529685 0.03478507 0.03318251
 0.03257513 0.03176543 0.02993404 0.02903532 0.02842683 0.02779567]
18


In [10]:
for i in range(0,np.count_nonzero(explained_variance + 1)):
    print(i," ", sum(explained_variance[0:i]))

0   0
1   0.07692645841424905
2   0.15127493989347676
3   0.21949333973173651
4   0.27668003425897564
5   0.3303075568414196
6   0.3821992779198937
7   0.42945056763533473
8   0.47330022916236114
9   0.5116859285443672
10   0.5469827821109374
11   0.5817678500487853
12   0.6149503601783621
13   0.6475254937779998
14   0.6792909284462771
15   0.7092249658096974
16   0.7382602874139399
17   0.7666871221190371


In [11]:
# Create Decision Tree classifer object
dtree = DecisionTreeClassifier(criterion="entropy")

In [12]:
# Train Decision Tree Classifer
dtree.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [13]:
#Predict the response for test dataset
y_pred = dtree.predict(X_test)

In [14]:
accText = 'Decision Tree, PCA'

In [15]:
# Model Accuracy, how often is the classifier correct?

# accuracy: (tp + tn) / (p + n)
accuracy = (accuracy_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Accuracy',accuracy))
# precision tp / (tp + fp)
precision = (precision_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Precision',precision))
# recall: tp / (tp + fn)
recall = (recall_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Recall',recall))
# f1: 2 tp / (2 tp + fp + fn)
f1 = (f1_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('F1 score',f1))
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = (auc(false_positive_rate, true_positive_rate)).astype('float64')
print('{:>10}: {:0.2%}'.format('ROC score',roc_auc))
# Root Mean Square Error
from sklearn.metrics import mean_squared_error
rmse = (mean_squared_error(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2}'.format('RMSE',rmse))

acc = pd.read_csv("../data/external/accuracies.csv", index_col=0)
acc.at[accText, 'Accuracy'] = (accuracy)
acc.at[accText, 'Precision'] = (precision)
acc.at[accText, 'Recall'] = (recall)
acc.at[accText, 'F1'] = (f1)
acc.at[accText, 'ROC'] = (roc_auc)
acc.at[accText, 'RMSE'] = (rmse)
acc.to_csv("../data/external/accuracies.csv")

  Accuracy: 72.31%
 Precision: 34.16%
    Recall: 36.88%
  F1 score: 35.47%
 ROC score: 59.20%
      RMSE: 0.28


In [16]:
#Present Confusion Matrix to show accuracy
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred, labels=[1,0]))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Confusion Matrix:

[[ 651 1114]
 [1255 5534]]

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.82      0.82      6789
           1       0.34      0.37      0.35      1765

    accuracy                           0.72      8554
   macro avg       0.59      0.59      0.59      8554
weighted avg       0.73      0.72      0.73      8554



In [17]:
features = list(X_train)
#features

In [18]:
# List of values to try for max_depth:
max_depth_range = list(range(1, numDimensions + 1))# List to store the average RMSE for each value of max_depth:
accuracy = []
for depth in max_depth_range:
    
    clf = DecisionTreeClassifier(max_depth = depth, random_state = 0)
    clf.fit(X_train, y_train)    
    score = clf.score(X_test, y_test)
    accuracy.append(score)
    
pd.DataFrame(accuracy)

Unnamed: 0,0
0,0.793664
1,0.799977
2,0.801964
3,0.79834
4,0.801964
5,0.801613
6,0.801263
7,0.797405
8,0.797288
9,0.793664
