# Decision Tree - PCA

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier #Import scikit-Tree For Decision Tree
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report,confusion_matrix #import Confusion Matrix
from sklearn.model_selection import train_test_split # Splitting the data
from sklearn import preprocessing # Normalizing

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import time


In [19]:
df_o = pd.read_csv("../data/processed/data_allcolumns.csv",index_col=0)

In [20]:
# Normalizing the data
x = df_o.values #returns a numpy array
col = df_o.columns
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_n = pd.DataFrame(x_scaled, columns = col)

# df_n

In [21]:
# Separate majority and minority classes
df_majority = df_o[df_o['isFirstDown']==0]
df_minority = df_o[df_o['isFirstDown']==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123)              # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['isFirstDown'].value_counts()

1    22532
0    22532
Name: isFirstDown, dtype: int64

In [22]:
X = df_o.drop("isFirstDown",1)   #Feature Matrix
y = df_o["isFirstDown"]          #Target Variable

In [23]:
# from sklearn.model_selection import train_test_split # Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=72)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(19957, 33)
(8554, 33)
(19957,)
(8554,)


In [24]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [25]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 18) # Where to change the number of PCA variables
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [26]:
explained_variance = pca.explained_variance_ratio_
print(explained_variance)
print (np.count_nonzero(explained_variance))

[0.0817825  0.07209724 0.06651052 0.06411815 0.05513938 0.05180673
 0.04622901 0.04252106 0.0372542  0.0343324  0.03375665 0.03220243
 0.03163642 0.03080805 0.02903545 0.02812476 0.0275725  0.02716193]
18


In [27]:
for i in range(0,np.count_nonzero(explained_variance + 1)):
    print(i," ", sum(explained_variance[0:i]))

0   0
1   0.08178250027903043
2   0.1538797393982691
3   0.2203902571804894
4   0.28450840258495974
5   0.33964778696734355
6   0.3914545134641348
7   0.437683520876501
8   0.48020457641856706
9   0.5174587795651115
10   0.5517911839874947
11   0.5855478306681345
12   0.6177502623114015
13   0.649386680640576
14   0.6801947260634101
15   0.7092301776218016
16   0.7373549365961939
17   0.764927439241323


In [28]:
# Create Decision Tree classifer object
dtree = DecisionTreeClassifier(criterion="entropy")

In [29]:
# Train Decision Tree Classifer
dtree.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
#Predict the response for test dataset
y_pred = dtree.predict(X_test)

In [31]:
accText = 'Decision Tree, Undersampled'

In [32]:
# Model Accuracy, how often is the classifier correct?

# accuracy: (tp + tn) / (p + n)
accuracy = (accuracy_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Accuracy',accuracy))
# precision tp / (tp + fp)
precision = (precision_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Precision',precision))
# recall: tp / (tp + fn)
recall = (recall_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Recall',recall))
# f1: 2 tp / (2 tp + fp + fn)
f1 = (f1_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('F1 score',f1))
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = (auc(false_positive_rate, true_positive_rate)).astype('float64')
print('{:>10}: {:0.2%}'.format('ROC score',roc_auc))

acc = pd.read_csv("../data/external/accuracies.csv", index_col=0)
acc.at[accText, 'Accuracy'] = (accuracy)
acc.at[accText, 'Precision'] = (precision)
acc.at[accText, 'Recall'] = (recall)
acc.at[accText, 'F1'] = (f1)
acc.at[accText, 'ROC'] = (roc_auc)
acc.to_csv("../data/external/accuracies.csv")

  Accuracy: 74.26%
 Precision: 38.40%
    Recall: 40.96%
  F1 score: 39.64%
 ROC score: 61.94%


In [33]:
#Present Confusion Matrix to show accuracy
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred, labels=[1,0]))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Confusion Matrix:

[[ 723 1042]
 [1160 5629]]

Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.83      0.84      6789
           1       0.38      0.41      0.40      1765

    accuracy                           0.74      8554
   macro avg       0.61      0.62      0.62      8554
weighted avg       0.75      0.74      0.75      8554



In [34]:
features = list(X_train)
#features