# Random Forest - PCA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report,confusion_matrix #import Confusion Matrix
from sklearn.model_selection import train_test_split # Splitting the data
from sklearn import preprocessing # Normalizing

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import time




In [2]:
df_o = pd.read_csv("../data/processed/data_allcolumns.csv",index_col=0)

In [3]:
# Normalizing the data
x = df_o.values #returns a numpy array
col = df_o.columns
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_n = pd.DataFrame(x_scaled, columns = col)

# df_n

In [4]:
# Separate majority and minority classes
df_majority = df_o[df_o['isFirstDown']==0]
df_minority = df_o[df_o['isFirstDown']==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123)              # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['isFirstDown'].value_counts()

1    22532
0    22532
Name: isFirstDown, dtype: int64

In [5]:
X = df_o.drop("isFirstDown",1)   #Feature Matrix
y = df_o["isFirstDown"]          #Target Variable

In [6]:
# from sklearn.model_selection import train_test_split # Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=72)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(19957, 32)
(8554, 32)
(19957,)
(8554,)


In [7]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [8]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 15) # Where to change the number of PCA variables
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [9]:
explained_variance = pca.explained_variance_ratio_
print(explained_variance)
print (np.count_nonzero(explained_variance))

[0.07692644 0.07434834 0.06821839 0.05718667 0.05362709 0.05189028
 0.0472466  0.04384965 0.03838543 0.0352955  0.0347862  0.03317792
 0.03257169 0.03175068 0.02990104]
15


In [10]:
for i in range(0,np.count_nonzero(explained_variance + 1)):
    print(i," ", sum(explained_variance[0:i]))

0   0
1   0.07692643737670984
2   0.1512747809988463
3   0.21949317207100058
4   0.27667984662999845
5   0.33030693285877494
6   0.3821972149569151
7   0.42944381023939804
8   0.47329346252193527
9   0.5116788909329493
10   0.5469743956534501
11   0.581760596318623
12   0.614938517884652
13   0.6475102108901091
14   0.6792608953529834


In [11]:
# Create Decision Tree classifer object
rfc = RandomForestClassifier(n_estimators=10,criterion="entropy", max_depth=4)

In [12]:
# Train Decision Tree Classifer
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
#Predict the response for test dataset
y_pred = rfc.predict(X_test)

In [14]:
accText = 'Random Forest, PCA'

In [15]:
# Model Accuracy, how often is the classifier correct?

# accuracy: (tp + tn) / (p + n)
accuracy = (accuracy_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Accuracy',accuracy))
# precision tp / (tp + fp)
precision = (precision_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Precision',precision))
# recall: tp / (tp + fn)
recall = (recall_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('Recall',recall))
# f1: 2 tp / (2 tp + fp + fn)
f1 = (f1_score(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2%}'.format('F1 score',f1))
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = (auc(false_positive_rate, true_positive_rate)).astype('float64')
print('{:>10}: {:0.2%}'.format('ROC score',roc_auc))
# Root Mean Square Error
from sklearn.metrics import mean_squared_error
rmse = (mean_squared_error(y_test, y_pred)).astype('float64')
print('{:>10}: {:0.2}'.format('RMSE',rmse))

acc = pd.read_csv("../data/external/accuracies.csv", index_col=0)
acc.at[accText, 'Accuracy'] = (accuracy)
acc.at[accText, 'Precision'] = (precision)
acc.at[accText, 'Recall'] = (recall)
acc.at[accText, 'F1'] = (f1)
acc.at[accText, 'ROC'] = (roc_auc)
acc.at[accText, 'RMSE'] = (rmse)
acc.to_csv("../data/external/accuracies.csv")

  Accuracy: 79.38%
 Precision: 52.17%
    Recall: 0.68%
  F1 score: 1.34%
 ROC score: 50.26%
      RMSE: 0.21


In [16]:
#Present Confusion Matrix to show accuracy
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred, labels=[1,0]))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Confusion Matrix:

[[  12 1753]
 [  11 6778]]

Classification Report:

              precision    recall  f1-score   support

           0       0.79      1.00      0.88      6789
           1       0.52      0.01      0.01      1765

    accuracy                           0.79      8554
   macro avg       0.66      0.50      0.45      8554
weighted avg       0.74      0.79      0.71      8554

