In [1]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.preprocessing import StandardScaler

In [2]:
nflgames = pd.read_csv('nflfull.csv') # read CSV file generated in datacollection.py

In [3]:
new = nflgames[['Home Win?', 'Home PF', 'Home PA', 'Home Wins to Date', 'Home Losses to Date', 'Home Ties to Date ', 'Away PF', 'Away PA', 'Away Wins to Date',
                'Away Losses to Date', 'Away Ties to Date ', 'Road Closing Spread', 'Home Closing Spread']] # choose a subset of columns for analysis

In [4]:
x = new.drop('Home Win?', 1)  # Feature Matrix
y = new['Home Win?']  # target variable

In [5]:
print(x.isnull().sum())  # no missing values in any of the columns

Home PF                0
Home PA                0
Home Wins to Date      0
Home Losses to Date    0
Home Ties to Date      0
Away PF                0
Away PA                0
Away Wins to Date      0
Away Losses to Date    0
Away Ties to Date      0
Road Closing Spread    0
Home Closing Spread    0
dtype: int64


In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0) # split into training and test sets

In [11]:
print(x_train.shape, x_test.shape)  # shows shape of train and test set

(2009, 12) (503, 12)


# Step Forward Feature Selection

In [12]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
          k_features=(1, 12),
          forward=True,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv=4,
          n_jobs=-1).fit(x_train, y_train)

print(f'\n {sfs.k_feature_names_}, {sfs.k_score_}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    3.7s remaining:    7.5s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.9s finished

[2020-05-03 23:01:56] Features: 1/12 -- score: 0.7570929007627543[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    1.3s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    1.8s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    2.6s finished

[2020-05-03 23:01:58] Features: 2/12 -- score: 0.7570929007627543[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.8s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.5s finished

[2020-05-03 23:02:01] Features: 3/12 -- score: 0.75261478935154[Parallel(n_jobs=-1)]: Using backend Lo


 ('Home Losses to Date',), 0.7570929007627543


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.7s finished

[2020-05-03 23:02:15] Features: 12/12 -- score: 0.7242382359231068

The best model with stepwise forward selection is the model with 'Home Losses to Date' as the sole predictor. The prediction score is 0.7242382359231068.

# Step Backward Feature Selection

In [15]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
          k_features=(1, 12),
          forward=False,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv=4,
          n_jobs=-1).fit(x_train, y_train)

print(f'\n {sfs.k_feature_names_}, {sfs.k_score_}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    2.1s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.7s finished

[2020-05-03 23:07:29] Features: 11/1 -- score: 0.7302113613141867[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    2.0s remaining:    9.3s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    2.4s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    3.5s finished

[2020-05-03 23:07:32] Features: 10/1 -- score: 0.733697417091079[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.4s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.5s finished

[2020-05-03 23:07:36] Features: 9/1 -- score: 0.7346914528763673[Parallel(n_jobs=-1)]: Using backend Lo


 ('Home PF', 'Home PA', 'Home Ties to Date ', 'Away PF', 'Away PA', 'Away Wins to Date', 'Away Ties to Date '), 0.7381715682003596


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.7s finished

[2020-05-03 23:07:51] Features: 1/1 -- score: 0.7108039016894647

The best model with stepwise backward selection is the model with 'Home PF', 'Home PA', 'Home Ties to Date ', 'Away PF', 'Away PA', 'Away Wins to Date', and 'Away Ties to Date ' as the predictors. The prediction score is 0.7381715682003596.

# Exhaustive Feature Selection

In [17]:
efs = EFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
          min_features=4,
          max_features=5,
          scoring='accuracy',
          cv=None,
          n_jobs=-1).fit(x_train, y_train)

print(f'\n {efs.best_feature_names_}, {efs.best_score_}')

Features: 1287/1287


 ('Home PF', 'Home PA', 'Home Wins to Date', 'Road Closing Spread'), 0.9855649576903932


The best model with exhaustive feature selection is the model with 'Home PF', 'Home PA', 'Home Wins to Date', and 'Road Closing Spread' as the predictors. The prediction score is 0.9855649576903932. This is the best model.