In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.preprocessing import StandardScaler

# Build Model from 4 Features we Selected via Exhaustive Search

In [5]:
nflgames = pd.read_csv('nflfull.csv')
new = nflgames[['Home Win?', 'Home PF', 'Home PA', 'Home Wins to Date','Road Closing Spread']]
x = new.drop('Home Win?', 1)  # Feature Matrix
y = new['Home Win?']  # target variable

In [6]:
print(x.isnull().sum())  # no missing values in any of the columns

Home PF                0
Home PA                0
Home Wins to Date      0
Road Closing Spread    0
dtype: int64


In [7]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

In [8]:
model = RandomForestClassifier(n_estimators = 100, random_state=0, n_jobs=-1).fit(x_train, y_train)

In [9]:
y_pred = model.predict(x)
y_pred = list(y_pred)
predictions = dict()
predictions['predictions'] = y_pred
print(predictions)

{'predictions': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [10]:
print(len(predictions['predictions'])) # length of predictions is same as length of full CSV

2512


In [11]:
new['Predictions'] = predictions['predictions']
print(new)

      Home Win?  Home PF  Home PA  Home Wins to Date  Road Closing Spread  \
0             1      0.0      0.0                0.0                  5.0   
1             1      0.0      0.0                0.0                  3.0   
2             0      0.0      0.0                0.0                 -3.0   
3             1      0.0      0.0                0.0                  5.0   
4             1      0.0      0.0                0.0                 -2.0   
...         ...      ...      ...                ...                  ...   
2507          0    324.0    417.0                4.0                 -4.0   
2508          1    416.0    398.0                7.0                  1.0   
2509          1    266.0    301.0                6.0                  5.5   
2510          0    332.0    306.0                8.0                  7.0   
2511          1    384.0    372.0               11.0                 -3.5   

      Predictions  
0               1  
1               1  
2              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['Predictions'] = predictions['predictions']


# Calculate Accuracy of Predictions on all Games from Past 10 Seasons

In [12]:
correct = 0
for i in range(len(new['Home Win?'])):
    if new['Home Win?'][i] == new['Predictions'][i]:
        correct += 1

accuracy = correct / len(new['Home Win?'])
print(accuracy)

0.928343949044586


# See which Weeks Had the Highest Frequency of Incorrect Predictions

In [13]:
new['week'] = nflgames['Week']
print(new)

      Home Win?  Home PF  Home PA  Home Wins to Date  Road Closing Spread  \
0             1      0.0      0.0                0.0                  5.0   
1             1      0.0      0.0                0.0                  3.0   
2             0      0.0      0.0                0.0                 -3.0   
3             1      0.0      0.0                0.0                  5.0   
4             1      0.0      0.0                0.0                 -2.0   
...         ...      ...      ...                ...                  ...   
2507          0    324.0    417.0                4.0                 -4.0   
2508          1    416.0    398.0                7.0                  1.0   
2509          1    266.0    301.0                6.0                  5.5   
2510          0    332.0    306.0                8.0                  7.0   
2511          1    384.0    372.0               11.0                 -3.5   

      Predictions  week  
0               1     1  
1               1     1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['week'] = nflgames['Week']


In [14]:
wrong_week_frequency = dict()
for i in range(len(new['Home Win?'])):
    if new['Home Win?'][i] != new['Predictions'][i]:
        if new['week'][i] not in wrong_week_frequency.keys():
            wrong_week_frequency[new['week'][i]] = 1
        else:
            wrong_week_frequency[new['week'][i]] += 1
print(wrong_week_frequency)

{1: 35, 2: 14, 4: 9, 5: 5, 8: 9, 10: 8, 12: 11, 14: 11, 3: 7, 11: 15, 13: 8, 15: 10, 16: 10, 17: 10, 9: 9, 6: 8, 7: 1}
