In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import warnings
import statsmodels.api as sm
from sklearn.calibration import calibration_curve
from sklearn import preprocessing, svm, linear_model, model_selection
from sklearn.model_selection import learning_curve, cross_validate, ShuffleSplit, train_test_split
warnings.filterwarnings('ignore')

In [2]:
pred_set = pd.read_csv('gamesDiffs.csv')
df = pd.read_csv("NFLdataset.csv")
df['game_result'] = df['home_result'].map({'Win': 1, 'Loss': 0})

In [3]:
print(df['game_result'])

0       0
1       0
2       1
3       0
4       0
       ..
4531    1
4532    1
4533    0
4534    0
4535    0
Name: game_result, Length: 4536, dtype: int64


In [4]:
train_set = df.loc[df['date'] > '2014-08-03']
train_set = train_set.loc[train_set['date'] < '2018-08-03']

# Map win/loss to 1/0
train_set['home_result'] = train_set['home_result'].map({'Win': 1, 'Loss': 0})

# Calculate passing yardage differential
train_set['yppa_diff'] = train_set['yppa_home'] - train_set['yppa_away']

# Calculate rushing yardage differential
train_set['rush_diff'] = train_set['rush_home'] - train_set['rush_away']

# Calculate drive point differential
train_set['drive_points_diff'] = train_set['drive_points_home'] - train_set['drive_points_away']

# Calculate drive point differential
train_set['turnover_diff'] = train_set['turnovers_home'] - train_set['turnovers_away']

# Calculate sack_times differential
train_set['sack_times_diff'] = train_set['sack_times_home'] - train_set['sack_times_away']

# Calculate sack_yards differential
train_set['sack_yards_diff'] = train_set['sack_yards_home'] - train_set['sack_yards_away']

In [5]:
pred_set.rename(columns = {'d_p_diff':'drive_points_diff'}, inplace = True)
pred_set.rename(columns = {'turn_diff':'turnover_diff'}, inplace = True)
pred_set.rename(columns = {'sack_t_diff':'sack_times_diff'}, inplace = True)
pred_set.rename(columns = {'sack_y_diff':'sack_yards_diff'}, inplace = True)

In [6]:
train_set = train_set[['season', 'week', 'home', 'away', 'game_result', 'yppa_diff', 'rush_diff', 'drive_points_diff', 'turnover_diff', 'sack_yards_diff', 'sack_times_diff']]
#train_set.to_csv('C:/Users/Michael/Desktop/CASE4/Project/train_set.csv')

In [7]:
training_cols = [col for col in train_set if col.endswith('diff')]
testing_cols = [col for col in pred_set if col.endswith('diff')]
X_train = train_set[training_cols]
X_test = pred_set[testing_cols]
y_train = train_set['game_result']
y_test = pred_set['game_result']
#X_train.drop(x_train.columns[1], axis=1, inplace=True)

In [8]:
#df = df.loc[df['date'] > '2014-08-03']
#display(train_set)
df.to_csv(("original_dataset.csv"))
train_set.to_csv("diff_dataset.csv")

---------------------------
##### Our own Log Reg Model
---------------------------

In [9]:
class LogisticRegression:
    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        # Init paramaters (weights)
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Gradient Decent
        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self._sigmoid(linear_model)

            dw = (1 / n_samples) * np.dot(X.T, (y_predicted-y))
            db = (1 / n_samples) * np.sum(y_predicted-y)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return y_predicted_cls

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))


In [10]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy * 100

---------------------------
##### Our own Log Reg Model
---------------------------

In [11]:
model = LogisticRegression(lr = 0.0001, n_iters=1000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [12]:
loss = len(y_test) - 167

In [13]:
#print("          | Win | Loss | Win%")
#print("Our Model | 167 | {}  | {:.1f}% ".format(loss, accuracy(y_test, sk_predictions)))

In [14]:
# total number of games
total_games = len(y_test)

# total number of home wins
home_wins = np.sum(y_test)
#print(home_wins)
    

# total home wins/total number of games
home_win_rate = home_wins/total_games
#print(total_games)
print("Home Team Win Rate: {:.2f}% ".format(home_win_rate*100))

Home Team Win Rate: 59.18% 


In [15]:
import unittest

class Test_training_set(unittest.TestCase):
    
    # row needs to be > 3200 for training set
    def test_yppa(self, row):
        home_pass = df.loc[row, 'yppa_home']
        away_pass = df.loc[row, 'yppa_away']
        train_pass = train_set.loc[row, 'yppa_diff']
        self.assertEqual(home_pass - away_pass, train_pass)
        self.assertFalse(home_pass + away_pass ==  train_pass)

    def test_rush(self, row):
        home_rush = df.loc[row, 'rush_home']
        away_rush = df.loc[row, 'rush_away']
        train_rush = train_set.loc[row, 'rush_diff']
        self.assertEqual(home_rush - away_rush, train_rush)
        self.assertFalse(home_rush + away_rush ==  train_rush)

    def test_turnovers(self, row):
        home_turnovers = df.loc[row, 'turnovers_home']
        away_turnovers = df.loc[row, 'turnovers_away']
        train_turnovers = train_set.loc[row, 'turnover_diff']
        self.assertEqual(home_turnovers - away_turnovers, train_turnovers)
        self.assertFalse(home_turnovers + away_turnovers ==  train_turnovers)
        
if __name__ == '__main__':
    unittest.main()

E
ERROR: /home/johnd/ (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/home/johnd/'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

In [16]:
# df[yppa_home] - df[yppa_away] = yppa_diff
#print(df.loc[3201, 'yppa_home'])
#print(df.loc[3201, 'yppa_away'])
#print(train_set.loc[3201, 'yppa_diff'])
print(df.columns)

Index(['Unnamed: 0', 'date', 'week', 'season', 'away', 'home',
       'first_downs_away', 'first_downs_home', 'third_downs_away',
       'third_downs_home', 'fourth_downs_away', 'fourth_downs_home',
       'passing_yards_away', 'passing_yards_home', 'rushing_yards_away',
       'rushing_yards_home', 'total_yards_away', 'total_yards_home',
       'comp_att_away', 'comp_att_home', 'sacks_away', 'sacks_home',
       'rushing_attempts_away', 'rushing_attempts_home', 'fumbles_away',
       'fumbles_home', 'int_away', 'int_home', 'turnovers_away',
       'turnovers_home', 'penalties_away', 'penalties_home', 'redzone_away',
       'redzone_home', 'drives_away', 'drives_home', 'def_st_td_away',
       'def_st_td_home', 'away_poss', 'home_poss', 'score_away', 'score_home',
       'attempts_completed_home', 'attempts_home', 'away_attempts_completed',
       'away_attempts', 'yppa_away', 'yppa_home', 'rush_home', 'rush_away',
       'drive_points_home', 'drive_points_away', 'home_result',
       

---------------------------
#### SkLearn Log Reg Model
---------------------------

In [17]:
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(X_train, y_train)
conf = clf.decision_function(X_test)
sk_predictions = clf.predict(X_test)

In [18]:
print("SK models accuracy: ", accuracy(y_test, sk_predictions))

SK models accuracy:  62.546816479400746


---------------------------
#### SkLearn Log RegCV Model
---------------------------

In [19]:
from sklearn.linear_model import LogisticRegressionCV
modelCV = LogisticRegressionCV(cv=5, random_state=0).fit(X_train, y_train)
skCV_preds = modelCV.predict(X_test)
probsCV = clf.predict_proba(X_train)
print("SK CV models accuracy: ", accuracy(y_test, skCV_preds))

SK CV models accuracy:  62.172284644194754


In [20]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=500)
rf.fit(X_train, y_train)

#print(rf.score(X_test, y_test))
rfPred = rf.predict(X_test)
print(len(rfPred))

267


In [21]:
#X_train, X_test, y_train, y_test = train_test_split(X_test, y_test,test_size=0.2)

In [22]:
probs = rf.predict(scaler.transform(matchups))

NameError: name 'scaler' is not defined

In [23]:
display(len(X_test))

267

In [34]:
from sklearn.svm import SVC
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)

print(svm.score(X_test, y_test))
print(svm.predict(X_test))

0.5805243445692884
[0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 0
 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1
 0 1 1 1 1 0 1 0]


In [24]:
import pickle

In [25]:
with open('model_pickle1.pkl', 'wb') as f:
    pickle.dump(svm,f, protocol = 2)

TypeError: can't pickle module objects

In [26]:
with open('model_pickle1.pkl', 'rb') as f:
    mp = pickle.load(f)

EOFError: Ran out of input

In [27]:
print(mp.predict(X_test))

NameError: name 'mp' is not defined

In [28]:
from sklearn.externals import joblib
joblib.dump(svm, 'classifier.joblib', protocol=2)


TypeError: can't pickle module objects

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn import datasets


In [26]:
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)

In [27]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction))
print(accuracy(y_test, prediction))

Confusion Matrix:
[[66 43]
 [59 99]]
61.79775280898876


In [12]:
joblib.dump(classifier, 'classifier.joblib', protocol = 2)

['classifier.joblib']

In [13]:
print(prediction)

[0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 1 1
 0 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 1 0 1 1 1 0 1 1
 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 1 1 0 0
 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 0 0 1 1 1 1
 1 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 0 1 1 0
 0 1 0 1 1 0 1 1 1 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0
 0 1 1 1 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 0 0 0 0 1 1
 0 1 1 1 0 0 1 0]
