In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv("nfl_matchups.csv")
df.head()

Unnamed: 0,Home,Away,Home Elo,Away Elo,Home %,Away %,Home Abbr,Away Abbr,Winner,Loser,Result,Week,Year,Home Score,Away Score,Correct
0,New England Patriots,Pittsburgh Steelers,2054.935547,2000.0,0.5784,0.4216,nwe,pit,nwe,pit,1,1,2015,28.0,21.0,1
1,Chicago Bears,Green Bay Packers,2054.935547,2000.0,0.5784,0.4216,chi,gnb,gnb,chi,0,1,2015,23.0,31.0,0
2,St. Louis Rams,Seattle Seahawks,2054.935547,2000.0,0.5784,0.4216,ram,sea,ram,sea,1,1,2015,34.0,31.0,1
3,Washington Redskins,Miami Dolphins,2054.935547,2000.0,0.5784,0.4216,was,mia,mia,was,0,1,2015,10.0,17.0,0
4,Houston Texans,Kansas City Chiefs,2054.935547,2000.0,0.5784,0.4216,htx,kan,kan,htx,0,1,2015,20.0,27.0,0


In [13]:
df['Correct'].sum()/len(df) * 100
df2 = df.copy()
df2 = df2.iloc[200:]
df2['Correct'].sum()/len(df2) * 100

64.1660015961692

In [4]:
X_cols = ["Home Elo", "Away Elo"]
y_cols = ["Result"]
X = df[X_cols]
y = df[y_cols].values.ravel()
X

Unnamed: 0,Home Elo,Away Elo
0,2054.935547,2000.000000
1,2054.935547,2000.000000
2,2054.935547,2000.000000
3,2054.935547,2000.000000
4,2054.935547,2000.000000
...,...,...
1448,1694.204315,2043.170226
1449,1905.293804,1851.169060
1450,2006.629867,2058.392984
1451,2109.407989,2026.893589


In [5]:
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)
X

array([[0.39160569, 0.41644695],
       [0.39160569, 0.41644695],
       [0.39160569, 0.41644695],
       ...,
       [0.35926547, 0.45523607],
       [0.42807452, 0.43431174],
       [0.33086256, 0.33547247]])

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [64]:
clf = LogisticRegression(max_iter=1000)
# clf = xgb.XGBClassifier(n_estimators=1000)
# clf = RandomForestClassifier(n_estimators=100)
# clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01)
# clf = MLPClassifier(max_iter=500)
# clf = BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000), max_samples=0.75, n_estimators=500, n_jobs=7)
# clf = BaggingClassifier(base_estimator=xgb.XGBClassifier(n_estimators=1000), n_estimators=300, n_jobs=7)
# clf = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=100), max_samples=0.75, n_estimators=500, n_jobs=7)

In [61]:
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [62]:
acc = metrics.accuracy_score(y_test, y_pred)
acc*100

61.23853211009175

In [63]:
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[ 57 144]
 [ 25 210]]


In [67]:
scores = cross_validate(clf, X_train, y_train, scoring='accuracy', cv=5, return_estimator=True)
clf = scores["estimator"][np.argmax(scores["test_score"])]
scores["test_score"][np.argmax(scores["test_score"])].mean(), scores["test_score"]

(0.6372549019607843,
 array([0.6372549 , 0.6372549 , 0.57142857, 0.62561576, 0.63054187]))

In [68]:
y_pred = clf.predict(X_test)
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
conf_matrix

array([[ 57, 132],
       [ 32, 215]], dtype=int64)

In [71]:
model = {'clf':clf, "X_cols":X_cols, "y_cols":y_cols, "scaler":scaler, "name":"Model 1"}

In [72]:
model

{'clf': LogisticRegression(max_iter=1000),
 'X_cols': ['Home Elo', 'Away Elo'],
 'y_cols': ['Result'],
 'scaler': MinMaxScaler(),
 'name': 'Model 1'}

In [75]:
pickle.dump(model, open("model1.pkl", "wb"))