In [7]:
import pandas as pd


In [8]:
%run ../functions.ipynb

### Import data

In [9]:
#get sample dataset
test = pd.read_csv('../../data/MSampleSubmissionStage1_2020.csv')
test = process_sample_sub(test)

In [11]:
#processed data
all_games = pd.read_csv('../feature_engineering/output/all_games.csv')


### Set up

In [12]:
#remove tournament games we are predicting on 
all_games = all_games[((all_games.Season >= 2015) & (all_games.TourneyGame == 0))
                  | (all_games.Season < 2015)]


### Train linear model to predict efficiencies

In [13]:
#Train linear model
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score

#measures of how your performance, your opp's performance and both of your historical competitions
reg_feature_list = ['Team1_avg_oe', 'Team1_avg_de','Team1_avg_opp_avg_oe','Team1_avg_opp_avg_de',
                           'Team1_avg_opp_avg_opp_avg_oe', 'Team1_avg_opp_avg_opp_avg_de',
                            'Team1_avg_opp_avg_opp_avg_opp_avg_oe', 'Team1_avg_opp_avg_opp_avg_opp_avg_de',
                           'Team1_avg_opp_avg_opp_avg_opp_avg_opp_avg_oe',
                               'Team1_avg_opp_avg_opp_avg_opp_avg_opp_avg_de',
                    
                   'Team2_avg_oe', 'Team2_avg_de','Team2_avg_opp_avg_oe','Team2_avg_opp_avg_de',
                           'Team2_avg_opp_avg_opp_avg_oe', 'Team2_avg_opp_avg_opp_avg_de',
                            'Team2_avg_opp_avg_opp_avg_opp_avg_oe', 'Team2_avg_opp_avg_opp_avg_opp_avg_de',
                           'Team2_avg_opp_avg_opp_avg_opp_avg_opp_avg_oe',
                               'Team2_avg_opp_avg_opp_avg_opp_avg_opp_avg_de']

all_games = all_games.dropna(subset = reg_feature_list)

df = all_games.copy()

#OE model
y = 'Team1_OffRtg'
model = linear_model.LinearRegression()

X_full = df[reg_feature_list].values
y_full = df[y].values
X_train, X_test, y_train, y_test = train_test_split(df[reg_feature_list], df[y], test_size = .2, random_state = 0)

oe_model = linear_model.LinearRegression(normalize = 'True')
oe_model.fit(X_train, y_train)

#DE Model
y = 'Team1_DefRtg'
model = linear_model.LinearRegression()

X_full = df[reg_feature_list].values
y_full = df[y].values
X_train, X_test, y_train, y_test = train_test_split(df[reg_feature_list], df[y], test_size = .2, random_state = 0)

de_model = linear_model.LinearRegression(normalize = 'True')
de_model.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize='True')

In [14]:
all_games['t1_pred_oe'] = oe_model.predict(all_games[reg_feature_list].values)
all_games['t1_pred_de'] = de_model.predict(all_games[reg_feature_list].values)

In [15]:
train = all_games[all_games.TourneyGame == 1]

### Logistic model to predict %

In [16]:
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,TourneyGame,Season,DayNum,Team1,Team2,Team1_score,Team2_score,WLoc,...,Team2_avg_opp_avg_opp_avg_opp_avg_oe,Team2_avg_opp_avg_opp_avg_opp_avg_de,Team2_opp_avg_opp_avg_opp_avg_opp_avg_oe,Team2_opp_avg_opp_avg_opp_avg_opp_avg_de,Team2_avg_opp_avg_opp_avg_opp_avg_opp_avg_oe,Team2_avg_opp_avg_opp_avg_opp_avg_opp_avg_de,Team2_Seed,Seed_Diff,t1_pred_oe,t1_pred_de
83,83,20,1,2003,137,1104,1231,62,67,N,...,105.644309,100.708799,105.24034,101.246967,104.395899,102.682732,7,3,109.064224,109.987823
283,283,1,1,2003,136,1112,1436,80,51,N,...,100.841206,104.400967,105.225967,101.301322,102.396114,103.091542,16,-15,115.220135,99.533473
284,284,33,1,2003,138,1112,1211,96,95,N,...,102.784377,103.030696,105.106742,101.446902,102.546447,103.239985,9,-8,113.888298,105.714808
285,285,49,1,2003,143,1112,1323,88,71,N,...,106.672656,100.302686,105.089853,101.522128,103.807993,102.284856,5,-4,111.031669,107.934742
286,286,57,1,2003,145,1112,1242,75,78,N,...,105.047577,103.094185,105.130242,101.542222,103.785191,102.956152,2,-1,106.53368,109.628201


In [17]:
feature_list = ['t1_pred_oe', 't1_pred_de']


#### Cross validation

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.feature_selection import chi2
from sklearn.metrics import r2_score
import numpy as np

In [19]:
import warnings
warnings.filterwarnings('ignore') 

In [20]:
X = train[feature_list].values
y = train['Outcome'].values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


#### Logistic Regression

In [21]:

#Grid search to get best params
clf = LogisticRegression(random_state = 0)
params = {'C': np.logspace(start=-5, stop=3, num=9), 'penalty': ['l2', 'l1']}
clf = GridSearchCV(clf, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
best_param = clf.best_params_

#Use best params to train final model
logreg = LogisticRegression(**best_param)
logreg.fit(X_train, y_train)

#Evaluate score on test set 
y_pred = logreg.predict_proba(X_test)
log_loss(y_test, y_pred)



0.5365725796673898

In [22]:
#fit on entire dataset
logreg = LogisticRegression(**best_param)
logreg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

#### SVC

In [23]:
''' 
from sklearn import svm
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
clf = GridSearchCV(svm.SVC(kernel='rbf', probability = True), param_grid, cv=5)
clf.fit(X, y)
clf.fit(X_train, y_train)
best_param = clf.best_params_


svc_ = svm.SVC(**best_param, probability = True)
svc_.fit(X_train, y_train)

y_pred = svc_.predict_proba(X_test)
log_loss(y_test, y_pred)
''' 


" \nfrom sklearn import svm\nCs = [0.001, 0.01, 0.1, 1, 10]\ngammas = [0.001, 0.01, 0.1, 1]\nparam_grid = {'C': Cs, 'gamma' : gammas}\nclf = GridSearchCV(svm.SVC(kernel='rbf', probability = True), param_grid, cv=5)\nclf.fit(X, y)\nclf.fit(X_train, y_train)\nbest_param = clf.best_params_\n\n\nsvc_ = svm.SVC(**best_param, probability = True)\nsvc_.fit(X_train, y_train)\n\ny_pred = svc_.predict_proba(X_test)\nlog_loss(y_test, y_pred)\n"

#### Random Forest

In [24]:
#Grid search to get best params
clf = RandomForestClassifier(random_state = 0)

params = {'min_samples_leaf': [50, 75, 100], 
          'max_depth': [4,5,6,7,8]}

clf = GridSearchCV(clf, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
best_param = clf.best_params_

#Use best params to train final model
rf = RandomForestClassifier(**best_param)
rf.fit(X_train, y_train)

#Evaluate score on test set 
y_pred = rf.predict_proba(X_test)
log_loss(y_test, y_pred)


0.5325476914140346

In [25]:
#fit on entire dataset
rf = RandomForestClassifier(**best_param)
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=50, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Output model and data

In [26]:
all_games.to_csv('output/all_games.csv')

In [28]:
import pickle

with open("model/rf.pkl", "wb") as file: 
    pickle.dump(rf, file)
    
with open("model/logreg.pkl", "wb") as file: 
    pickle.dump(logreg, file)
    
with open("model/oe_model.pkl", "wb") as file: 
    pickle.dump(oe_model, file)
    
with open("model/de_model.pkl", "wb") as file: 
    pickle.dump(de_model, file)
    