In [1]:
import pandas as pd


In [2]:
%run ../functions.ipynb

### Import data

In [3]:
#get sample dataset
test = pd.read_csv('../../data/MSampleSubmissionStage1_2020.csv')
test = process_sample_sub(test)


In [4]:
#processed data
all_games = pd.read_csv('../feature_engineering/output/all_games.csv')


### Set up

In [5]:
all_games.head()

Unnamed: 0.1,Unnamed: 0,type,ID,Pred,Season,Team1,Team2,Outcome,avg_rank_x,t1_final_rank,...,t1_OrdinalRank,t1_pre_season_top_25_flag,t2_OrdinalRank,t2_pre_season_top_25_flag,t1_adj_oe_0,t1_adj_de_0,t2_adj_oe_0,t2_adj_de_0,t1_adj_margin,t2_adj_margin
0,0,0,2003_1421_1411,,2003,1421,1411,1.0,237,67.33819,...,25.0,,25.0,,104.526528,119.437414,106.509926,111.907479,-14.910886,-5.397553
1,1,0,2003_1112_1436,,2003,1112,1436,1.0,2,95.514642,...,1.0,1.0,25.0,,119.071254,93.286794,106.884988,101.850357,25.78446,5.034632
2,2,0,2003_1113_1272,,2003,1113,1272,1.0,34,84.233153,...,25.0,,25.0,,121.219248,101.54141,114.009765,96.248862,19.677837,17.760903
3,3,0,2003_1141_1166,,2003,1141,1166,1.0,32,84.559424,...,25.0,,23.0,1.0,113.038016,105.370558,117.078846,97.251061,7.667458,19.827785
4,4,0,2003_1143_1301,,2003,1143,1301,1.0,33,84.394558,...,25.0,,25.0,,111.121076,101.087582,116.019889,102.254106,10.033494,13.765783


In [6]:
#remove tournament games we are predicting on 
all_games = all_games[(all_games.Season < 2015)]


In [7]:
all_games.columns

Index(['Unnamed: 0', 'type', 'ID', 'Pred', 'Season', 'Team1', 'Team2',
       'Outcome', 'avg_rank_x', 't1_final_rank', 'avg_rank_y', 't2_final_rank',
       't1_Seed', 't2_Seed', 'seed_diff', 't1_OrdinalRank',
       't1_pre_season_top_25_flag', 't2_OrdinalRank',
       't2_pre_season_top_25_flag', 't1_adj_oe_0', 't1_adj_de_0',
       't2_adj_oe_0', 't2_adj_de_0', 't1_adj_margin', 't2_adj_margin'],
      dtype='object')

In [8]:
feature_list = ['seed_diff',  
                't1_adj_margin','t2_adj_margin',
                't1_final_rank', 't2_final_rank',
                't1_OrdinalRank', 't2_OrdinalRank',
               ]
 
#feature_list = ['t1_adj_oe', 't2_adj_oe', 't1_adj_de', 't2_adj_de', 't1_adj_oe_120_999', 't2_adj_oe_120_999', 't1_adj_de_120_999', 't2_adj_de_120_999']

In [9]:
all_games = all_games.fillna(0)

#### Cross validation

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.feature_selection import chi2
from sklearn.metrics import r2_score
import numpy as np
from sklearn.model_selection import train_test_split

In [11]:
import warnings
warnings.filterwarnings('ignore') 

In [12]:
X = all_games[feature_list].values
y = all_games['Outcome'].values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


#### Logistic Regression

In [13]:
#Grid search to get best params
clf = LogisticRegression(random_state = 0)
params = {'C': np.logspace(start=-5, stop=3, num=9), 'penalty': ['l2', 'l1']}
clf = GridSearchCV(clf, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
best_param = clf.best_params_

#Use best params to train final model
logreg = LogisticRegression(**best_param)
logreg.fit(X_train, y_train)

#Evaluate score on test set 
#y_pred = logreg.predict_proba(X_test)
#log_loss(y_test, y_pred)



LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
import statistics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [15]:
#model = Pipeline([('scale', StandardScaler()),('logreg', LogisticRegression(**best_param))])

In [16]:
#Cross validation
seasons = list(all_games.Season.unique())

log_loss_list = []

for test_season in seasons:
    
    train_seasons = seasons.copy()
    train_seasons.remove(test_season)
 
    X_train = all_games[all_games['Season'].isin(train_seasons)][feature_list].values
    X_test = all_games[all_games.Season == test_season][feature_list].values

    y_train = all_games[all_games['Season'].isin(train_seasons)]['Outcome'].values.ravel()
    y_test = all_games[all_games.Season == test_season]['Outcome'].values.ravel()
    
    logreg = LogisticRegression(**best_param)
    logreg.fit(X_train, y_train)

    #Evaluate score on test set 
    y_pred = logreg.predict_proba(X_test)
    
    ll = log_loss(y_test, y_pred)
    log_loss_list.append(ll)
    
    
print('avg. log loss: {}'.format(sum(log_loss_list) / len(log_loss_list)))
print('min log loss: {}'.format(min(log_loss_list)))
print('max log loss: {}'.format(max(log_loss_list)))
print('std dev log loss: {}'.format(statistics.stdev(log_loss_list)))
print(log_loss_list)
    

avg. log loss: 0.5357415850410531
min log loss: 0.4514395461214332
max log loss: 0.627651988416418
std dev log loss: 0.04843936979238789
[0.5237920042565816, 0.5300941586898208, 0.5119495443359925, 0.5633991102726171, 0.4514395461214332, 0.4803304744910474, 0.49226181467464325, 0.536909874468868, 0.627651988416418, 0.5591074889268227, 0.5772740688201984, 0.5746889470181937]


In [17]:
#fit on entire dataset
logreg = LogisticRegression(**best_param)
logreg.fit(X, y)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

avg. log loss: 0.5377196208726289
min log loss: 0.4527681615412048
max log loss: 0.6301372973190598
std dev log loss: 0.04826156092665253
[0.5229902686047858, 0.5236486146281775, 0.5158111793350595, 0.56923309326896, 0.4527681615412048, 0.483523972814316, 0.5005711494650689, 0.5390678057159176, 0.6301372973190598, 0.562545719196487, 0.5826151696256913, 0.5697230189568193]


In [771]:
from numpy import loadtxt
from xgboost import XGBClassifier

In [772]:
#Grid search to get best params
clf = XGBClassifier(random_state = 0)

params = {'min_child_weight': [5],
        'gamma': [5, 10],
        'subsample': [0.8],
        'colsample_bytree': [0.4, 0.6],
        'max_depth': [1, 2, 3, 10, 20, 30, 50]
        }

clf = GridSearchCV(clf, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
best_param = clf.best_params_


In [773]:
best_param = clf.best_params_


In [774]:
best_param = {'colsample_bytree': 0.8,                 
              'learning_rate': 0.0003,
              'max_depth': 5,
              'subsample': 1,
              'objective':'binary:logistic',
              'eval_metric':'logloss',
              'min_child_weight':3,
              'gamma':0.25,
              'n_estimators':500,
              'verbosity':5
              }

In [775]:
#Cross validation
seasons = list(all_games.Season.unique())

log_loss_list = []

for test_season in seasons:
    
    train_seasons = seasons.copy()
    train_seasons.remove(test_season)
 
    X_train = all_games[all_games['Season'].isin(train_seasons)][feature_list].values
    X_test = all_games[all_games.Season == test_season][feature_list].values

    y_train = all_games[all_games['Season'].isin(train_seasons)]['Outcome'].values.ravel()
    y_test = all_games[all_games.Season == test_season]['Outcome'].values.ravel()
    
    clf = XGBClassifier(**best_param)
    clf.fit(X_train, y_train)

    #Evaluate score on test set 
    y_pred = clf.predict_proba(X_test)
    
    ll = log_loss(y_test, y_pred)
    log_loss_list.append(ll)
    
    
print('avg. log loss: {}'.format(sum(log_loss_list) / len(log_loss_list)))
print('min log loss: {}'.format(min(log_loss_list)))
print('max log loss: {}'.format(max(log_loss_list)))
print('std dev log loss: {}'.format(statistics.stdev(log_loss_list)))
print(log_loss_list)
    

avg. log loss: 0.6614626711755712
min log loss: 0.6542309042997658
max log loss: 0.670325395331454
std dev log loss: 0.005667239934436234
[0.6589280436746776, 0.6597465183585882, 0.6566117340698838, 0.6678383476100862, 0.654261922929436, 0.6572385858744383, 0.6542309042997658, 0.6642895918339491, 0.669623213472651, 0.6609168021536586, 0.6635409944982671, 0.670325395331454]


In [1239]:
avg. log loss: 0.5382626508307115


LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

#### SVC

In [448]:
''' 
from sklearn import svm
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
clf = GridSearchCV(svm.SVC(kernel='rbf', probability = True), param_grid, cv=5)
clf.fit(X, y)
clf.fit(X_train, y_train)
best_param = clf.best_params_


svc_ = svm.SVC(**best_param, probability = True)
svc_.fit(X_train, y_train)

y_pred = svc_.predict_proba(X_test)
log_loss(y_test, y_pred)
''' 


" \nfrom sklearn import svm\nCs = [0.001, 0.01, 0.1, 1, 10]\ngammas = [0.001, 0.01, 0.1, 1]\nparam_grid = {'C': Cs, 'gamma' : gammas}\nclf = GridSearchCV(svm.SVC(kernel='rbf', probability = True), param_grid, cv=5)\nclf.fit(X, y)\nclf.fit(X_train, y_train)\nbest_param = clf.best_params_\n\n\nsvc_ = svm.SVC(**best_param, probability = True)\nsvc_.fit(X_train, y_train)\n\ny_pred = svc_.predict_proba(X_test)\nlog_loss(y_test, y_pred)\n"

#### Random Forest

In [18]:
#Grid search to get best params
clf = RandomForestClassifier(random_state = 0)

params = {'min_samples_leaf': [1, 5, 15, 25, 50, 80, 100], 
          'bootstrap': [True, False],
          'max_depth': [1, 3, 5, 10, 20, None],
          'max_features': ['auto', 'sqrt'],
          'min_samples_split': [2, 5, 10],
         }

clf = GridSearchCV(clf, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
best_param = clf.best_params_

#Cross validation
seasons = list(all_games.Season.unique())

log_loss_list = []

for test_season in seasons:
    
    train_seasons = seasons.copy()
    train_seasons.remove(test_season)
 
    X_train = all_games[all_games['Season'].isin(train_seasons)][feature_list].values
    X_test = all_games[all_games.Season == test_season][feature_list].values

    y_train = all_games[all_games['Season'].isin(train_seasons)]['Outcome'].values.ravel()
    y_test = all_games[all_games.Season == test_season]['Outcome'].values.ravel()
    
    rf = RandomForestClassifier(**best_param)
    rf.fit(X_train, y_train)

    #Evaluate score on test set 
    y_pred = logreg.predict_proba(X_test)
    
    ll = log_loss(y_test, y_pred)
    log_loss_list.append(ll)
    
    
print('avg. log loss: {}'.format(sum(log_loss_list) / len(log_loss_list)))
print('min log loss: {}'.format(min(log_loss_list)))
print('max log loss: {}'.format(max(log_loss_list)))
print('std dev log loss: {}'.format(statistics.stdev(log_loss_list)))
print(log_loss_list)
    


avg. log loss: 0.531922722936705
min log loss: 0.4445775307006808
max log loss: 0.6229365622303814
std dev log loss: 0.048389749307892846
[0.5226171255883458, 0.5260895194839206, 0.5105384202401526, 0.5587125165429248, 0.4445775307006808, 0.47792357220814596, 0.4882708443748359, 0.5332024572220709, 0.6229365622303814, 0.5538069505726807, 0.5733605436225888, 0.5710366324537309]


In [1030]:
rf

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=25, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [183]:
#fit on entire dataset
rf = RandomForestClassifier(**best_param)
rf.fit(X, y)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=80, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Output model and data

In [19]:
all_games.to_csv('output/all_games.csv')

In [20]:
import pickle
    
with open("model/logreg.pkl", "wb") as file: 
    pickle.dump(logreg, file)

with open("model/rf.pkl", "wb") as file: 
    pickle.dump(rf, file)  

    