In [77]:
import pandas as pd

In [78]:
# Load the modelling table; the first CSV column is used as the row index.
df = pd.read_csv("../model_data_more_features.csv", index_col=[0])

# Predictor columns kept for modelling.
# NOTE(review): the "_Diff" suffix presumably means a Team A vs. Team B difference — confirm
# against the script that builds this CSV.
feature_cols = [
    'AdjO_Diff',
    'AdjD_Diff',
    'SOS_Diff (AdjEM)',
    'R+T_Diff',
    '3Pt%_Diff',
    'OReb%_Diff',
    '2Pt%D_Diff',
    'FTR_Diff',
    'Scoring_Margin_Diff',
]
# Label column that the classifiers below are trained to predict.
target_col = 'Outcome'

# Restrict the frame to predictors followed by the label, in that order.
cols = feature_cols + [target_col]
df = df.loc[:, cols]
df

Unnamed: 0,AdjO_Diff,AdjD_Diff,SOS_Diff (AdjEM),R+T_Diff,3Pt%_Diff,OReb%_Diff,2Pt%D_Diff,FTR_Diff,Scoring_Margin_Diff,Outcome
0,27.3,-13.7,18.44,14.0,-0.3,1.5,-0.6,-1.9,17.0,1
1,14.3,-5.2,-2.94,15.8,2.7,3.1,0.5,7.4,17.4,1
2,13.0,-2.9,-1.84,17.5,-0.5,5.6,0.7,9.7,14.3,1
3,10.7,1.8,-4.92,5.1,1.9,-5.2,4.6,-0.9,13.3,1
4,9.5,-4.6,-6.03,9.5,-2.5,0.2,-2.6,4.0,18.7,1
...,...,...,...,...,...,...,...,...,...,...
393,5.3,3.1,-1.28,-0.9,-4.1,1.1,-0.3,8.6,3.7,1
394,-17.8,16.3,-24.17,-1.1,-9.1,3.3,3.0,9.3,-5.8,0
395,-0.7,-0.5,4.26,-2.8,-5.1,1.0,0.7,-4.2,0.4,0
396,-28.0,8.9,-18.34,-17.5,-9.1,-12.7,2.6,1.7,-15.4,0


In [79]:
from sklearn.model_selection import train_test_split

# Predictors: every column except the label.
X = df.drop(columns='Outcome')
# Label is kept as a one-column DataFrame (double brackets), not a Series.
# NOTE(review): some estimators warn about column-vector targets; switch to df['Outcome']
# only after confirming no other cell relies on the DataFrame shape.
y = df[['Outcome']]

# Fixed 65/35 split; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.65,
    test_size=0.35,
    random_state=42,
)

In [80]:
import xgboost as xgb 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt

# Baseline booster. n_estimators=1000 is only an upper bound: the number of
# trees actually used is taken from the early-stopped cross-validation below.
xgb1 = XGBClassifier(
    learning_rate = 0.1,
    n_estimators = 1000,
    max_depth = 5,
    min_child_weight = 1,
    gamma = 0,
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = "binary:logistic",
    nthread = 4,
    scale_pos_weight = 1,
    seed = 27
)

# 5-fold CV through the native API, tracking AUC and stopping once the CV
# score has not improved for 50 rounds. cvresult has one row per boosting
# round that xgb.cv kept, so its length is used as the tree count.
xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
    metrics='auc', early_stopping_rounds=50, verbose_eval=10)

# eval_metric is configured on the estimator rather than passed to fit():
# the fit() keyword is deprecated since XGBoost 1.6 and rejected by newer
# releases. No eval_set is supplied, so the metric does not affect the trees.
xgb1.set_params(n_estimators=cvresult.shape[0], eval_metric='auc')
xgb1.fit(X_train, y_train)

# Hard class labels and positive-class probabilities for both partitions.
dtrain_predictions = xgb1.predict(X_train)
dtrain_predprob = xgb1.predict_proba(X_train)[:,1]

dtest_predictions = xgb1.predict(X_test)
dtest_predprob = xgb1.predict_proba(X_test)[:,1]

# NOTE(review): X_test is scored again after each later tuning step in this
# notebook, so the test figures are optimistic as a final estimate.
print("\nModel Report")
print("Accuracy (Train) : %.4g" % accuracy_score(y_train, dtrain_predictions))
print("AUC Score (Train): %f" % roc_auc_score(y_train, dtrain_predprob))
print("Accuracy (Test) : %.4g" % accuracy_score(y_test, dtest_predictions))
print("AUC Score (Test): %f" % roc_auc_score(y_test, dtest_predprob))

[0]	train-auc:0.88412+0.01691	test-auc:0.70577+0.05440


[10]	train-auc:0.98035+0.00395	test-auc:0.78128+0.03545
[20]	train-auc:0.99401+0.00125	test-auc:0.79297+0.03713
[30]	train-auc:0.99791+0.00117	test-auc:0.80676+0.02998
[40]	train-auc:0.99942+0.00034	test-auc:0.82126+0.03303
[50]	train-auc:0.99998+0.00004	test-auc:0.81905+0.03161
[60]	train-auc:1.00000+0.00000	test-auc:0.82117+0.02969
[70]	train-auc:1.00000+0.00000	test-auc:0.82359+0.02771
[80]	train-auc:1.00000+0.00000	test-auc:0.82486+0.03031
[90]	train-auc:1.00000+0.00000	test-auc:0.82540+0.03070
[100]	train-auc:1.00000+0.00000	test-auc:0.82821+0.03100
[110]	train-auc:1.00000+0.00000	test-auc:0.83125+0.03012
[120]	train-auc:1.00000+0.00000	test-auc:0.82919+0.03276
[130]	train-auc:1.00000+0.00000	test-auc:0.83228+0.03519
[140]	train-auc:1.00000+0.00000	test-auc:0.83409+0.03252
[150]	train-auc:1.00000+0.00000	test-auc:0.83476+0.03546
[160]	train-auc:1.00000+0.00000	test-auc:0.83379+0.03340
[170]	train-auc:1.00000+0.00000	test-auc:0.83292+0.03547
[180]	train-auc:1.00000+0.00000	test-auc



AUC Score (Train): 1.000000
Accuracy (Test) : 0.8571
AUC Score (Test): 0.929276


In [84]:
from sklearn.model_selection import GridSearchCV

# Tuning step 1: tree shape. max_depth 1..8 crossed with min_child_weight 1..3.
param_test1 = {
    'max_depth': range(1, 9),
    'min_child_weight': range(1, 4),
}

# All other hyper-parameters stay fixed while the grid is searched.
# NOTE(review): n_estimators=271 is presumably the round count found by the
# early-stopped xgb.cv run for xgb1 — confirm before re-running on new data.
estimator1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=271,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# 5-fold grid search scored by ROC-AUC on the training partition only.
gsearch1 = GridSearchCV(
    estimator=estimator1,
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch1.fit(X_train, y_train)

# Display the raw CV table, the winning setting and its mean CV score.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([0.20075197, 0.15795031, 0.13862529, 0.16162534, 0.16346903,
         0.16325603, 0.20670738, 0.1971673 , 0.15560875, 0.1915329 ,
         0.16958003, 0.15372424, 0.23468413, 0.2173429 , 0.16511216,
         0.21589236, 0.22703829, 0.24738522, 0.20224566, 0.17708278,
         0.15595179, 0.19684658, 0.17684574, 0.1497961 ]),
  'std_fit_time': array([0.0164089 , 0.01607357, 0.01431174, 0.01228931, 0.01271346,
         0.01000751, 0.00997878, 0.01908435, 0.01237088, 0.00507797,
         0.00771874, 0.01403294, 0.02084827, 0.04607166, 0.01933398,
         0.03472289, 0.02150327, 0.02746704, 0.01207408, 0.00896684,
         0.02083204, 0.00454828, 0.01784606, 0.00765324]),
  'mean_score_time': array([0.01230321, 0.00979075, 0.01538782, 0.01166911, 0.00963993,
         0.01303282, 0.01130095, 0.01449318, 0.01423855, 0.01292286,
         0.01151738, 0.01221294, 0.01153607, 0.01113501, 0.01353102,
         0.01116481, 0.01417475, 0.01334119, 0.01403093, 0.01092825,
  

In [85]:
# Tuning step 2: minimum split loss (gamma), 0.0 to 0.4 in steps of 0.1.
param_test2 = {
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

# Same fixed settings as the previous search, except max_depth is now 6.
# NOTE(review): max_depth=6 is presumably the winner of the max_depth /
# min_child_weight search — confirm against gsearch1.best_params_.
estimator2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=271,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# 5-fold grid search scored by ROC-AUC on the training partition only.
gsearch2 = GridSearchCV(
    estimator=estimator2,
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch2.fit(X_train, y_train)

# Display the raw CV table, the winning setting and its mean CV score.
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

({'mean_fit_time': array([0.41831799, 0.32076187, 0.16592603, 0.16775169, 0.13974357]),
  'std_fit_time': array([0.01061066, 0.10096802, 0.00805733, 0.01056965, 0.0106047 ]),
  'mean_score_time': array([0.01226168, 0.01012697, 0.0099298 , 0.01132846, 0.01042113]),
  'std_score_time': array([0.00169623, 0.0010246 , 0.00102286, 0.00264234, 0.00358704]),
  'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'gamma': 0.0},
   {'gamma': 0.1},
   {'gamma': 0.2},
   {'gamma': 0.3},
   {'gamma': 0.4}],
  'split0_test_score': array([0.83851852, 0.83703704, 0.84      , 0.82518519, 0.81333333]),
  'split1_test_score': array([0.88      , 0.89037037, 0.88444444, 0.89185185, 0.87703704]),
  'split2_test_score': array([0.85333333, 0.8562963 , 0.85185185, 0.86666667, 0.85333333]),
  'split3_test_score': array([0.83796296, 0.83179012, 0.82716049, 0.82716049, 0.83179012]),

In [86]:
# Second model: same settings as the baseline but with max_depth raised to 6.
# n_estimators=1000 is only an upper bound; the early-stopped CV below picks
# the number of trees actually used.
xgb2 = XGBClassifier(
    learning_rate = 0.1,
    n_estimators = 1000,
    max_depth = 6,
    min_child_weight = 1,
    gamma = 0,
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = "binary:logistic",
    nthread = 4,
    scale_pos_weight = 1,
    seed = 27
)

# 5-fold CV through the native API, tracking AUC and stopping once the CV
# score has not improved for 50 rounds. cvresult has one row per boosting
# round that xgb.cv kept, so its length is used as the tree count.
xgb_param = xgb2.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb2.get_params()['n_estimators'], nfold=5,
    metrics='auc', early_stopping_rounds=50, verbose_eval=10)

# eval_metric is configured on the estimator rather than passed to fit():
# the fit() keyword is deprecated since XGBoost 1.6 and rejected by newer
# releases. No eval_set is supplied, so the metric does not affect the trees.
xgb2.set_params(n_estimators=cvresult.shape[0], eval_metric='auc')
xgb2.fit(X_train, y_train)

# Hard class labels and positive-class probabilities for both partitions.
dtrain_predictions = xgb2.predict(X_train)
dtrain_predprob = xgb2.predict_proba(X_train)[:,1]

dtest_predictions = xgb2.predict(X_test)
dtest_predprob = xgb2.predict_proba(X_test)[:,1]

print("\nModel Report")
print("Accuracy (Train) : %.4g" % accuracy_score(y_train, dtrain_predictions))
print("AUC Score (Train): %f" % roc_auc_score(y_train, dtrain_predprob))
print("Accuracy (Test) : %.4g" % accuracy_score(y_test, dtest_predictions))
print("AUC Score (Test): %f" % roc_auc_score(y_test, dtest_predprob))

[0]	train-auc:0.88562+0.01800	test-auc:0.70810+0.05573
[10]	train-auc:0.98718+0.00295	test-auc:0.78092+0.04184


[20]	train-auc:0.99699+0.00105	test-auc:0.80084+0.04118
[30]	train-auc:0.99885+0.00074	test-auc:0.80894+0.03808
[40]	train-auc:0.99955+0.00037	test-auc:0.82164+0.03934
[50]	train-auc:1.00000+0.00000	test-auc:0.82544+0.04397
[60]	train-auc:1.00000+0.00000	test-auc:0.82407+0.03990
[70]	train-auc:1.00000+0.00000	test-auc:0.82753+0.03905
[80]	train-auc:1.00000+0.00000	test-auc:0.82629+0.03815
[90]	train-auc:1.00000+0.00000	test-auc:0.82783+0.03763
[100]	train-auc:1.00000+0.00000	test-auc:0.83116+0.03932
[110]	train-auc:1.00000+0.00000	test-auc:0.83418+0.04015
[120]	train-auc:1.00000+0.00000	test-auc:0.83394+0.03969
[130]	train-auc:1.00000+0.00000	test-auc:0.83429+0.04047
[140]	train-auc:1.00000+0.00000	test-auc:0.83364+0.03980
[150]	train-auc:1.00000+0.00000	test-auc:0.83487+0.03939
[160]	train-auc:1.00000+0.00000	test-auc:0.83519+0.04192
[170]	train-auc:1.00000+0.00000	test-auc:0.83332+0.03913
[180]	train-auc:1.00000+0.00000	test-auc:0.83547+0.03917
[190]	train-auc:1.00000+0.00000	test-au




Model Report
Accuracy (Train) : 1
AUC Score (Train): 1.000000
Accuracy (Test) : 0.85
AUC Score (Test): 0.932771


In [87]:
# Tuning step 3 (coarse): row and column sampling ratios, 0.5 to 0.9 in steps of 0.1.
param_test3 = {
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
}

# All other hyper-parameters stay fixed while the grid is searched.
# NOTE(review): n_estimators=259 is presumably the round count found by the
# early-stopped xgb.cv run for xgb2 — confirm before re-running on new data.
estimator3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=259,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# 5-fold grid search scored by ROC-AUC on the training partition only.
gsearch3 = GridSearchCV(
    estimator=estimator3,
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch3.fit(X_train, y_train)

# Display the raw CV table, the winning setting and its mean CV score.
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

({'mean_fit_time': array([0.3798594 , 0.2383781 , 0.19953337, 0.19788022, 0.19724665,
         0.18695006, 0.22454567, 0.30096025, 0.19961619, 0.24050212,
         0.21064105, 0.18277817, 0.2677505 , 0.23163676, 0.2554059 ,
         0.20710149, 0.20259728, 0.21755123, 0.22108884, 0.21302319,
         0.22333236, 0.19223895, 0.25790677, 0.21221328, 0.19367085]),
  'std_fit_time': array([0.05594044, 0.03986699, 0.01526603, 0.01326252, 0.01136216,
         0.0055691 , 0.02603405, 0.05932864, 0.01749761, 0.04296564,
         0.03660924, 0.01570327, 0.05186819, 0.01083564, 0.01774182,
         0.02691032, 0.01018992, 0.01773265, 0.01335083, 0.0150004 ,
         0.02228836, 0.01262578, 0.01985611, 0.01033968, 0.02162213]),
  'mean_score_time': array([0.01046391, 0.0114871 , 0.01075692, 0.00973301, 0.01162944,
         0.01453538, 0.01358681, 0.01113544, 0.01173038, 0.01241488,
         0.01268687, 0.01113219, 0.0113131 , 0.01466308, 0.01189632,
         0.01463237, 0.01269102, 0.01126461, 0.

In [88]:
# Tuning step 4 (fine): sampling ratios on a 0.05 grid.
# subsample covers 0.40..0.65 and colsample_bytree covers 0.70..0.85.
param_test4 = {
    'subsample': [0.4, 0.45, 0.5, 0.55, 0.6, 0.65],
    'colsample_bytree': [0.7, 0.75, 0.8, 0.85],
}

# All other hyper-parameters stay fixed while the grid is searched.
estimator4 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=259,
    max_depth=6,
    min_child_weight=1,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# 5-fold grid search scored by ROC-AUC on the training partition only.
gsearch4 = GridSearchCV(
    estimator=estimator4,
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch4.fit(X_train, y_train)

# Display the raw CV table, the winning setting and its mean CV score.
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

({'mean_fit_time': array([0.37868614, 0.33269982, 0.26796937, 0.26225057, 0.25031161,
         0.26334982, 0.19231772, 0.18086915, 0.22341852, 0.21734319,
         0.20610027, 0.19448028, 0.17782011, 0.17365537, 0.18888454,
         0.19394979, 0.18515716, 0.25915055, 0.21213861, 0.18382769,
         0.1893743 , 0.22760911, 0.18376431, 0.17661834]),
  'std_fit_time': array([0.01329062, 0.04210454, 0.0074288 , 0.01249294, 0.01035141,
         0.01680409, 0.02383253, 0.00683609, 0.0308585 , 0.0192721 ,
         0.02953179, 0.02575529, 0.01375938, 0.00847067, 0.01146423,
         0.01556448, 0.01885649, 0.0280457 , 0.02732784, 0.00522051,
         0.01719797, 0.02497666, 0.01547705, 0.02777381]),
  'mean_score_time': array([0.01090598, 0.01380496, 0.01370196, 0.0099339 , 0.01456027,
         0.01253157, 0.0129849 , 0.01356478, 0.01193132, 0.01046772,
         0.01197009, 0.01481137, 0.01203036, 0.0117979 , 0.01419911,
         0.01292872, 0.01216421, 0.01165829, 0.01263146, 0.01343803,
  

In [89]:
# Tuning step 5: L1 regularisation strength (reg_alpha), from 1e-15 up to 0.1.
param_test5 = {
    'reg_alpha': [1e-15, 1e-10, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}

# Fixed settings for this search; subsample is 0.65 here.
# NOTE(review): subsample=0.65 is presumably the winner of the fine sampling
# search — confirm against gsearch4.best_params_.
estimator5 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=259,
    max_depth=6,
    min_child_weight=1,
    gamma=0.0,
    subsample=0.65,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# 5-fold grid search scored by ROC-AUC on the training partition only.
gsearch5 = GridSearchCV(
    estimator=estimator5,
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch5.fit(X_train, y_train)

# Display the raw CV table, the winning setting and its mean CV score.
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

({'mean_fit_time': array([0.39242067, 0.32720675, 0.27969184, 0.20371337, 0.27802668,
         0.24141941, 0.20921421]),
  'std_fit_time': array([0.03656939, 0.01469398, 0.03672386, 0.02085325, 0.03025326,
         0.01117922, 0.02978486]),
  'mean_score_time': array([0.01013412, 0.01691732, 0.01153049, 0.00993195, 0.01251678,
         0.01343789, 0.01113338]),
  'std_score_time': array([0.00080314, 0.00374076, 0.00331163, 0.00107251, 0.00281729,
         0.00423213, 0.0035172 ]),
  'param_reg_alpha': masked_array(data=[1e-15, 1e-10, 1e-05, 0.0001, 0.001, 0.01, 0.1],
               mask=[False, False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'reg_alpha': 1e-15},
   {'reg_alpha': 1e-10},
   {'reg_alpha': 1e-05},
   {'reg_alpha': 0.0001},
   {'reg_alpha': 0.001},
   {'reg_alpha': 0.01},
   {'reg_alpha': 0.1}],
  'split0_test_score': array([0.84888889, 0.84888889, 0.84888889, 0.84888889, 0.84888889,
         0.85037037, 0.8414

In [90]:
# Third model: depth 6 with subsample=0.65 and a small L1 penalty
# (reg_alpha=1e-5). n_estimators=1000 is only an upper bound; the
# early-stopped CV below picks the number of trees actually used.
xgb3 = XGBClassifier(
    learning_rate = 0.1,
    n_estimators = 1000,
    max_depth = 6,
    min_child_weight = 1,
    gamma = 0.0,
    subsample = 0.65,
    colsample_bytree = 0.8,
    reg_alpha=1e-5,
    objective = "binary:logistic",
    nthread = 4,
    scale_pos_weight = 1,
    seed = 27
)

# 5-fold CV through the native API, tracking AUC and stopping once the CV
# score has not improved for 50 rounds. cvresult has one row per boosting
# round that xgb.cv kept, so its length is used as the tree count.
xgb_param = xgb3.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb3.get_params()['n_estimators'], nfold=5,
    metrics='auc', early_stopping_rounds=50, verbose_eval=10)

# eval_metric is configured on the estimator rather than passed to fit():
# the fit() keyword is deprecated since XGBoost 1.6 and rejected by newer
# releases. No eval_set is supplied, so the metric does not affect the trees.
xgb3.set_params(n_estimators=cvresult.shape[0], eval_metric='auc')
xgb3.fit(X_train, y_train)

# Hard class labels and positive-class probabilities for both partitions.
dtrain_predictions = xgb3.predict(X_train)
dtrain_predprob = xgb3.predict_proba(X_train)[:,1]

dtest_predictions = xgb3.predict(X_test)
dtest_predprob = xgb3.predict_proba(X_test)[:,1]

print("\nModel Report")
print("Accuracy (Train) : %.4g" % accuracy_score(y_train, dtrain_predictions))
print("AUC Score (Train): %f" % roc_auc_score(y_train, dtrain_predprob))
print("Accuracy (Test) : %.4g" % accuracy_score(y_test, dtest_predictions))
print("AUC Score (Test): %f" % roc_auc_score(y_test, dtest_predprob))

[0]	train-auc:0.84590+0.02486	test-auc:0.70695+0.02812
[10]	train-auc:0.97392+0.00211	test-auc:0.79232+0.05625


[20]	train-auc:0.98885+0.00106	test-auc:0.81074+0.05222
[30]	train-auc:0.99552+0.00126	test-auc:0.81764+0.04701
[40]	train-auc:0.99817+0.00075	test-auc:0.82927+0.04047
[50]	train-auc:0.99962+0.00020	test-auc:0.83602+0.04141
[60]	train-auc:0.99992+0.00007	test-auc:0.82807+0.04488
[70]	train-auc:1.00000+0.00000	test-auc:0.83280+0.04298
[80]	train-auc:1.00000+0.00000	test-auc:0.83247+0.03937
[90]	train-auc:1.00000+0.00000	test-auc:0.83119+0.03882
[100]	train-auc:1.00000+0.00000	test-auc:0.83429+0.04032
[101]	train-auc:1.00000+0.00000	test-auc:0.83490+0.03967

Model Report
Accuracy (Train) : 0.9884
AUC Score (Train): 0.999819
Accuracy (Test) : 0.8714
AUC Score (Test): 0.919613




In [91]:
# Final model: same settings as xgb3 but with the learning rate halved to 0.05
# and the round budget raised to 5000. The budget is only an upper bound; the
# early-stopped CV below picks the number of trees actually used.
xgb4 = XGBClassifier(
    learning_rate = 0.05,
    n_estimators = 5000,
    max_depth = 6,
    min_child_weight = 1,
    gamma = 0.0,
    subsample = 0.65,
    colsample_bytree = 0.8,
    reg_alpha=1e-5,
    objective = "binary:logistic",
    nthread = 4,
    scale_pos_weight = 1,
    seed = 27
)

# 5-fold CV through the native API, tracking AUC and stopping once the CV
# score has not improved for 50 rounds. cvresult has one row per boosting
# round that xgb.cv kept, so its length is used as the tree count.
xgb_param = xgb4.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb4.get_params()['n_estimators'], nfold=5,
    metrics='auc', early_stopping_rounds=50, verbose_eval=10)

# eval_metric is configured on the estimator rather than passed to fit():
# the fit() keyword is deprecated since XGBoost 1.6 and rejected by newer
# releases. No eval_set is supplied, so the metric does not affect the trees.
xgb4.set_params(n_estimators=cvresult.shape[0], eval_metric='auc')
xgb4.fit(X_train, y_train)

# Hard class labels and positive-class probabilities for both partitions.
dtrain_predictions = xgb4.predict(X_train)
dtrain_predprob = xgb4.predict_proba(X_train)[:,1]

dtest_predictions = xgb4.predict(X_test)
dtest_predprob = xgb4.predict_proba(X_test)[:,1]

print("\nModel Report")
print("Accuracy (Train) : %.4g" % accuracy_score(y_train, dtrain_predictions))
print("AUC Score (Train): %f" % roc_auc_score(y_train, dtrain_predprob))
print("Accuracy (Test) : %.4g" % accuracy_score(y_test, dtest_predictions))
print("AUC Score (Test): %f" % roc_auc_score(y_test, dtest_predprob))

[0]	train-auc:0.84590+0.02486	test-auc:0.70695+0.02812
[10]	train-auc:0.96247+0.00201	test-auc:0.78418+0.04632


[20]	train-auc:0.97706+0.00157	test-auc:0.78185+0.04457
[30]	train-auc:0.98545+0.00142	test-auc:0.80125+0.03801
[40]	train-auc:0.98944+0.00215	test-auc:0.80645+0.03727
[50]	train-auc:0.99332+0.00173	test-auc:0.80915+0.03627
[60]	train-auc:0.99616+0.00091	test-auc:0.81371+0.03312
[70]	train-auc:0.99763+0.00094	test-auc:0.81465+0.03343
[80]	train-auc:0.99889+0.00045	test-auc:0.81772+0.03870
[90]	train-auc:0.99934+0.00038	test-auc:0.82289+0.03524
[100]	train-auc:0.99960+0.00033	test-auc:0.82572+0.03649
[110]	train-auc:0.99979+0.00019	test-auc:0.82726+0.03603
[120]	train-auc:0.99992+0.00015	test-auc:0.83097+0.03637
[130]	train-auc:0.99998+0.00004	test-auc:0.83096+0.03574
[140]	train-auc:0.99998+0.00004	test-auc:0.83370+0.03592
[150]	train-auc:0.99998+0.00004	test-auc:0.83376+0.03914
[160]	train-auc:0.99998+0.00004	test-auc:0.83400+0.03841
[170]	train-auc:1.00000+0.00000	test-auc:0.83428+0.03692
[180]	train-auc:1.00000+0.00000	test-auc:0.83429+0.03797
[190]	train-auc:1.00000+0.00000	test-au



In [92]:
# Load the matchup table that will be scored; the first CSV column is used as
# the row index.
prediction_csv = '../prediction_data_more_features.csv'
pred_df = pd.read_csv(prediction_csv, index_col=[0])
pred_df

Unnamed: 0,Team_A,Team_A_AdjO,Team_A_AdjD,Team_A_SOS (AdjEM),Team_A_R+T,Team_A_3Pt%,Team_A_OReb%,Team_A_2Pt%D,Team_A_FTR,Team_A_Scoring_Margin,...,Team_B_Scoring_Margin,AdjO_Diff,AdjD_Diff,SOS_Diff (AdjEM),R+T_Diff,3Pt%_Diff,OReb%_Diff,2Pt%D_Diff,FTR_Diff,Scoring_Margin_Diff
0,Akron,107.0,102.0,-1.85,5.7,32.8,30.9,50.3,32.6,8.2,...,9.7,-18.5,-1.1,-15.42,0.7,-3.7,-4.1,-0.8,-2.6,-1.5
1,Akron,107.0,102.0,-1.85,5.7,32.8,30.9,50.3,32.6,8.2,...,15.7,-14.1,7.5,-12.71,-11.2,-4.3,-5.4,2.5,-4.1,-7.5
2,Akron,107.0,102.0,-1.85,5.7,32.8,30.9,50.3,32.6,8.2,...,15.3,-13.5,10.3,-11.40,1.1,-2.4,-2.0,7.5,-5.6,-7.1
3,Akron,107.0,102.0,-1.85,5.7,32.8,30.9,50.3,32.6,8.2,...,9.3,-15.4,1.8,-15.76,-3.6,-6.0,-5.1,-1.6,-7.1,-1.1
4,Akron,107.0,102.0,-1.85,5.7,32.8,30.9,50.3,32.6,8.2,...,8.3,-7.2,4.5,-11.48,-7.5,-1.7,-2.8,-1.0,-4.2,-0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4551,Yale,110.8,101.3,2.14,8.1,34.7,29.5,47.4,26.9,8.5,...,4.0,6.2,7.8,-6.27,6.3,-1.6,3.5,0.8,1.8,4.5
4552,Yale,110.8,101.3,2.14,8.1,34.7,29.5,47.4,26.9,8.5,...,1.5,14.7,-4.6,10.31,1.2,2.5,-1.2,-3.0,0.7,7.0
4553,Yale,110.8,101.3,2.14,8.1,34.7,29.5,47.4,26.9,8.5,...,7.4,-2.7,4.1,-4.92,-0.4,0.7,-3.4,0.5,-4.8,1.1
4554,Yale,110.8,101.3,2.14,8.1,34.7,29.5,47.4,26.9,8.5,...,6.5,5.2,-1.2,4.60,5.5,0.7,0.7,-0.4,-6.6,2.0


In [93]:
# Make predictions: score every row of pred_df with xgb4 and save the
# predicted class label next to the two team names.

# Predictor columns handed to the model: the same names and order as the
# training feature list, without the 'Outcome' label.
cols = ['AdjO_Diff','AdjD_Diff','SOS_Diff (AdjEM)','R+T_Diff', '3Pt%_Diff', 
        'OReb%_Diff', '2Pt%D_Diff', 'FTR_Diff', 'Scoring_Margin_Diff']
predictions = xgb4.predict(pred_df[cols])

# The class labels are stored on pred_df itself, so the frame gains a
# 'prediction' column as a side effect of running this cell.
pred_df['prediction'] = predictions
save_cols = ['Team_A', 'Team_B', 'prediction']
save_df = pred_df[save_cols]

# reset_index(drop=True) renumbers rows 0..n-1 exactly as the former
# pd.DataFrame(save_df.values, ...) rebuild did, but without copying every
# column through an object-dtype array. The file name is a plain string
# because it has no placeholders to format.
save_df.reset_index(drop=True).to_csv("predictions.csv")