In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss, confusion_matrix, classification_report, balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 50)

## Modeling

- Simple Data
    - Logistic Regression
        - ~~simple~~
        - ~~tuned + cross validation~~
    - Gaussian NB
        - simple
        - cross validation
    - Random Forest
        - untuned
        - tuned
    - XGBoost
        - untuned
        - tuned
- Scaled Data
    - Logistic Regression
        - simple
        - cross validation
    - Gaussian NB
        - simple
        - cross validation
    - Random Forest
        - untuned
        - tuned
    - XGBoost
        - untuned
        - tuned
- PCA 
    - Logistic Regression
        - simple
        - cross validation
    - Gaussian NB
        - simple
        - cross validation
    - Random Forest
        - untuned
        - tuned
    - XGBoost
        - untuned
        - tuned

In [2]:
final_data = pd.read_csv('computed_data/reg_avg_data.csv')
st1_data = final_data[final_data['Season']<2017]
st1_data_x = st1_data.iloc[:,4:-1]
st1_data_y = st1_data.iloc[:,-1]
st2_data = final_data[final_data['Season']<2023]
st2_data_x = st2_data.iloc[:,4:-1]
st2_data_y = st2_data.iloc[:,-1]

In [3]:
scoring = 'neg_brier_score'

#Creating model class so that testing and tuning is easy to run all the different models.
class Model_Data:
    def __init__(self, model_ud, X, y, scoring='f1'):
        self.user_defined_model=model_ud
        self.X= X
        self.y = y
        self.scoring = scoring
        
    def split(self, test_size):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = test_size)
            
    def tune(self, params):
        cv = KFold(n_splits=2)
        self.rand_search = GridSearchCV(estimator=self.user_defined_model, param_grid=params, n_jobs=8, 
                                         cv=cv, scoring=self.scoring, verbose=2)

            
        self.rand_search = self.rand_search.fit(self.X_train, self.y_train)
        # summarize results
        print("Best: %f using %s" % (self.rand_search.best_score_, self.rand_search.best_params_))

        #means = self.rand_search.cv_results_['mean_test_score']
        #stds = self.rand_search.cv_results_['std_test_score']
        #parameters = self.rand_search.cv_results_['params']
        #for mean, stdev, parameters in zip(means, stds, parameters):
        #    print("%f (%f) with: %r" % (mean, stdev, parameters))
            
    def tune_comp(self, params):
        cv = KFold(n_splits=2)
        self.rand_search = GridSearchCV(estimator=self.user_defined_model, param_grid=params, n_jobs=8,
                                     cv=cv, scoring=self.scoring, verbose=2)
            
        self.rand_search = self.rand_search.fit(self.X, self.y)
        # summarize results
        print("Best: %f using %s" % (self.rand_search.best_score_, self.rand_search.best_params_))

        #means = self.rand_search.cv_results_['mean_test_score']
        #stds = self.rand_search.cv_results_['std_test_score']
        #parameters = self.rand_search.cv_results_['params']
        #for mean, stdev, parameters in zip(means, stds, parameters):
        #    print("%f (%f) with: %r" % (mean, stdev, parameters))
    
    def fit(self):
        self.user_defined_model.set_params(**self.rand_search.best_params_)
        self.user_defined_model = self.user_defined_model.fit(self.X_train, self.y_train)
        
    def fit_comp(self):
        self.user_defined_model.set_params(**self.rand_search.best_params_)
        self.user_defined_model = self.user_defined_model.fit(self.X, self.y)
    
    def predict(self, input_value=None):
        if input_value == None:
            result = self.user_defined_model.predict(self.X_test)
        else: 
            result = self.user_defined_model.predict(np.array([input_value]))
        return result

    def AccuracyReport(self, predictions, input_value=None):
        if input_value == None:
            print(confusion_matrix(self.y_test,predictions))
            print(classification_report(self.y_test,predictions))
            acc2 = balanced_accuracy_score(self.y_test,predictions)
            acc3 = brier_score_loss(self.y_test,predictions)
            print(acc2)
            print(acc3)
        else:
            print(confusion_matrix(np.array([input_value]),predictions))
            print(classification_report(np.array([input_value]),predictions))
            acc2 = balanced_accuracy_score(np.array([input_value]),predictions)
            acc3 = brier_score_loss(self.y_test,predictions)
            print(acc2)

params_xgb = {
    "learning_rate"    : [0.1, 0.25, 0.3] ,
    "n_estimators": [50, 100, 150],
    "max_depth"        : [3, 4, 5],
    "min_child_weight" : [1, 3, 5],
    "colsample_bytree" : [0.5, 0.7, 0.8 ],
}

params_rf = {
     'max_depth': [20, 40, 60, None],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10],
     'n_estimators': [400, 800, 1200, 1600]
}

params_logreg =    {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear'],
    'max_iter' : [100, 500, 1000]
}

In [6]:
st1_data_x_train, st1_data_x_test, st1_data_y_train, st1_data_y_test = train_test_split(st1_data_x, st1_data_y)

#### Logistic Regression Tuned + KFold

In [14]:
%%time
lr = Model_Data(LogisticRegression(), st2_data_x, st2_data_y, 'neg_brier_score')
lr.split(test_size=0.3)
lr.tune_comp(params_logreg)
lr.fit_comp()
preds = lr.predict()
lr.AccuracyReport(preds)

Fitting 2 folds for each of 540 candidates, totalling 1080 fits


600 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\siddh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\siddh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\siddh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solve

Best: -0.195001 using {'C': 0.00026366508987303583, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
[[17103  7243]
 [ 7333 17064]]
              precision    recall  f1-score   support

           0       0.70      0.70      0.70     24346
           1       0.70      0.70      0.70     24397

    accuracy                           0.70     48743
   macro avg       0.70      0.70      0.70     48743
weighted avg       0.70      0.70      0.70     48743

0.7009637939877444
0.29903781055741335
CPU times: total: 11.2 s
Wall time: 1min 58s


In [15]:
logreg_model = LogisticRegression()
final_params_logreg = {'C': 0.08858667904100823, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
logreg_model.set_params(**final_params_logreg)
logreg_model.fit(st1_data_x_train, st1_data_y_train)
preds = logreg_model.predict(st1_data_x_test)
brier_score_loss(st1_data_y_test, preds)

0.299071862921293

In [16]:
pickle.dump(logreg_model, open('logreg_model.pkl', "wb"))

#### Gaussian NB Tuned + KFold

In [24]:
%%time
params_gnb = {'var_smoothing': np.logspace(0,-9, num=1000)}
gnb = Model_Data(GaussianNB(), st1_data_x, st1_data_y, scoring = 'neg_brier_score')
gnb.split(test_size=0.3)
gnb.tune(params_gnb)
gnb.fit()
preds = gnb.predict()
gnb.AccuracyReport(preds)

Fitting 2 folds for each of 1000 candidates, totalling 2000 fits
Best: -0.202248 using {'var_smoothing': 0.0012052609368708425}
[[10827  4777]
 [ 4775 10880]]
              precision    recall  f1-score   support

           0       0.69      0.69      0.69     15604
           1       0.69      0.69      0.69     15655

    accuracy                           0.69     31259
   macro avg       0.69      0.69      0.69     31259
weighted avg       0.69      0.69      0.69     31259

0.6944230880861527
0.30557599411369524
CPU times: total: 31.2 s
Wall time: 34.3 s


In [7]:
gnb_model = GaussianNB(var_smoothing=0.0012052609368708425)
gnb_model.fit(st1_data_x_train, st1_data_y_train)
preds = gnb_model.predict(st1_data_x_test)
brier_score_loss(st1_data_y_test, preds)

0.30089446811777804

In [8]:
pickle.dump(gnb_model, open('gnb_model.pkl', "wb"))

#### Random Forest Simple

In [6]:
%%time
rf = Model_Data(RandomForestClassifier(), st1_data_x, st1_data_y, 'neg_brier_score')
rf.split(test_size=0.3)
rf.tune(params_rf)
rf.fit()
preds = rf.predict()
rf.AccuracyReport(preds)

Fitting 2 folds for each of 144 candidates, totalling 288 fits
Best: -0.195267 using {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 1600}
[[10878  4745]
 [ 4724 10912]]
              precision    recall  f1-score   support

           0       0.70      0.70      0.70     15623
           1       0.70      0.70      0.70     15636

    accuracy                           0.70     31259
   macro avg       0.70      0.70      0.70     31259
weighted avg       0.70      0.70      0.70     31259

0.697078909395363
0.3029207588214594
CPU times: total: 17min 4s
Wall time: 2h 9min 6s


In [20]:
final_rf_model = RandomForestClassifier()
final_params = {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 1600}
final_rf_model.set_params(**final_params)
final_rf_model.fit(st1_data_x_train, st1_data_y_train)
preds = final_rf_model.predict(st1_data_x_test)
brier_score_loss(st1_data_y_test, preds)

0.3045414411301777

#### XGBoost Tuned

In [18]:
dtrain = xgb.DMatrix(data=st1_data_x_train, label = st1_data_y_train)
params_xgb = {
    "learning_rate" : [0.1, 0.25, 0.3] ,
    "n_estimators": [50, 100, 150],
    "max_depth": [3, 4, 5],
    "min_child_weight" : [1, 3, 5],
    "colsample_bytree" : [0.5, 0.7, 0.8]
}
xgb_model = xgb.XGBClassifier()
cv = KFold(n_splits=3)
xgb_tune = GridSearchCV(estimator=xgb_model, param_grid=params_xgb, cv=cv, scoring='neg_brier_score', verbose=2)
xgb_tune.fit(st1_data_x_train, st1_data_y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=150; total time=   1.7s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=150;

[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=150; total time=   2.9s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=150; total time=   2.9s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=150; total time=   2.9s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50; total time=   1.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50; total time=   1.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50; total time=   1.0s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.5, learning_rat

[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=50; total time=   1.0s
[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=50; total time=   1.0s
[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=150; total time=   3.0s
[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=150; total time=   3.0s
[CV] END colsample_bytree=0.5, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=150; total time=   3.0s
[CV] END colsample_bytree=0.5, lea

[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=150; total time=   2.3s
[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=150; total time=   2.4s
[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50; total time=   0.8s
[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50; total time=   0.8s
[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50; total time=   0.8s
[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=100; total time=   1.5s
[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=100; total time=   1.6s
[CV] END colsample_bytree=0.5, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=100; total time=   1.6s
[CV] END colsample_bytree=0.5, learning_rat

[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100; total time=   1.7s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100; total time=   1.6s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=150; total time=   2.4s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=150; total time=   2.4s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=150; total time=   2.4s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=50; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=50; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=50; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rat

[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=150; total time=   1.9s
[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=150; total time=   1.8s
[CV] END colsample_bytree=0.7, learning_rate=0.25, max_depth=3, min_child_weight=5, n_estimators=150; total time=   1.8s
[CV] END colsample_bytree=0.7, lea

[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150; total time=   1.8s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150; total time=   1.8s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150; total time=   1.8s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=3, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=3, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=3, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=3, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, min_child_weight=3, n_estimators=100; total time=   1.2s
[CV] END colsample_bytree=0.7, learning_rat

[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=100; total time=   2.1s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=150; total time=   3.0s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=150; total time=   3.0s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=150; total time=   3.0s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.7s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50; total time=   0.6s
[CV] END colsample_bytree=0.8, learning_ra

[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=50; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=50; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=50; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   2.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   2.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   2.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=150; total time=   3.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=150; total time=   3.2s
[CV] END colsample_bytree=0.8, learning_rat

[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=4, min_child_weight=5, n_estimators=150; total time=   2.6s
[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=4, min_child_weight=5, n_estimators=150; total time=   2.5s
[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=4, min_child_weight=5, n_estimators=150; total time=   2.5s
[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=50; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=50; total time=   1.0s
[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=50; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=100; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.25, max_depth=5, min_child_weight=1, n_estimators=100; total time=   2.1s
[CV] END colsample_bytree=0.8, lear

[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=100; total time=   1.7s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=100; total time=   1.7s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=100; total time=   1.7s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=150; total time=   2.6s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=150; total time=   2.5s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=3, n_estimators=150; total time=   2.5s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50; total time=   0.9s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50; total time=   0.8s
[CV] END colsample_bytree=0.8, learning_ra

In [19]:
xgb_tune.best_score_, xgb_tune.best_params_

(-0.19387154914174107,
 {'colsample_bytree': 0.8,
  'learning_rate': 0.1,
  'max_depth': 4,
  'min_child_weight': 5,
  'n_estimators': 100})

In [20]:
preds = xgb_tune.predict(st1_data_x_test)
print(brier_score_loss(st1_data_y_test, preds))
#print(classification_report(st1_data_y_test, preds))

0.299687338437677


In [21]:
dtrain = xgb.DMatrix(data=st1_data_x_train, label = st1_data_y_train)
dtest = xgb.DMatrix(data=st1_data_x_test, label = st1_data_y_test)
final_params_xgb = {'colsample_bytree': 0.8, 'learning_rate': 0.1, 
                    'max_depth': 4, 'min_child_weight': 5, 'n_estimators': 100}
xgb_model = xgb.train(final_params_xgb, dtrain)
preds = xgb_model.predict(dtest)
brier_score_loss(st1_data_y_test, preds)

Parameters: { "n_estimators" } are not used.



0.20903718571323246

In [22]:
pickle.dump(xgb_model, open('xgb_model.pkl', "wb"))

In [23]:
xgb.cv(final_params_xgb, dtrain, num_boost_round=1000, nfold=5, metrics=['logloss'])

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.



Unnamed: 0,train-logloss-mean,train-logloss-std,test-logloss-mean,test-logloss-std
0,0.674133,0.000127,0.674421,0.000056
1,0.662019,0.002881,0.662473,0.002918
2,0.651156,0.003676,0.651916,0.003549
3,0.641003,0.003489,0.642075,0.003203
4,0.630401,0.002883,0.631725,0.002618
...,...,...,...,...
995,0.480128,0.000631,0.622480,0.013001
996,0.480063,0.000639,0.622493,0.012986
997,0.479975,0.000633,0.622890,0.013136
998,0.479869,0.000625,0.622885,0.013129
