In [12]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    GradientBoostingClassifier
)

from sklearn.model_selection import cross_val_score, TimeSeriesSplit

from xgboost import XGBClassifier

In [13]:
# Directory holding the processed datasets, relative to the notebook's
# working directory.
processed = '../data/processed'

# Load the pickled frame used by every later cell.
# NOTE: pd.read_pickle can execute arbitrary code embedded in the file, so
# only point this at pickles produced by a trusted pipeline.
main_data_path = Path(processed).joinpath('main_data.pkl')
main_data = pd.read_pickle(main_data_path)

In [14]:
# Keep only rows whose `b_prev_G` value exceeds 50.
prev_g_mask = main_data['b_prev_G'] > 50
main_data = main_data.loc[prev_g_mask]

# Training rows: years 1960 (inclusive) up to 2000 (exclusive).
train_year_mask = (main_data['year'] >= 1960) & (main_data['year'] < 2000)
train = main_data.loc[train_year_mask]

In [15]:
# Feature columns passed to the classifier. The order here is the column
# order of `train[x_vars]`, so keep it stable.
x_vars = [
    'spot',
    'home',
    'b_pred_HPPA',
    'p_pred_HPAB',
    'park_factor',
    'year',
    'BAT_HAND',
    'PIT_HAND',
    'b_avg_win',
    'own_p_pred_HPAB',
    'p_team_HPG',
    'p_team_avg_game_score',
    'rating_rating_pre',
    'rating_rating_prob',
    'rating_pitcher_rgs',
    'rating_own_rating_pre',
    'rating_own_pitcher_rgs',
]

In [16]:
# XGBoost classifier made of depth-1 trees, grown with the histogram tree
# method; random_state is pinned to 0 and library logging is silenced.
# NOTE(review): the stored cross-validation outputs further down show the
# same accuracy on every fold and a ROC AUC of exactly 0.5 on several folds,
# which may mean this configuration yields near-constant predictions. The
# very small learning_rate is a likely suspect -- confirm before relying on it.
clf = XGBClassifier(
    n_estimators=900,
    learning_rate=.0005,
    max_depth=1,
    tree_method='hist',
    eval_metric='auc',
    random_state=0,
    verbosity=0,
)

In [17]:
# clf = XGBClassifier(
#     tree_method='hist',
#     verbosity = 0,
#     random_state = 0,
#     eval_metric = 'auc',
#     max_depth = 1,
#     learning_rate = .01,
#     subsample=.5,
#     colsample_bytree=.4,
#     # min_child_weight = 7.5,  # was spelled min_leaf_weight, which XGBoost does not define
#     use_label_encoder=False,
#     n_estimators = 100,
# )

In [18]:
# clf = HistGradientBoostingClassifier(
#     loss = 'binary_crossentropy',
#     random_state=0, 
#     learning_rate = .1,
#     max_depth=1,
#     max_iter = 400,
# )

In [19]:
# 5-fold cross-validation of `clf` on `train` using the estimator's default
# scorer; the 'Win' column is cast to int to serve as the target.
features = train[x_vars]
target = train['Win'].astype('int')
results = cross_val_score(clf, features, target, cv=5, n_jobs=-1)

# Per-fold scores, then mean and standard deviation expressed as percentages.
print(results)
mean_pct = results.mean() * 100.0
std_pct = results.std() * 100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

[0.65896304 0.6589583  0.6589583  0.6589583  0.65896143]
Accuracy: 65.896% (0.000%)


In [20]:
# 5-fold cross-validation of `clf` on `train`, scored with ROC AUC.
# Because scoring='roc_auc', every entry of `results` is an area under the
# ROC curve rather than an accuracy, so the summary line is labelled as such
# (it was previously printed under the heading "Accuracy").
results = cross_val_score(
    clf,
    train[x_vars],
    train['Win'].astype('int'),
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
print(results)
# Mean and standard deviation across folds, scaled by 100 as in the other cells.
print("ROC AUC: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.52016026 0.5        0.4950359  0.5155655  0.5       ]
Accuracy: 50.615% (0.984%)


In [21]:
# 10-fold cross-validation of `clf` on `train`, scored with ROC AUC.
# Because scoring='roc_auc', every entry of `results` is an area under the
# ROC curve rather than an accuracy, so the summary line is labelled as such
# (it was previously printed under the heading "Accuracy").
results = cross_val_score(
    clf,
    train[x_vars],
    train['Win'].astype('int'),
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
)
print(results)
# Mean and standard deviation across folds, scaled by 100 as in the other cells.
print("ROC AUC: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.52400019 0.51822678 0.55428286 0.5        0.49278686 0.52325383
 0.52150137 0.56318977 0.5        0.5       ]
Accuracy: 51.972% (2.239%)
