In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import (SGDClassifier)

from sklearn.model_selection import cross_val_score, TimeSeriesSplit

from xgboost import XGBClassifier

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    PolynomialFeatures,
    StandardScaler
)

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    GridSearchCV,
    HalvingGridSearchCV,
    RepeatedStratifiedKFold, 
    StratifiedKFold
)

In [2]:
processed = '../data/processed'
main_data = pd.read_pickle(Path(processed) / 'main_data.pkl')

In [3]:
main_data = main_data[(main_data.b_prev_G > 50)]

x_vars = [
    'spot', 'home', 'b_pred_HPPA', 'p_pred_HPAB', 'park_factor', 'year',
    'BAT_HAND', 'PIT_HAND', 'b_avg_win', 'own_p_pred_HPAB',
    'p_team_HPG', 'p_team_avg_game_score', 'rating_rating_pre',
    'rating_rating_prob', 'rating_pitcher_rgs',
    'rating_own_rating_pre', 'rating_own_pitcher_rgs'
]
y_var = ['Win']
vars = x_vars + y_var

main_data = main_data.loc[:, vars]
main_data = main_data.dropna()

train = main_data[(main_data.year < 2000) & (main_data.year >= 1990)]
test = main_data[(main_data.year >= 2000)]

train_x = train[x_vars]
train_y = train['Win'].astype('int')
test_x = test[x_vars]
test_y = test['Win'].astype('int')

In [4]:
preprocessor =  ColumnTransformer(
    [('spot', 'passthrough', x_vars)],
    remainder='drop'
)

sgd = SGDClassifier(
    loss='log',
    penalty='l1',
    random_state=0,
    max_iter = 1000,
    alpha = .0005, 
)

clf = XGBClassifier(
    tree_method='hist',
    verbosity = 0,
    random_state = 0,
    eval_metric = 'auc',
    max_depth = 1,
    learning_rate = .03,
    early_stopping_rounds=5,
    n_estimators = 1000,
    subsample=0.9,
    colsample_bynode=0.33,
    use_label_encoder=False,
    verbose=True,
)

fitted_model = Pipeline([
    ('select', preprocessor),
    ('poly', PolynomialFeatures(2, interaction_only=True)),
    ('scale', StandardScaler()),
#     ('feature_selection', SelectFromModel(sgd)),
    ('clf', clf),
])

params = {
    "clf__learning_rate": [.1, .03, .02, .01, .001],
#     "feature_selection__estimator__alpha": [.0001, .0003, .00001, .000001],
}

inner_cv = StratifiedKFold(shuffle = True, random_state=1)
gs = HalvingGridSearchCV(
    fitted_model, params, refit='AUC', cv=inner_cv,
    scoring='roc_auc', verbose = 10
)

In [None]:
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
outer_cv = StratifiedKFold(shuffle=True, random_state=1)
results = cross_val_score(gs, train, train['Win'].astype('int'), cv=outer_cv, n_jobs=-1, scoring='roc_auc')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

In [None]:
gs = HalvingGridSearchCV(
    fitted_model, params, refit='AUC', cv=inner_cv,
    scoring='roc_auc', n_jobs=-1
)
gs.fit(train, train['Win'].astype('int'))

In [None]:
y_pred = gs.predict_proba(test)[:, 1]
y_true = test['Win'].astype('int')
print(roc_auc_score(y_true, y_pred))

In [None]:
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)

In [None]:
plt.plot(fpr, tpr, tpr, tpr)

In [None]:
# mask = gs.named_steps['feature_selection'].get_support()

# new_features = []
# for bool, feature in zip(mask, x_vars):
#     if bool:
#         new_features.append(feature)
  

# print(new_features)
# import numpy as np
# perc = round(np.sqrt(len(new_features)) / len(new_features), 2)
# print(str(perc*100) + '%')

In [None]:
grid_results = gs.cv_results_

In [None]:
AUC = grid_results['mean_test_score']
AUC

In [None]:
param_dict = {}
for key, value in grid_results.items():
    if 'param_clf__' in key:
        name = key.replace('param_clf__', '')
        new = np.array(np.ma.getdata(grid_results[key]), dtype=np.float)
        param_dict[name] = new
        print(name)
        plt.figure()
        plt.scatter(new, AUC)
        plt.show()

    if 'param_feature_selection__' in key:
        name = key.replace('param_clf__', '')
        new = np.array(np.ma.getdata(grid_results[key]), dtype=np.float)
        param_dict[name] = new
        print(name)
        plt.figure()
        plt.scatter(new, AUC)
        plt.show()

param_dict

In [None]:
print("Best: %f using %s" % (gs.best_score_, 
    gs.best_params_))

In [22]:
eval_set = [(train[x_vars], train['Win'].astype('int')), (test[x_vars], test['Win'].astype('int'))]
fitted_model.fit(train, train['Win'].astype('int'), clf__eval_set=eval_set)

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153'] ['spot', 'home', 'b_pred_HPPA', 'p_pred_HPAB', 'park_factor', 'year', 'BAT_HAND', 'PIT_HAND', 'b_avg_win', 'own_p_pred_HPAB', 'p_team_HPG', 'p_team_avg_game_score', 'rating_rating_pre', 'rating_rating_prob', 'rating_pitcher_rgs', 'rating_own_rating_pre', 'rating_own_pitcher_rgs']
expected f1, f137, f33, f38, f41, f100, f107, f40, f51, f18, f115, f102, f16, f86, f99, f88, f52, f128, f27, f93, f77, f74, f75, f129, f145, f110, f54, f45, f9, f55, f140, f20, f59, f106, f135, f104, f63, f98, f61, f28, f82, f146, f67, f121, f10, f78, f26, f84, f130, f70, f68, f109, f139, f17, f48, f58, f117, f3, f96, f90, f12, f152, f127, f151, f43, f73, f108, f141, f138, f46, f13, f81, f71, f116, f79, f7, f25, f111, f57, f92, f5, f19, f32, f37, f42, f62, f113, f21, f149, f101, f23, f126, f29, f34, f103, f134, f122, f91, f87, f64, f76, f144, f142, f15, f143, f0, f105, f14, f132, f118, f65, f133, f6, f2, f44, f36, f120, f89, f119, f39, f22, f153, f148, f83, f114, f125, f47, f30, f31, f124, f49, f72, f60, f11, f35, f8, f112, f95, f85, f97, f56, f136, f150, f131, f4, f50, f80, f53, f94, f123, f66, f69, f147, f24 in input data
training data did not have the following fields: b_pred_HPPA, p_pred_HPAB, b_avg_win, home, p_team_HPG, spot, own_p_pred_HPAB, p_team_avg_game_score, rating_pitcher_rgs, rating_own_pitcher_rgs, park_factor, rating_rating_pre, year, PIT_HAND, BAT_HAND, rating_own_rating_pre, rating_rating_prob

In [7]:
y_pred = fitted_model.predict_proba(test)[:, 1]
y_true = test['Win'].astype('int')
print(roc_auc_score(y_true, y_pred))

0.56555846518067


In [9]:
results = fitted_model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

AttributeError: 'Pipeline' object has no attribute 'evals_result'

In [18]:
fitted_model.named_steps['clf'].evals_result

AttributeError: 'XGBClassifier' object has no attribute 'evals_result_'