In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    GradientBoostingClassifier
)

from sklearn.linear_model import (SGDClassifier)

from sklearn.model_selection import cross_val_score, TimeSeriesSplit

from xgboost import XGBRFClassifier

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [None]:
# Directory holding the processed data, relative to the notebook.
processed = '../data/processed'

# NOTE: unpickling executes arbitrary code; only load files produced by this project.
main_data_path = Path(processed, 'main_data.pkl')
main_data = pd.read_pickle(main_data_path)

In [None]:
# Predictor columns (order is preserved downstream) and the single target column.
x_vars = [
    'spot',
    'home',
    'b_pred_HPPA',
    'p_pred_HPAB',
    'park_factor',
    'year',
    'BAT_HAND',
    'PIT_HAND',
    'b_avg_win',
    'own_p_pred_HPAB',
    'p_team_HPG',
    'p_team_avg_game_score',
    'rating_rating_pre',
    'rating_rating_prob',
    'rating_pitcher_rgs',
    'rating_own_rating_pre',
    'rating_own_pitcher_rgs',
]
y_var = ['Win']
vars = x_vars + y_var

# Keep rows with b_prev_G above 50, restrict to the modelling columns,
# then discard any row that still has a missing value.
main_data = main_data.loc[main_data['b_prev_G'] > 50, vars].dropna()

# Split on year: 1960 <= year < 2000 is training data, year >= 2000 is test data.
year_col = main_data['year']
train = main_data[(year_col >= 1960) & (year_col < 2000)]
test = main_data[year_col >= 2000]

# Feature frames and integer-encoded targets for each split.
train_x, train_y = train[x_vars], train['Win'].astype('int')
test_x, test_y = test[x_vars], test['Win'].astype('int')

In [None]:
# Column selection: pass the x_vars columns through untouched and drop everything
# else (so extra columns in the input frame never reach the model).
preprocessor = ColumnTransformer(
    transformers=[('spot', 'passthrough', x_vars)],
    remainder='drop',
)

# Final estimator: XGBoost random-forest classifier.
clf = XGBRFClassifier(
    n_estimators=300,
    subsample=0.8,
    colsample_bynode=0.3,
    eval_metric='auc',
    use_label_encoder=False,
    random_state=0,
    verbosity=1,
)

# L1-penalised logistic model; used only inside SelectFromModel to pick features.
sgd = SGDClassifier(
    loss='log',
    penalty='l1',
    alpha=1e-4,
    max_iter=1000,
    random_state=0,
)

# Full pipeline: select columns -> pairwise interaction terms -> standardise
# -> L1-based feature selection -> classifier.
pipeline_steps = [
    ('select', preprocessor),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True)),
    ('scale', StandardScaler()),
    ('feature_selection', SelectFromModel(sgd)),
    ('clf', clf),
]
fitted_model = Pipeline(steps=pipeline_steps)

In [None]:
# Repeated stratified 5-fold CV (3 repeats) of the whole pipeline on the training split.
# NOTE(review): these folds are shuffled, so each fold mixes years; if temporal
# leakage is a concern, TimeSeriesSplit (already imported) may be more appropriate.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
results = cross_val_score(
    fitted_model,
    train,
    train['Win'].astype('int'),
    cv=cv,
    n_jobs=-1,
    scoring='roc_auc',
)
print(results)
# The scorer is ROC AUC, not accuracy, so label the summary accordingly.
print("ROC AUC: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

In [None]:
# Fit the pipeline on the training split; the pipeline's first step keeps only
# the x_vars columns, so passing the full frame is fine.
win_labels = train['Win'].astype('int')
fitted_model.fit(train, win_labels)

In [None]:
# Score the held-out split: second probability column vs. the true labels.
y_true = test['Win'].astype('int')
class_probs = fitted_model.predict_proba(test)
y_pred = class_probs[:, 1]

test_auc = roc_auc_score(y_true, y_pred)
print(test_auc)

In [None]:
# ROC curve coordinates for the test-set predictions, treating label 1 as positive.
roc_points = roc_curve(y_true=y_true, y_score=y_pred, pos_label=1)
fpr, tpr, thresholds = roc_points

In [None]:
# ROC curve for the test-set predictions, with an explicit chance diagonal.
# (The diagonal used to be drawn implicitly by plotting tpr against itself.)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='model')
ax.plot([0, 1], [0, 1], linestyle='--', label='chance')
ax.set(
    xlabel='False positive rate',
    ylabel='True positive rate',
    title='ROC curve (test set)',
)
ax.legend(loc='lower right');

In [None]:
import numpy as np

# The feature-selection step runs after the 'poly' step, so its support mask has
# one entry per column produced by 'poly' (including the interaction terms), not
# one per entry of x_vars. Name the mask with the poly step's own output names.
poly_step = fitted_model.named_steps['poly']
if hasattr(poly_step, 'get_feature_names_out'):
    # Newer scikit-learn API.
    poly_names = poly_step.get_feature_names_out(x_vars)
else:
    # Older scikit-learn API.
    poly_names = poly_step.get_feature_names(x_vars)

mask = fitted_model.named_steps['feature_selection'].get_support()

# A comprehension keeps the loop variables out of the notebook namespace
# (the previous loop rebound the builtin name `bool`).
new_features = [name for name, keep in zip(poly_names, mask) if keep]

# sqrt(n) / n for the n selected features, rounded to two decimals and shown as a percent.
perc = round(np.sqrt(len(new_features)) / len(new_features), 2)
print('%.0f%%' % (perc * 100))

In [None]:
# Number of columns kept by the pipeline's feature-selection step.
selection_mask = fitted_model.named_steps['feature_selection'].get_support()
selection_mask.sum()