In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import pickle
import os

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

import xgboost as xgb
from xgboost import plot_importance

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 125)

optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
model_path_name = "base_model_per_player"
DIRECTORY_PATH = f"models/{model_path_name}"
MODEL_INFO_FILEPATH = f"{DIRECTORY_PATH}/info.txt"

In [24]:
if not os.path.exists(DIRECTORY_PATH):
    os.makedirs(DIRECTORY_PATH)
with open(MODEL_INFO_FILEPATH, "w") as text_file:
    print(f"##### MODEL INFO #####", file=text_file)
    print(f"- Here we are creating a model for every pitcher with at least 500 pitches \n ", file=text_file)

In [5]:
dataset = pd.read_pickle("data/dataset.pkl")
MIN_SIZE = 500

In [6]:
print(dataset.shape[0])
print(dataset.shape[0]*.80)
print(dataset.shape[0]*.10)

711841
569472.8
71184.1


In [7]:
dataset = dataset.groupby('pitcher_id').filter(lambda x: x.shape[0] >= MIN_SIZE)

In [25]:
with open(MODEL_INFO_FILEPATH, "a") as text_file:
    print(f"Features used = {dataset.columns.to_list()}", file=text_file)

In [8]:
print(dataset.shape[0])
print(f"Num unique pitchers = {len(dataset.pitcher_id.unique())}")

657015
Num unique pitchers = 406


In [9]:
def objective(trial, train_X, train_y, val_X, val_y):
    param = {"max_depth":    trial.suggest_categorical('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]),
              "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
              "n_estimators": trial.suggest_int('n_estimators', 100, 1000,step=100),
              "subsample" : trial.suggest_float('subsample', 0.1, 1, step=0.1),
              "min_child_weight" : trial.suggest_int('min_child_weight', 1, 10, step=1), 
              "colsample_bytree" : trial.suggest_float('subsample', 0.1, 1, step=0.1),
            }
    
    clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, early_stopping_rounds=10,
                            objective='multi:softmax', eval_metric=['merror','mlogloss'], **param)
    clf.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=0)
    
    best_merror = clf.evals_result()['validation_0']['merror'][clf.best_iteration]
    return best_merror


In [10]:
''' 
'model_1' : {   'model' : clf,
                'merror' : val,
                'mlogloss' : val,
                }
'''
info_dict = dict()
stats_df = pd.DataFrame(columns=['merror', 'mlogloss'])

In [12]:

''' 
pitcher_id : id of pitcher to use for model
info_dict : where to store model and stats
'''
def train_individual_model(pitcher_id, full_dataset, info_dict, stats_df):
    pitcher_dataset = full_dataset.query("pitcher_id == @pitcher_id")

    train_index_stop = int(pitcher_dataset.shape[0]*0.8)
    val_index_stop = train_index_stop + (pitcher_dataset.shape[0]-train_index_stop)//2

    training_set = pitcher_dataset.iloc[0:train_index_stop, :]
    validation_set = pitcher_dataset.iloc[train_index_stop:val_index_stop, :]
    test_set = pitcher_dataset.iloc[val_index_stop:,]

    train_X = training_set.drop(["uid", "pitch_type", "type_confidence", "pitcher_id"],axis=1)
    train_y = training_set['pitch_type']

    val_X = validation_set.drop(["uid", "pitch_type", "type_confidence", "pitcher_id"],axis=1)
    val_y = validation_set['pitch_type']

    #test_X = test_set.drop(["uid", "pitch_type", "type_confidence", "pitcher_id"],axis=1)
    #test_y = test_set['pitch_type']

    encoder=LabelEncoder()
    train_y = encoder.fit_transform(train_y)    # xgboost labels need to be 0,...,n-1, so can't have same labels for all models bc some pitchers don't have all labels
    val_y = encoder.fit_transform(val_y)
    #test_y = encoder.fit_transform(test_y)

    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, train_X, train_y, val_X, val_y), n_trials=10)
    #print(study.best_trial)
    clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, early_stopping_rounds=10,
                                            objective='objective=multi:softmax', eval_metric=['merror','mlogloss'], **study.best_params)
    clf.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=0)
    best_val_merror = clf.evals_result()['validation_0']['merror'][clf.best_iteration]
    best_val_mlogloss = clf.evals_result()['validation_0']['mlogloss'][clf.best_iteration]

    model_dict = {
        'model_params' : study.best_params,
        'merror' : best_val_merror, 
        'mlogloss' : best_val_mlogloss,
        'n' : pitcher_dataset.shape[0]
    }
    info_dict[f"model_{pitcher_id}"] = model_dict
    stats_df.loc[pitcher_id] = [best_val_merror,best_val_mlogloss]

    num_wrong = best_val_merror*val_X.shape[0]
    return num_wrong, val_X.shape[0]

#print(f"Splits = train n ={train_index_stop}, val n = {val_index_stop}")

In [13]:
num_wrong = 0
num_seen = 0
for id in dataset.pitcher_id.unique():
    try:
        val_num_wrong, val_num = train_individual_model(id, dataset, info_dict, stats_df)
        num_wrong += val_num_wrong
        num_seen += val_num
    except Exception as e:
        print(f"Failed at id {id}")
final_val_error = num_wrong/num_seen

[W 2024-04-20 19:00:17,419] Trial 0 failed with parameters: {'max_depth': 2, 'learning_rate': 0.0213596670188526, 'n_estimators': 700, 'subsample': 0.5, 'min_child_weight': 8} because of the following error: XGBoostError('value 0 for Parameter num_class should be greater equal to 1\nnum_class: Number of output class in the multi-class classification.').
Traceback (most recent call last):
  File "/Users/scottmaran/.pyenv/versions/3.8.18/envs/swish_bb/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/_w/4wq7jf9n3c74v00gxl_2v81r0000gn/T/ipykernel_47812/1150918720.py", line 30, in <lambda>
    study.optimize(lambda trial: objective(trial, train_X, train_y, val_X, val_y), n_trials=10)
  File "/var/folders/_w/4wq7jf9n3c74v00gxl_2v81r0000gn/T/ipykernel_47812/689406351.py", line 12, in objective
    clf.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=0)
  File "/Users/scottmaran/.pyenv/versions/3.8.18

Failed at id 121250


[W 2024-04-20 19:02:27,855] Trial 0 failed with parameters: {'max_depth': 5, 'learning_rate': 0.05621582367079576, 'n_estimators': 1000, 'subsample': 0.2, 'min_child_weight': 8} because of the following error: XGBoostError("[19:02:27] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=4 but found 4 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDeviceVector<float

Failed at id 433586


[W 2024-04-20 19:06:07,844] Trial 0 failed with parameters: {'max_depth': 6, 'learning_rate': 0.32659315646003684, 'n_estimators': 900, 'subsample': 0.4, 'min_child_weight': 3} because of the following error: XGBoostError("[19:06:07] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=3 but found 3 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDeviceVector<float>

Failed at id 121125


[W 2024-04-20 19:12:27,646] Trial 0 failed with parameters: {'max_depth': 9, 'learning_rate': 0.35343152850445647, 'n_estimators': 900, 'subsample': 0.5, 'min_child_weight': 4} because of the following error: XGBoostError("[19:12:27] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=5 but found 5 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDeviceVector<float>

Failed at id 502188


[W 2024-04-20 19:17:31,756] Trial 0 failed with parameters: {'max_depth': 2, 'learning_rate': 0.25612958593104845, 'n_estimators': 200, 'subsample': 0.6, 'min_child_weight': 2} because of the following error: XGBoostError("[19:17:31] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=4 but found 4 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDeviceVector<float>

Failed at id 493157


[W 2024-04-20 19:29:37,970] Trial 0 failed with parameters: {'max_depth': 6, 'learning_rate': 0.2233756411979994, 'n_estimators': 300, 'subsample': 1.0, 'min_child_weight': 1} because of the following error: XGBoostError("[19:29:37] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=5 but found 5 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDeviceVector<float> 

Failed at id 430083


[W 2024-04-20 19:30:52,925] Trial 0 failed with parameters: {'max_depth': 6, 'learning_rate': 0.03929460554960038, 'n_estimators': 600, 'subsample': 0.30000000000000004, 'min_child_weight': 4} because of the following error: XGBoostError("[19:30:52] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=5 but found 5 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDev

Failed at id 450729


[W 2024-04-20 19:33:34,603] Trial 0 failed with parameters: {'max_depth': 7, 'learning_rate': 0.16466806857349334, 'n_estimators': 400, 'subsample': 0.1, 'min_child_weight': 1} because of the following error: XGBoostError("[19:33:34] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=5 but found 5 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDeviceVector<float>

Failed at id 450203


[W 2024-04-20 19:40:07,047] Trial 0 failed with parameters: {'max_depth': 4, 'learning_rate': 0.06220995635935659, 'n_estimators': 800, 'subsample': 0.30000000000000004, 'min_child_weight': 8} because of the following error: XGBoostError("[19:40:07] /Users/runner/work/xgboost/xgboost/src/metric/multiclass_metric.cu:35: Check failed: label_error >= 0 && label_error < static_cast<int32_t>(n_class): MultiClassEvaluation: label must be in [0, num_class), num_class=4 but found 4 in label\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001664f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000166671920 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CheckLabelError(int, unsigned long) const + 204\n  [bt] (2) 3   libxgboost.dylib                    0x00000001666717c0 xgboost::metric::MultiClassMetricsReduction<xgboost::metric::EvalMatchError>::CpuReduceMetrics(xgboost::HostDev

Failed at id 452733


In [15]:
pickle.dump(info_dict, open(f"{DIRECTORY_PATH}/model_info_dict.pkl", "wb"))
pickle.dump(stats_df, open(f"{DIRECTORY_PATH}/stats_df.pkl", "wb"))

In [27]:
with open(MODEL_INFO_FILEPATH, "a") as text_file:
    print(f"final_val_error = {final_val_error}", file=text_file)

In [17]:
stats_df.sort_values('merror')

Unnamed: 0,merror,mlogloss
543766,0.075472,0.445120
407878,0.122137,0.428831
430629,0.127451,0.535741
501745,0.150000,0.545979
488674,0.210000,0.623660
...,...,...
502706,0.977716,1.885805
452741,0.981651,1.421439
543070,1.000000,2.095917
434180,1.000000,1.703830
