In [2]:
import pandas as pd
import numpy as np
import pybaseball as pb
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [3]:
start_end = pd.read_csv("start_end_dates.csv")

In [5]:
start_end.head()

Unnamed: 0,year,start_dt,end_dt
0,2023,2023-03-30,2023-10-01
1,2022,2022-04-07,2022-10-02
2,2021,2021-04-01,2021-10-03
3,2020,2020-07-23,2020-09-27
4,2019,2019-03-28,2019-09-29


In [4]:
dict = start_end.to_dict()
print(dict)

{'year': {0: 2023, 1: 2022, 2: 2021, 3: 2020, 4: 2019, 5: 2018, 6: 2017, 7: 2016, 8: 2015}, 'start_dt': {0: '2023-03-30', 1: '2022-04-07', 2: '2021-04-01', 3: '2020-07-23', 4: '2019-03-28', 5: '2018-03-29', 6: '2017-04-02', 7: '2016-04-03', 8: '2015-04-05'}, 'end_dt': {0: '2023-10-01', 1: '2022-10-02', 2: '2021-10-03', 3: '2020-09-27', 4: '2019-09-29', 5: '2018-09-30', 6: '2017-10-01', 7: '2016-10-02', 8: '2015-10-04'}}


In [None]:

from data_retrieval import get_season_data, get_fg_stats
from preprocess_data import preprocess_data, create_target_variable, get_sprint_speed
from train_models import train_model
from results_processing import calculate_expected_xwoba, postprocess_data


In [15]:
bbe_2021 = pd.read_csv("statcast_data/bbe/bbe_2021.csv")
bbe_2022 = pd.read_csv("statcast_data/bbe/bbe_2022.csv")
bbe_2023 = pd.read_csv("statcast_data/bbe/bbe_2023.csv")

bbe_all = pd.concat([bbe_2021, bbe_2022, bbe_2023])

In [23]:
X = bbe_all[['launch_speed', 'launch_angle', 'stand_L', 'sprint_speed', 'pull', 'oppo', 'if_fielding_alignment', 'of_fielding_alignment']]
y = bbe_all['woba_value'].values

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 424390 entries, 0 to 60405
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   launch_speed           424390 non-null  float64
 1   launch_angle           424390 non-null  float64
 2   stand_L                424390 non-null  int64  
 3   sprint_speed           424390 non-null  float64
 4   pull                   424390 non-null  int64  
 5   oppo                   424390 non-null  int64  
 6   if_fielding_alignment  424390 non-null  float64
 7   of_fielding_alignment  424390 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 29.1 MB


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
param_grid = {
    'learning_rate': [0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 5, 8],
    'gamma': [0, 0.1, 0.2]
}

xgb_regressor = XGBRegressor(tree_method='gpu_hist')

grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=3, verbose=2, n_jobs=-1)


In [27]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [28]:
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Best parameters: {'gamma': 0, 'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 300}


In [29]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.1535119175697494


In [34]:
legacy_model = XGBRegressor(learning_rate=0.2, n_estimators=300, max_depth=8)

legacy_X = bbe_all[['launch_speed', 'launch_angle', 'sprint_speed', 'pull', 'oppo']]
legacy_y = bbe_all['woba_value'].values

In [35]:
legacy_X_train, legacy_X_test, legacy_y_train, legacy_y_test = train_test_split(legacy_X, legacy_y, test_size=0.2, random_state=42)

In [36]:
legacy_model.fit(legacy_X_train, legacy_y_train)

In [37]:
y_pred_legacy = legacy_model.predict(legacy_X_test)
mse_legacy = mean_squared_error(legacy_y_test, y_pred_legacy)
print(f"Mean Squared Error: {mse_legacy}")

Mean Squared Error: 0.15653714615238426


In [39]:
joblib.dump(legacy_model, f'models/w.joblib')

['models/w.joblib']