In [7]:
import mlb_metrics_helpers

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Fetch batting data for Aaron Judge over his career timeline.
judge_id = mlb_metrics_helpers.player_id("Judge", "Aaron")

# The general "career" metrics are only used to derive the date window:
# element 0 of the parsed timeline is passed as start_dt, element 1 as end_dt.
# NOTE(review): presumably these are the first/last career dates — confirm
# against mlb_metrics_helpers.parse_career_timeline.
judge_general_stats = mlb_metrics_helpers.player_general_metrics(
    judge_id, timeline_type="career"
)
judge_timeline = mlb_metrics_helpers.parse_career_timeline(judge_general_stats)
career_start, career_end = judge_timeline[0], judge_timeline[1]

judge_specific_stats = mlb_metrics_helpers.player_specific_metrics(
    judge_id,
    metric_type="batting",
    start_dt=career_start,
    end_dt=career_end,
)
judge_specific_stats

Gathering Player Data


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,CU,2022-08-12,77.0,-2.27,5.63,"Judge, Aaron",592450,543135,strikeout,swinging_strike,...,1,2,1,2,1,Standard,Standard,41.0,0.023,-0.195
1,CU,2022-08-12,76.8,-2.09,5.59,"Judge, Aaron",592450,543135,,ball,...,1,2,1,2,1,Standard,Standard,47.0,0.000,0.037
2,FC,2022-08-12,89.3,-2.27,5.43,"Judge, Aaron",592450,543135,,ball,...,1,2,1,2,1,Standard,Standard,187.0,0.000,0.017
3,FS,2022-08-12,86.5,-2.38,5.41,"Judge, Aaron",592450,543135,,swinging_strike,...,1,2,1,2,1,Standard,Standard,238.0,0.000,-0.046
4,FF,2022-08-12,92.9,-2.36,5.40,"Judge, Aaron",592450,543135,,called_strike,...,1,2,1,2,1,Standard,Standard,223.0,0.000,-0.034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3128,SI,2022-08-13,92.5,-2.87,3.85,"Judge, Aaron",592450,670167,,foul,...,2,2,2,2,2,Standard,Standard,267.0,0.000,-0.042
3129,SL,2022-08-13,86.8,-2.23,5.86,"Judge, Aaron",592450,676710,,ball,...,0,0,0,0,0,Infield shift,Standard,110.0,0.000,0.016
3130,SL,2022-08-13,83.1,-3.15,3.78,"Judge, Aaron",592450,670167,,ball,...,2,2,2,2,2,Standard,Standard,74.0,0.000,0.044
3131,FF,2022-08-13,96.1,-1.98,5.96,"Judge, Aaron",592450,676710,,called_strike,...,0,0,0,0,0,Infield shift,Standard,210.0,0.000,-0.024


In [117]:
# Pitch-level columns used for modelling, plus the "description" column that
# later serves as the prediction target.
col_to_keep = [
    "pitch_type",
    "release_speed",
    "release_pos_x",
    "release_pos_y",
    "release_spin_rate",
    "spin_axis",
    "p_throws",
    "plate_x",
    "plate_z",
    "vx0",
    "vy0",
    "vz0",
    "ax",
    "ay",
    "az",
    "description",
]

# The two pitch outcomes kept for the binary classification problem.
swing_outcomes = ["hit_into_play", "swinging_strike"]

# Select the relevant columns and drop incomplete rows without mutating in
# place: calling dropna(inplace=True) on a frame that was just derived from
# another frame can raise pandas' SettingWithCopyWarning and makes the cell
# harder to re-run safely. `.dropna()` returns a new frame instead.
model_data = judge_specific_stats[col_to_keep].dropna()

# Keep only rows whose outcome is one of the two classes above.
model_data = model_data[model_data["description"].isin(swing_outcomes)]
model_data

Unnamed: 0,plate_x,plate_z,description
0,0.82,1.44,swinging_strike
3,0.04,1.70,swinging_strike
5,-0.55,3.76,hit_into_play
35,0.09,2.22,hit_into_play
37,-0.28,3.72,swinging_strike
...,...,...,...
3102,0.61,1.46,swinging_strike
3107,0.50,1.99,hit_into_play
3114,-0.21,1.72,hit_into_play
3117,-0.52,2.02,swinging_strike


In [118]:
target_name = "description"

# Guard against missing values without mutating `model_data` in place: the
# frame was built in an earlier cell, and an in-place dropna here would
# silently change the object that cell displayed.
complete_rows = model_data.dropna()

# Separate the label column from the feature columns.
target = complete_rows[target_name]
data = complete_rows.drop(columns=[target_name])

# Hold out 10% of the rows for evaluation. `stratify=target` keeps the class
# proportions the same in both splits; `random_state=0` makes the split
# reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=0, test_size=0.1, stratify=target
)

In [122]:
# Everything this cell needs is imported here so it runs on a fresh kernel.
# RandomForestClassifier and HistGradientBoostingClassifier were previously
# used below without being imported at or before this cell.
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

# One-hot encode categoricals (categories unseen during fit are ignored at
# transform time) and standardise the numerical columns.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

# Candidate estimators; only the first one (the random forest) is fitted in
# this cell. Change the index to try another candidate.
model_types = [
    RandomForestClassifier(random_state=0),
    LogisticRegression(max_iter=500),
    HistGradientBoostingClassifier(),
]
model = make_pipeline(preprocessor, model_types[0])

_ = model.fit(data_train, target_train)

# Score of the fitted pipeline on the held-out split (shown as cell output).
model.score(data_test, target_test)

0.7350230414746544

In [120]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingClassifier

# Integer-encode the categorical columns; categories not seen during fit are
# mapped to -1 at transform time.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1, handle_unknown="use_encoded_value"
)

# Only the categorical columns are transformed; every other column is passed
# through to the estimator unchanged.
ordinal_step = ("categorical", categorical_preprocessor, categorical_columns)
preprocessor = ColumnTransformer(
    transformers=[ordinal_step],
    remainder="passthrough",
)

gradient_boosting = HistGradientBoostingClassifier()
model = make_pipeline(preprocessor, gradient_boosting)

_ = model.fit(data_train, target_train)

# Score of the fitted pipeline on the held-out split (shown as cell output).
model.score(data_test, target_test)

0.7488479262672811