In [1]:
import numpy as np
import pandas as pd
import pybaseball as pb

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

import helperFunctions as hf
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
class DataFrameEncoder(TransformerMixin):

    def __init__(self):
        """Encode the data.

        Columns of data type object are appended in the list. After 
        appending Each Column of type object are taken dummies and 
        successively removed and two Dataframes are concated again.

        """
    def fit(self, X, y=None):
        self.object_col = []
        for col in X.columns:
            if(X[col].dtype == np.dtype('O')):
                self.object_col.append(col)
        return self

    def transform(self, X, y=None):
        dummy_df = pd.get_dummies(X[self.object_col],drop_first=False)
        X = X.drop(X[self.object_col],axis=1)
        X = pd.concat([dummy_df,X],axis=1)
        return X


In [3]:
pb.cache.enable()
pd.set_option("display.max_columns", None)

In [4]:
 pitching = pd.read_csv("data/lahman_1871-2024_csv/Pitching.csv")
 players = pd.read_csv("data/lahman_1871-2024_csv/People.csv")
 batting = pd.read_csv("data/lahman_1871-2024_csv/Batting.csv")
 appearances = pd.read_csv("data/lahman_1871-2024_csv/Appearances.csv")

In [5]:
pitchData = pb.statcast(start_dt="2024-03-01", end_dt="2025-10-31")

This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|████████████████████████████████████████████████████████████████████████████████| 477/477 [00:51<00:00,  9.26it/s]


In [6]:
# Look at pitch movement columns
mainDF = pitchData[['stand','game_date','player_name','events','description','p_throws','hit_location','bb_type','balls','strikes','pitch_type', 'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate','plate_x','plate_z','on_3b','on_2b','on_1b','outs_when_up','inning','inning_topbot','release_pos_y','at_bat_number','pitch_number','pitch_name','bat_score','fld_score','post_bat_score','post_fld_score', 'delta_home_win_exp','delta_run_exp','bat_speed','swing_length','pitcher_days_since_prev_game','n_thruorder_pitcher', 'arm_angle', 'batter','pitcher']]

In [7]:
ver = mainDF[mainDF['player_name'] == 'Verlander, Justin'].copy()

In [8]:
ver["events"].value_counts()

field_out                    470
strikeout                    221
single                       173
walk                          81
double                        58
home_run                      31
force_out                     12
sac_fly                       12
hit_by_pitch                  12
grounded_into_double_play     11
field_error                    8
triple                         3
truncated_pa                   3
sac_bunt                       3
fielders_choice                2
fielders_choice_out            2
double_play                    2
Name: events, dtype: int64

In [9]:
ver["pitch_success"] = np.where(
    (
        ver["events"].isin([
            'field_out', 'strikeout', 'field_error', 'fielders_choice',
            'double_play', 'grounded_into_double_play', 'force_out'
        ])
    ) |
    (
        ver["description"].isin(['called_strike', 'swinging_strike'])
    ) |
    (
        ver["description"].isin(['foul']) & (ver['strikes'] != 2)
    ),
    True,
    False
)

In [10]:
conditions = [
    ver["events"].isin(['double_play', 'grounded_into_double_play']),
    ver["events"].isin(['strikeout']), 
    ver["events"].isin(['field_out']),
    ver["description"].isin(['swinging_strike', 'called_strike']),
    ver["description"].isin(['foul', 'foul_tip']),
    ver["description"].isin(['ball']),
    ver["description"].isin(['hit_by_pitch']),
    ver["events"].isin(['single']),  # bad outcomes
    ver["events"].isin(['double']),  # bad outcomes
    ver["events"].isin(['triple']),  # bad outcomes
    ver["events"].isin(['home_run']),  # bad outcomes
]

values = [1.00, 0.95, 0.90, 0.75, 0.70, 0.50, 0.25, 0.25, 0.20, 0.15, 0]

ver["pitch_outcome_score"] = np.select(conditions, values, default=np.nan)

In [11]:
np.select(conditions, values, default=np.nan)

array([0.9 , 0.7 , 0.75, ..., 0.9 , 0.75, 0.5 ])

In [12]:
ver["on_3b"] = ver["on_3b"].fillna(0)
ver["on_2b"] = ver["on_2b"].fillna(0)
ver["on_1b"] = ver["on_1b"].fillna(0)

In [13]:
ver["pitch_success"].value_counts()

False    2213
True     2138
Name: pitch_success, dtype: int64

In [14]:
verClean = ver[['p_throws','stand','balls','strikes','pitch_type', 'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate','plate_x','plate_z','on_3b','on_2b','on_1b','outs_when_up','inning','release_pos_y','at_bat_number','pitch_number','pitch_name','bat_score','fld_score','pitcher_days_since_prev_game','n_thruorder_pitcher', 'arm_angle', 'batter', 'pitch_success','pitch_outcome_score']]

In [15]:
verClean = verClean.dropna()

In [16]:
train, test = train_test_split(verClean, stratify=verClean["pitch_success"])
train

Unnamed: 0,p_throws,stand,balls,strikes,pitch_type,release_speed,pfx_x,pfx_z,release_spin_rate,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,release_pos_y,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,pitcher_days_since_prev_game,n_thruorder_pitcher,arm_angle,batter,pitch_success,pitch_outcome_score
598,R,L,2,1,FF,96.4,-0.38,1.16,2430,0.34,2.83,677594,663728,668227,1,3,54.22,28,4,4-Seam Fastball,2,3,6,2,53.3,593871,False,0.25
969,R,L,2,0,FF,92.1,-0.78,1.51,2317,-0.91,3.24,0,0,0,2,4,54.53,30,3,4-Seam Fastball,0,1,5,2,55.7,668942,True,0.90
3836,R,R,0,0,CU,77.2,0.97,-1.28,2709,0.03,1.59,0,671732,0,1,1,54.45,6,1,Curveball,0,0,5,1,50.8,667670,True,0.75
2270,R,R,2,0,SL,84.6,0.21,0.38,2323,-0.12,1.87,805779,0,667670,2,1,54.5,5,3,Slider,0,0,6,1,48.6,669127,True,0.75
2906,R,L,1,0,FF,93.0,-1.02,1.23,2415,0.34,2.83,0,0,0,0,2,54.11,14,2,4-Seam Fastball,1,1,5,1,54.4,691019,True,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,R,R,0,0,CU,77.1,0.75,-1.08,2753,0.24,1.95,0,0,0,1,7,54.57,45,1,Curveball,0,0,5,3,59.5,677942,False,0.25
78,R,R,0,0,FF,92.5,-0.84,1.47,2424,-0.09,2.99,0,0,0,0,7,54.86,45,1,4-Seam Fastball,1,1,6,3,57.9,681624,True,0.75
1463,R,R,2,1,CU,77.3,0.46,-1.26,2651,-0.13,1.77,0,605346,0,2,3,54.56,21,4,Curveball,1,1,6,2,51.8,687263,True,0.75
663,R,R,2,0,FF,92.1,-0.71,1.63,2346,0.17,2.14,676572,642136,672569,2,4,54.55,37,3,4-Seam Fastball,2,6,6,2,57.8,605346,True,0.70


In [17]:
xcols = ['p_throws','stand','balls','strikes','pitch_type', 'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate','plate_x','plate_z','on_3b','on_2b','on_1b','outs_when_up','inning','release_pos_y','at_bat_number','pitch_number','bat_score','fld_score','pitcher_days_since_prev_game','n_thruorder_pitcher', 'arm_angle', 'batter']
ycol = "pitch_success"

In [18]:
numeric_cols = ['balls','strikes', 'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate','plate_x','plate_z','outs_when_up','inning','release_pos_y','at_bat_number','pitch_number','bat_score','fld_score','pitcher_days_since_prev_game','n_thruorder_pitcher', 'arm_angle']

In [19]:
categorical_cols = ['p_throws','stand','pitch_type','on_3b','on_2b','on_1b','batter']

In [30]:
USE_PCA = False  # set True to activate

if USE_PCA:
    n_components = 20 # you can tune this
    p = ColumnTransformer([
        ('scaler', StandardScaler(), numeric_cols),
        ("cat",OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int), categorical_cols),
        ('pca', PCA(n_components=n_components))
    ])
else:
    p = ColumnTransformer([
        ('scaler', StandardScaler(), numeric_cols),
        ("cat",OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int), categorical_cols)
    ])

model = GradientBoostingClassifier(n_estimators=200, max_depth=10, random_state=42)
#model = LogisticRegression(max_iter = 10000, class_weight = {False:0.5,True:0.5})

clf = Pipeline(steps=[
    ('preprocessor', p),
    ('model', model)
])


In [31]:
clf.fit(train[xcols], train[ycol])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['balls', 'strikes',
                                                   'release_speed', 'pfx_x',
                                                   'pfx_z', 'release_spin_rate',
                                                   'plate_x', 'plate_z',
                                                   'outs_when_up', 'inning',
                                                   'release_pos_y',
                                                   'at_bat_number',
                                                   'pitch_number', 'bat_score',
                                                   'fld_score',
                                                   'pitcher_days_since_prev_game',
                                                   'n_thruorder_pitcher',
                                                   'arm_angle']),
       

In [32]:
test.copy()
test["predicted"] = clf.predict(test[xcols])
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["predicted"] = clf.predict(test[xcols])


Unnamed: 0,p_throws,stand,balls,strikes,pitch_type,release_speed,pfx_x,pfx_z,release_spin_rate,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,release_pos_y,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,pitcher_days_since_prev_game,n_thruorder_pitcher,arm_angle,batter,pitch_success,pitch_outcome_score,predicted
902,R,L,1,1,FF,93.4,-0.64,1.79,2477,-0.96,1.46,592626,0,605137,1,3,54.5,21,3,4-Seam Fastball,3,0,6,2,62.2,664983,False,0.5,False
453,R,L,0,0,CU,76.9,0.69,-0.71,2538,-1.17,3.11,0,0,0,0,6,54.4,49,1,Curveball,1,3,5,3,54.6,650859,False,0.5,False
2480,R,L,2,0,CH,84.4,-1.06,0.99,1702,-0.63,3.09,0,0,621566,1,1,54.48,9,3,Changeup,0,0,5,1,48.7,686948,True,0.75,True
441,R,L,0,0,CU,76.5,0.66,-1.12,2576,0.34,2.16,0,0,0,0,6,54.54,38,1,Curveball,1,0,5,3,51.3,669701,True,0.75,True
67,R,L,2,1,FF,91.3,-0.84,1.76,2357,0.39,2.83,0,0,0,0,6,54.54,54,4,4-Seam Fastball,2,6,6,3,61.7,664913,True,0.7,True


In [33]:
actual = test[ycol]
predicted = test["predicted"]

In [34]:
labels = [False,True]
cm = confusion_matrix(actual,predicted,labels)
cm = pd.DataFrame(cm, index=labels, columns=labels)
cm



Unnamed: 0,False,True
False,382,98
True,119,368


In [35]:
accuracy_score(actual,predicted)

0.7755946225439504

In [36]:
sampleRow = pd.Series(verClean[xcols].head(1).T.squeeze())

In [44]:
verlander_pitches = pitchData.loc[pitchData['player_name'] == 'Verlander, Justin', 'pitch_type']

# get unique pitch types and convert to a list

known_pitch_types = train['pitch_type'].astype(str).unique()
pitch_types = verlander_pitches.dropna().unique().tolist()
pitch_types = [str(p) for p in pitch_types if str(p) in known_pitch_types]

In [45]:
hf.recommend_pitch_vectorized_safe(sampleRow, pitch_types, clf, numeric_cols, categorical_cols)

('ST',
 {'ST': 0.4445328614585363,
  'CU': 0.3101709380161206,
  'FF': 0.3101709380161206,
  'CH': 0.3101709380161206,
  'SL': 0.3101709380161206,
  'SI': 0.3101709380161206})

In [None]:
### TODO: fix this nan issue in the helper functions
### TODO: split the notebook into a few different python files and call them from the notebook
###       for example, create an ETL .py, a model training .py, and a predictor .py.