In [None]:
import numpy as np
import pandas as pd
import pybaseball as pb


from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

import helperFunctions as hf
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
class DataFrameEncoder(TransformerMixin):
    """One-hot encode the object-dtype columns of a DataFrame.

    ``fit`` records which columns have object dtype and the categories seen
    in each of them. ``transform`` replaces those columns with indicator
    (dummy) columns, placed first, followed by the remaining columns
    unchanged.

    Because the categories are learned in ``fit``, ``transform`` always emits
    the same indicator columns in the same order, even when the frame being
    transformed lacks some categories or contains new ones (a value not seen
    during ``fit`` is encoded as all zeros).
    """

    def __init__(self):
        """Create an unfitted encoder; it takes no parameters."""

    def fit(self, X, y=None):
        """Learn the object-dtype columns of ``X`` and their categories.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to learn the encoding from.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        DataFrameEncoder
            The fitted encoder (``self``).
        """
        self.object_col = [
            col for col in X.columns if X[col].dtype == np.dtype('O')
        ]
        # Same category inference get_dummies performs internally, so the
        # indicator names and their order match a plain get_dummies call
        # on the fitting frame.
        self.categories_ = {
            col: X[col].astype("category").cat.categories
            for col in self.object_col
        }
        return self

    def transform(self, X, y=None):
        """Replace the learned object columns of ``X`` with indicator columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns recorded by ``fit``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Indicator columns first, then the untouched remaining columns.
        """
        if not self.object_col:
            # Nothing to encode; hand back a copy so callers can mutate it.
            return X.copy()

        # Pin every column to the categories seen in fit so the output layout
        # does not depend on which values happen to occur in this frame.
        pinned = X[self.object_col].astype({
            col: pd.CategoricalDtype(categories=self.categories_[col])
            for col in self.object_col
        })
        dummy_df = pd.get_dummies(pinned, drop_first=False)
        remainder = X.drop(columns=self.object_col)
        return pd.concat([dummy_df, remainder], axis=1)


In [None]:
# Notebook-wide settings.
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)
# Turn on pybaseball's cache.
pb.cache.enable()

In [None]:
 # Load four tables from the local Lahman CSV export (paths are relative to
 # the notebook's working directory).
 # NOTE(review): every line in this cell starts with a stray leading space;
 # it runs in Jupyter but would be an IndentationError in a plain .py file.
 # NOTE(review): none of these four frames is referenced by a later cell
 # visible in this notebook - confirm whether the cell is still needed.
 pitching = pd.read_csv("data/lahman_1871-2024_csv/Pitching.csv")
 players = pd.read_csv("data/lahman_1871-2024_csv/People.csv")
 batting = pd.read_csv("data/lahman_1871-2024_csv/Batting.csv")
 appearances = pd.read_csv("data/lahman_1871-2024_csv/Appearances.csv")

In [None]:
# Pull pitch-level Statcast data for 2024-03-01 through 2025-10-31 via pybaseball.
# NOTE(review): this is presumably a large, slow download on a cold cache -
# consider persisting the result locally so Restart & Run All stays practical.
pitchData = pb.statcast(start_dt="2024-03-01", end_dt="2025-10-31")

In [None]:
# Look at pitch movement columns
mainDF = pitchData[['stand','game_date','player_name','events','description','p_throws','hit_location','bb_type','balls','strikes','pitch_type', 'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate','plate_x','plate_z','on_3b','on_2b','on_1b','outs_when_up','inning','inning_topbot','release_pos_y','at_bat_number','pitch_number','pitch_name','bat_score','fld_score','post_bat_score','post_fld_score', 'delta_home_win_exp','delta_run_exp','bat_speed','swing_length','pitcher_days_since_prev_game','n_thruorder_pitcher', 'arm_angle', 'batter','pitcher']]

In [None]:
# Independent copy of the rows whose player_name is Justin Verlander, so the
# columns added in later cells do not write into a slice of mainDF.
is_verlander = mainDF['player_name'] == 'Verlander, Justin'
ver = mainDF.loc[is_verlander].copy()

In [None]:
# Frequency of each value in the `events` column for the selected pitcher.
ver["events"].value_counts()

In [None]:
# Binary target: a pitch counts as a success when any of the three masks
# below holds.

# 1) The `events` value is one of the results treated as favorable here.
success_events = [
    'field_out', 'strikeout', 'field_error', 'fielders_choice',
    'double_play', 'grounded_into_double_play', 'force_out'
]
favorable_event = ver["events"].isin(success_events)

# 2) The pitch itself was a called or swinging strike.
strike_taken = ver["description"].isin(['called_strike', 'swinging_strike'])

# 3) A foul ball, but only when the count is not already at two strikes.
foul_before_two_strikes = ver["description"].isin(['foul']) & (ver['strikes'] != 2)

ver["pitch_success"] = np.where(
    favorable_event | strike_taken | foul_before_two_strikes,
    True,
    False
)

In [None]:
# Graded outcome score per pitch. Each entry pairs a boolean mask with the
# score it earns; np.select uses the FIRST matching mask, so the order of
# this list is significant. Pitches matching no mask get NaN.
scored_outcomes = [
    (ver["events"].isin(['double_play', 'grounded_into_double_play']), 1.00),
    (ver["events"].isin(['strikeout']), 0.95),
    (ver["events"].isin(['field_out']), 0.90),
    (ver["description"].isin(['swinging_strike', 'called_strike']), 0.75),
    (ver["description"].isin(['foul', 'foul_tip']), 0.70),
    (ver["description"].isin(['ball']), 0.50),
    (ver["description"].isin(['hit_by_pitch']), 0.25),
    # bad outcomes
    (ver["events"].isin(['single']), 0.25),
    (ver["events"].isin(['double']), 0.20),
    (ver["events"].isin(['triple']), 0.15),
    (ver["events"].isin(['home_run']), 0),
]

# Split back into the two parallel lists np.select expects (later cells reuse
# both names).
conditions = [mask for mask, _ in scored_outcomes]
values = [score for _, score in scored_outcomes]

ver["pitch_outcome_score"] = np.select(conditions, values, default=np.nan)

In [None]:
# Display the raw score array; this repeats the np.select call used to build
# ver["pitch_outcome_score"] and does not store the result.
np.select(conditions, values, default=np.nan)

In [None]:
# A missing value in a base-runner column is replaced with 0 for each of the
# three bases.
for base_col in ("on_3b", "on_2b", "on_1b"):
    ver[base_col] = ver[base_col].fillna(0)

In [None]:
# Class balance of the binary target (counts of True vs. False).
ver["pitch_success"].value_counts()

In [None]:
# Narrow the frame to the candidate model inputs plus the two target columns.
verClean = ver[[
    'p_throws', 'stand', 'balls', 'strikes', 'pitch_type',
    'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate',
    'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
    'outs_when_up', 'inning', 'release_pos_y',
    'at_bat_number', 'pitch_number', 'pitch_name',
    'bat_score', 'fld_score',
    'pitcher_days_since_prev_game', 'n_thruorder_pitcher',
    'arm_angle', 'batter',
    # targets
    'pitch_success', 'pitch_outcome_score',
]]

In [None]:
# Discard every row that has a missing value in ANY column of verClean.
# NOTE(review): that includes pitch_name and pitch_outcome_score, so pitches
# whose outcome matched no scoring condition (NaN score) are removed here too.
verClean = verClean.dropna(axis=0, how="any")

In [None]:
# Stratify on the binary target so both splits keep the same class balance.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts; 42 matches the seed used for the model.
train, test = train_test_split(
    verClean,
    stratify=verClean["pitch_success"],
    random_state=42,
)
# Preview only the first rows instead of dumping the whole training frame.
train.head()

In [None]:
# Feature columns fed to the classifier, in the order they are passed.
xcols = [
    'p_throws', 'stand', 'balls', 'strikes', 'pitch_type',
    'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate',
    'plate_x', 'plate_z',
    'on_3b', 'on_2b', 'on_1b',
    'outs_when_up', 'inning', 'release_pos_y',
    'at_bat_number', 'pitch_number',
    'bat_score', 'fld_score',
    'pitcher_days_since_prev_game', 'n_thruorder_pitcher',
    'arm_angle', 'batter',
]

# Name of the binary target column.
ycol = "pitch_success"

In [None]:
USE_PCA = False  # set True to activate

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Pick whichever keyword the
# installed version accepts so the encoder still returns a dense array
# (StandardScaler with mean-centering and PCA need dense input).
dense_kwarg = (
    "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
)

# NOTE(review): this encoder one-hot encodes EVERY column it is given,
# continuous measurements included, so each distinct numeric value becomes its
# own category and unseen values encode as all zeros. Consider a
# ColumnTransformer that one-hot encodes only the categorical columns.
preprocess_steps = [
    ("onehot", OneHotEncoder(handle_unknown='ignore', dtype=int, **{dense_kwarg: False})),
    ('scaler', StandardScaler()),
]

if USE_PCA:
    n_components = 20 # you can tune this
    preprocess_steps.append(('pca', PCA(n_components=n_components)))

# Preprocessing sub-pipeline: one-hot -> scale (-> optional PCA).
p = Pipeline(preprocess_steps)

model = GradientBoostingClassifier(n_estimators=200, max_depth=10, random_state=42)
#model = LogisticRegression(max_iter = 10000, class_weight = {False:0.5,True:0.5})

# Full estimator: preprocessing followed by the classifier.
clf = Pipeline(steps=[
    ('preprocessor', p),
    ('model', model)
])


In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
clf.fit(train[xcols], train[ycol])

In [None]:
# Rebind `test` to an explicit copy before adding a column. The original cell
# called test.copy() and discarded the result, so the assignment below still
# wrote into the frame returned by train_test_split (SettingWithCopyWarning).
test = test.copy()
# Predict from the feature columns only, which keeps this cell re-runnable
# even after "predicted" has been added.
test["predicted"] = clf.predict(test[xcols])
test.head()

In [None]:
# Ground-truth labels and model predictions for the held-out split.
actual, predicted = test[ycol], test["predicted"]

In [None]:
# Confusion matrix with rows = actual class and columns = predicted class,
# both ordered [False, True].
labels = [False, True]
# `labels` must be passed by keyword: scikit-learn >= 1.0 rejects it as a
# third positional argument with a TypeError.
cm = confusion_matrix(actual, predicted, labels=labels)
cm = pd.DataFrame(cm, index=labels, columns=labels)
cm

In [None]:
# Share of held-out pitches whose predicted label matches the actual label.
accuracy_score(actual,predicted)

In [None]:
# Take the first cleaned pitch as a sample situation: a one-row frame is
# transposed and squeezed into a Series indexed by feature name.
first_pitch = verClean[xcols].head(1)
sampleRow = pd.Series(first_pitch.T.squeeze())

In [None]:
# pitch_type value of every pitch recorded for Justin Verlander
verlander_pitches = pitchData.loc[pitchData['player_name'] == 'Verlander, Justin', 'pitch_type']

# Distinct pitch-type codes as a plain list, with missing values removed.
# (.notnull() would yield a boolean mask, so .unique() on it returned
# [True, False] instead of the pitch types.)
pitch_types = verlander_pitches.dropna().unique().tolist()

In [None]:
# Feature columns passed to the recommendation helper as numeric.
numeric_cols = [
    'balls', 'strikes',
    'release_speed', 'pfx_x', 'pfx_z', 'release_spin_rate',
    'plate_x', 'plate_z',
    'outs_when_up', 'inning', 'release_pos_y',
    'at_bat_number', 'pitch_number',
    'bat_score', 'fld_score',
    'pitcher_days_since_prev_game', 'n_thruorder_pitcher',
    'arm_angle',
]

In [None]:
# Feature columns passed to the recommendation helper as categorical.
categorical_cols = [
    'p_throws',
    'stand',
    'pitch_type',
    'on_3b',
    'on_2b',
    'on_1b',
    'batter',
]

In [None]:
# Ask the project helper for a pitch recommendation for the sample situation,
# passing the candidate pitch types, the fitted pipeline, and the column lists.
# NOTE(review): the helper is defined in helperFunctions, which is not visible
# here - confirm what it expects for each argument and what it returns.
hf.recommend_pitch_vectorized(sampleRow, pitch_types, clf, numeric_cols, categorical_cols)

In [None]:
### TODO: fix the NaN issue in the helper functions.
### TODO: split the notebook into a few Python modules and call them from the notebook,
###       for example an ETL module, a model-training module, and a predictor module.