<a href="https://colab.research.google.com/github/seanjhannon/pitch-prediction/blob/main/pitch_prediction_time_series_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Load in kershaw data

# Lots of transforming work - create a pipeline to prepare every possible column for use in time series modeling

**Transforming**
* Create ID column

* Feature Engineering - what variables should we modify / create? (focus on modify first)

* Scale Numeric

* OHE Categorical


# Data Loading

## Imports

In [6]:
pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/426.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/426.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.3.0-py3-none-any.whl (354 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.4/354.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyjwt[cry

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt


# pybaseball imports
from pybaseball import  playerid_lookup
from pybaseball import  statcast_pitcher
from pybaseball import statcast_batter
from pybaseball import statcast_running
from pybaseball import playerid_reverse_lookup
from pybaseball import batting_stats
from pybaseball import get_splits
from pybaseball import team_game_logs

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix


In [8]:
def get_pitches(
    fname: str,
    lname: str,
):
  """
  Returns all pitches for a specified pitcher between the start of hhis career (or 2008) and today.

  Params
    fname (string): Pitcher's first name.
    lname (string): Pitcher's last name.

  Returns
    pitches (pandas DataFrame): DataFrame of statcast data for all pitches in the player's career.
  """
  # Search for player 'lookup' - returns their metadata
  player_lookup = playerid_lookup(lname, fname)

  if len(player_lookup) != 1:
    print(f'Player Lookup returned {len(player_lookup)} results. Please double-check inputs and try again.')
    return None

  # Collect the player's lmbam_id and start and end of his career
  player_id = player_lookup['key_mlbam'][0]
  player_career_start = int(player_lookup['mlb_played_first'][0])
  player_career_end = int(player_lookup['mlb_played_last'][0])

  # Separately define the start and end dates for our pitching stats query and format them as str
  pitching_start = dt.date(player_career_start, 1, 1).strftime('%Y-%m-%d') # Jan 1 of their first career year
  pitching_end = dt.date.today().strftime('%Y-%m-%d') # Ok to overshoot the end date, maybe revisit this later

  pitches = statcast_pitcher(start_dt=pitching_start,
                             end_dt=pitching_end,
                             player_id=player_id)

  if len(pitches) == 0:
    print(f'No pitches found for {fname} {lname}')

  return pitches

## Load Kershaw Data

# Feature Engineering


## Pipeline

Custom transformers

shift data up 1 row from target

Transform numeric and categorical

In [11]:
# FULL PIPELINE

# Load in data
raw_data = get_pitches('clayton', 'kershaw')
raw_data = raw_data.dropna(subset='pitch_type') # remove rows where no pitch was thrown

Gathering Player Data


  df = pd.read_csv(io.StringIO(data.text))


In [47]:


# Define X and y
target = 'pitch_type'

X = raw_data.drop(columns=[target])
y = raw_data[target]

# Impute the batter column /////////////////////////////////

# Get start and end of pitcher careers
start_date = raw_data['game_date'].sort_values().iloc[0]
end_date = raw_data['game_date'].sort_values().iloc[-1]
# Fetch batting stats range
from pybaseball import batting_stats_range
batter_stats = batting_stats_range(start_dt=start_date, end_dt = end_date
                                     )[['Name', 'BA']]
batter_stats['Name'] = batter_stats['Name'].apply(str.lower)
batter_stats
# Collect all batter Ids
batters_faced = raw_data['batter'].unique()
# Reverse lookup on batter IDs
from pybaseball import playerid_reverse_lookup
batters_lookup = playerid_reverse_lookup(player_ids = batters_faced)
batters_lookup['Name'] = batters_lookup['name_first'] + ' ' + batters_lookup['name_last']
# Merge lookup with stats range left on a concat name column
ids_with_ba = batters_lookup[['key_mlbam', 'Name']
                             ].merge(batter_stats, on='Name', how='left') # sub 20% NaN - largely due to accents
# Use a dictionary for optimal time complexity given a large raw data frame
id_to_ba = dict(zip(ids_with_ba['key_mlbam'], ids_with_ba['BA']))
# Replacing id withh avg - pulling batters from raw_data allows this cell to be rerun without breaking
X['batter'] = raw_data['batter'].map(id_to_ba)

# Identify and Remove Unwanted columns from X
unwanted_cols = [
    'game_date', 'sv_id', # Unlikely to be helpful, high cardinality
    'player_name', 'pitcher', # Redundant
    'spin_rate_deprecated', 'break_angle_deprecated', 'spin_dir', # columns marked as deprecated in the documentation
    'break_length_deprecated', 'tfs_deprecated', 'tfs_zulu_deprecated','umpire',
]
nlp_cols = [
    'des'
]

X = X.drop(columns=unwanted_cols + nlp_cols, # avoid handling for now
           errors='ignore')

# Validate data in y, replace typos

possible_pitch_types = [
  'AB' ,  'AS' , 'CH' , 'CU' , 'EP' , 'FC' , 'FF' , 'FO' , 'FS' , 'FT' , 'GY' ,
  'IN' , 'KC' , 'KN' , 'NP' , 'PO' , 'SC' , 'SI' , 'SL' , 'UN'
]


def validate_and_replace(target_series, acceptable_values):
    return target_series.apply(lambda x: x if x in acceptable_values else 'FF')

_validated = validate_and_replace(y, possible_pitch_types)

# Label Encoding
from sklearn.preprocessing import LabelEncoder
leEncoder = LabelEncoder()
leEncoder.fit(possible_pitch_types)

y_encoded = leEncoder.transform(y_validated)

# Shift up 1
X_shifted = X.shift(1)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X_shifted, y_encoded, test_size=0.2, random_state=42)

# Scale numeric - defined as all columns of int64 or float64 type - most nan numeric values are for stats only tracked in the case of a hit
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

from sklearn.impute import SimpleImputer
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


col_transformer = ColumnTransformer(transformers=[
    # Impute batters first
    ('num_pipeline', num_pipeline, numeric_features),
    ('cat_pipeline', cat_pipeline, categorical_features),
    # Add in runners last
],
    remainder='drop'
)

from sklearn.pipeline import make_pipeline
pipe_final = Pipeline(steps=[
    ('preprocessor', col_transformer)
])



pipe_final.fit(X_train, y_train)


ohe_feature_names = pipe_final['preprocessor'].transformers_[1][1]\
                    ['ohe'].get_feature_names_out(categorical_features)

final_cols = list(numeric_features) + list(ohe_feature_names)

In [46]:
X_train_transformed['on_3b']

0       -0.255556
1       -0.255556
2        3.265753
3        3.337361
4       -0.255556
           ...   
35279    3.052301
35280   -0.255556
35281   -0.255556
35282    2.101354
35283   -0.255556
Name: on_3b, Length: 35284, dtype: float64

Unnamed: 0,release_speed,release_pos_x,release_pos_z,batter,zone,hit_location,balls,strikes,game_year,pfx_x,...,pitch_name_Sinker,pitch_name_Slider,pitch_name_Split-Finger,if_fielding_alignment_Infield shade,if_fielding_alignment_Infield shift,if_fielding_alignment_Standard,if_fielding_alignment_Strategic,of_fielding_alignment_Extreme outfield shift,of_fielding_alignment_Standard,of_fielding_alignment_Strategic
0,0.944163,5.357641,-1.700764,0.862268,0.783331,-0.473125,1.263942,1.315015,0.280257,2.004440,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-2.224217,-0.673141,0.446678,0.809029,-1.578276,-0.473125,-0.871466,0.105385,0.746777,-1.166526,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.049060,0.064247,0.388639,0.159510,1.019492,-0.473125,1.263942,0.105385,0.046997,-1.412337,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.176932,0.617288,0.910990,0.340523,1.019492,-0.473125,-0.871466,1.315015,-0.186263,-1.584405,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.560548,0.380270,-0.888218,0.606720,-1.578276,-0.473125,2.331646,-1.104245,-0.886043,0.013368,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35279,1.143075,-0.225441,-0.191750,0.383115,0.547171,-0.473125,-0.871466,0.105385,-0.652783,1.316168,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
35280,-0.732379,0.301265,1.897653,0.265988,-0.869793,-0.473125,1.263942,1.315015,-1.119303,-1.068201,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
35281,-0.021980,0.116918,-0.539984,0.425706,1.019492,-0.473125,0.196238,-1.104245,1.913078,-1.117364,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
35282,0.631587,0.090582,-0.133711,0.691903,1.019492,-0.473125,-0.871466,-1.104245,-0.419523,0.554153,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [41]:
list(numeric_features) + list(ohe_feature_names)

['release_speed',
 'release_pos_x',
 'release_pos_z',
 'batter',
 'zone',
 'hit_location',
 'balls',
 'strikes',
 'game_year',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'hc_x',
 'hc_y',
 'fielder_2',
 'vx0',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'hit_distance_sc',
 'launch_speed',
 'launch_angle',
 'effective_speed',
 'release_spin_rate',
 'release_extension',
 'game_pk',
 'pitcher.1',
 'fielder_2.1',
 'fielder_3',
 'fielder_4',
 'fielder_5',
 'fielder_6',
 'fielder_7',
 'fielder_8',
 'fielder_9',
 'release_pos_y',
 'estimated_ba_using_speedangle',
 'estimated_woba_using_speedangle',
 'woba_value',
 'woba_denom',
 'babip_value',
 'iso_value',
 'launch_speed_angle',
 'at_bat_number',
 'pitch_number',
 'home_score',
 'away_score',
 'bat_score',
 'fld_score',
 'post_away_score',
 'post_home_score',
 'post_bat_score',
 'post_fld_score',
 'spin_axis',
 'delta_home_win_exp',
 'delta_run_exp',
 'events_catcher_

In [36]:
ohe_feature_names.tolist()

['events_catcher_interf',
 'events_caught_stealing_2b',
 'events_caught_stealing_home',
 'events_double',
 'events_double_play',
 'events_field_error',
 'events_field_out',
 'events_fielders_choice',
 'events_fielders_choice_out',
 'events_force_out',
 'events_grounded_into_double_play',
 'events_hit_by_pitch',
 'events_home_run',
 'events_intent_walk',
 'events_pickoff_caught_stealing_3b',
 'events_sac_bunt',
 'events_sac_fly',
 'events_single',
 'events_strikeout',
 'events_strikeout_double_play',
 'events_triple',
 'events_walk',
 'description_ball',
 'description_blocked_ball',
 'description_bunt_foul_tip',
 'description_called_strike',
 'description_foul',
 'description_foul_bunt',
 'description_foul_tip',
 'description_hit_by_pitch',
 'description_hit_into_play',
 'description_intent_ball',
 'description_missed_bunt',
 'description_swinging_strike',
 'description_swinging_strike_blocked',
 'game_type_D',
 'game_type_F',
 'game_type_L',
 'game_type_R',
 'game_type_S',
 'game_type_

In [4]:
class BatterEncoder(BaseEstimator, TransformerMixin):
  # Creates column batter_avg, drops 'batter'

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    X['batter'] = X['batter'].apply(get_avg)
    #X = X.drop('batter')
    return X

# Model Testing
Start with a baseline model that predicts the most recent pitch