In [None]:
#!pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyGithub-2.6.1-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected

In [None]:
## Importing Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [None]:
## Loading in Data
sale = pd.read_csv('/content/sale_2024_cleaned.csv')
skubal = pd.read_csv('/content/skubal_2024_cleaned.csv')

In [None]:
sale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2815 entries, 0 to 2814
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pitch_type         2815 non-null   object 
 1   release_speed      2815 non-null   float64
 2   release_pos_x      2815 non-null   float64
 3   release_pos_y      2815 non-null   float64
 4   release_pos_z      2815 non-null   float64
 5   release_spin_rate  2815 non-null   int64  
 6   spin_axis          2815 non-null   int64  
 7   pfx_x              2815 non-null   float64
 8   pfx_z              2815 non-null   float64
 9   plate_x            2815 non-null   float64
 10  plate_z            2815 non-null   float64
 11  sz_top             2815 non-null   float64
 12  sz_bot             2815 non-null   float64
 13  balls              2815 non-null   int64  
 14  strikes            2815 non-null   int64  
 15  outs_when_up       2815 non-null   int64  
 16  inning             2815 

In [None]:
skubal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3098 entries, 0 to 3097
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pitch_type         3098 non-null   object 
 1   release_speed      3098 non-null   float64
 2   release_pos_x      3098 non-null   float64
 3   release_pos_y      3098 non-null   float64
 4   release_pos_z      3098 non-null   float64
 5   release_spin_rate  3098 non-null   float64
 6   spin_axis          3098 non-null   float64
 7   pfx_x              3098 non-null   float64
 8   pfx_z              3098 non-null   float64
 9   plate_x            3098 non-null   float64
 10  plate_z            3098 non-null   float64
 11  sz_top             3098 non-null   float64
 12  sz_bot             3098 non-null   float64
 13  balls              3098 non-null   int64  
 14  strikes            3098 non-null   int64  
 15  outs_when_up       3098 non-null   int64  
 16  inning             3098 

# Cleaning and Feature Engineering

Before fully cleaning, I wanted to do Exploratory Data Analysis to truly gain the full understanding of the data. I can now look into outliers, one-hot encode the data, and scale it. From there, I can do any more feature engineering I find necessary.

## Outliers

To deal with outliers, I will use the Inter-Quartile range, considering the outliers anything outside of the 1st and 3rd quartiles.

The columns I am going to check outliers for are:



*   Release_speed: Extreme values could be common
*   Release positions: Could indicate data error, random unusual mechanics, or misclassification
* Release Spin Rate and Axis: Tracking is a bit harder with spin, could be a tracking error.



In [None]:
columns_to_check = ['release_speed', 'release_pos_x', 'release_pos_z', 'release_pos_y', 'release_spin_rate', 'spin_axis']

def outliers (data, columns):
  for column in columns:
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
  return data

sale_outliers = outliers(sale, columns_to_check)
skubal_outliers = outliers(skubal, columns_to_check)

In [None]:
## Investigating Sale's Outliers
sale_outliers

Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_y,release_pos_z,release_spin_rate,spin_axis,pfx_x,pfx_z,plate_x,...,p_throws,bat_score,fld_score,at_bat_number,pitch_number,game_pk,batter,pitcher,description,events
0,FF,93.5,3.24,53.82,5.29,2192,114,1.21,0.78,-0.64,...,L,0,2,6,1,745602,656941,519242,called_strike,batter still up
1,FF,94.8,3.30,53.99,5.32,2289,121,1.23,1.21,0.23,...,L,0,2,6,2,745602,656941,519242,hit_into_play,home_run
2,FF,93.7,3.43,53.88,5.23,2246,116,1.17,1.24,1.27,...,L,1,2,7,1,745602,607208,519242,ball,batter still up
3,FF,93.2,3.45,54.07,5.29,2186,119,1.08,1.06,0.88,...,L,1,2,7,2,745602,607208,519242,called_strike,batter still up
4,SL,78.3,3.54,54.40,5.11,2414,314,-1.33,-0.59,-0.65,...,L,1,2,7,3,745602,607208,519242,swinging_strike,batter still up
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2810,FF,90.7,2.92,54.00,5.16,2238,116,1.32,0.87,-0.67,...,L,2,8,46,2,746663,668715,519242,foul,batter still up
2811,SL,73.3,2.99,54.29,5.01,2160,310,-0.84,-0.32,0.05,...,L,2,8,46,3,746663,668715,519242,foul,batter still up
2812,FF,90.3,2.87,53.95,5.31,2132,116,1.13,1.06,0.82,...,L,2,8,46,4,746663,668715,519242,ball,batter still up
2813,FF,92.5,2.96,53.93,5.16,2255,114,1.27,0.92,1.65,...,L,2,8,46,5,746663,668715,519242,ball,batter still up


2,712 out of 2,815 observations were considered outliers. Because of that, I am actually not going to delete any.

In [None]:
## Investigating Skubal's outliers
skubal_outliers

Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_y,release_pos_z,release_spin_rate,spin_axis,pfx_x,pfx_z,plate_x,...,p_throws,bat_score,fld_score,at_bat_number,pitch_number,game_pk,batter,pitcher,description,events
0,FF,95.7,1.64,54.13,6.47,2228.775348,147.887674,0.71,1.47,-0.85,...,L,0,0,4,1,746821,643217,669373,ball,batter still up
1,SI,94.5,1.95,54.13,6.30,2117.539634,146.057927,1.41,1.05,0.50,...,L,0,0,4,2,746821,643217,669373,foul,batter still up
2,SL,86.4,1.89,54.13,6.38,2108.286682,164.611738,-0.12,0.48,-0.99,...,L,0,0,4,3,746821,643217,669373,ball,batter still up
3,FF,96.1,1.81,54.14,6.36,2228.775348,147.887674,0.54,1.46,-0.43,...,L,0,0,4,4,746821,643217,669373,swinging_strike,batter still up
4,FF,98.0,1.81,54.13,6.33,2228.775348,147.887674,0.81,1.38,0.33,...,L,0,0,4,5,746821,643217,669373,foul,batter still up
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,SL,87.3,1.58,54.02,6.21,2155.000000,163.000000,-0.31,0.63,-0.44,...,L,5,1,45,2,775326,678877,669373,hit_into_play,field_out
3093,SL,86.3,1.51,54.11,6.29,1976.000000,170.000000,-0.05,0.59,1.06,...,L,5,2,53,1,775326,666310,669373,swinging_strike,batter still up
3095,CH,87.3,1.33,54.17,6.28,1619.000000,137.000000,1.09,0.48,0.04,...,L,5,2,53,3,775326,666310,669373,swinging_strike_blocked,strikeout
3096,SI,96.1,1.28,54.08,6.31,1918.000000,157.000000,1.09,1.10,0.00,...,L,5,2,54,1,775326,665926,669373,hit_into_play,field_out


Again, 2,812 of Skubal's 3,098 pitches were marked as outliers. For the same reason as above, I am not going to delete any of these.

## One-Hot Encoding

RNNs require one-hot encoding to run because it expects numerical input. Any variable that is categorical will be one-hot encoded. It is especially important for the target variable: pitch_type.

In [None]:
columns_to_encode = ['pitch_type', 'inning_topbot', 'stand', 'p_throws']
def one_hot_encode (data, columns):
  for column in columns:
    data = pd.get_dummies(data, columns=[column], drop_first=True, dtype=int)
  return data

sale_encoded = one_hot_encode(sale, columns_to_encode)
skubal_encoded = one_hot_encode(skubal, columns_to_encode)

## Sequence Commonality

For modeling, all sequences need to be the same length. Because not every at-bat is the same, I need to do this manually. I will see how long at-bats are per pitcher and then remove any at-bats that aren't as long or remove pitches that go beyond this number.

In [None]:
def sequence_length(data):
  sequence_lengths = data.groupby('pitch_number')['pitch_number'].count()
  return sequence_lengths

sequence_length(sale_encoded)

Unnamed: 0_level_0,pitch_number
pitch_number,Unnamed: 1_level_1
1,702
2,631
3,533
4,410
5,286
6,135
7,70
8,31
9,11
10,5


We see the biggest drop off between 4 and 5 pitches and 5 and 6 pitches. I think for this, I want to stick to 4 pitches. Having 410 sequences to train on is much better than 286 or 135. Using 4 pitches balances having enough data and enough predictive power. Because this is just immediate predictions, I do not need longer sequences.

In [None]:
## Getting all the sequences to be length 4, and dropping the sequences that do not have 4 pitches
sale_encoded = sale_encoded[sale_encoded['pitch_number'] <= 4]

def short_sequences(data, pitch_num):
    sequences = data.groupby(['game_pk', 'at_bat_number'])['pitch_number'].count()

    # Filter to only keep combinations where the number of pitches is exactly 4
    valid_combinations = sequences[sequences == pitch_num]

    # Filter the original data to keep only the rows with these valid combinations
    filtered_data = data[data.set_index(['game_pk', 'at_bat_number']).index.isin(valid_combinations.index)]

    return filtered_data


sale_seq = short_sequences(sale_encoded, 4)

Doing the same for Tarik Skubal.

In [None]:
sequence_length(skubal_encoded)

Unnamed: 0_level_0,pitch_number
pitch_number,Unnamed: 1_level_1
1,819
2,734
3,605
4,437
5,271
6,140
7,55
8,24
9,9
10,3


4 also seems to be the best number here.

In [None]:
skubal_seq = short_sequences(skubal_encoded, 4)

## Extra Variables

There were some variables that were included for ease of EDA that aren't needed when actually modeling. These are variables that do not effect the sequence of the pitch and what happens, like variables that identify the game, but not the actual state of the game.

The dropped variables include:

* game_pk
* batter
* pitcher (we know who it is)
* at_bat_number
* bat_score
* fld_score
* events (data leakage)
* description (potential data leakage)

In [None]:
columns_to_drop = ['game_pk', 'batter', 'pitcher', 'at_bat_number', 'bat_score', 'fld_score', 'events', 'description']

sale_cleaned = sale_seq.drop(columns=columns_to_drop)
skubal_cleaned = skubal_seq.drop(columns=columns_to_drop)

## Normalizing Data

Normalizing/Scaling the data is important for RNNs for multiple reasons. It helps with efficiency, activation functions, and vanishing and exploding gradients. I am choosing to use the standard scaler based on personal preference of what I have felt works best in my past experience.

Note: I will not be normalizing pitch_number, as it represents the sequence number that I will later be testing.

In [None]:
## Initializing Scaler
scaler = StandardScaler()

## Separating 'pitch_number'
sale_pitch_number = sale_cleaned['pitch_number']
skubal_pitch_number = skubal_cleaned['pitch_number']

## Applying Scaler (excluding 'pitch_number')
sale_scaled = scaler.fit_transform(sale_cleaned.drop(columns=['pitch_number']))
skubal_scaled = scaler.fit_transform(skubal_cleaned.drop(columns=['pitch_number']))

## Back to a DataFrame and Reattach 'pitch_number'
sale_scaled = pd.DataFrame(sale_scaled, columns=sale_cleaned.columns.drop('pitch_number'))
skubal_scaled = pd.DataFrame(skubal_scaled, columns=skubal_cleaned.columns.drop('pitch_number'))

sale_scaled['pitch_number'] = sale_pitch_number.values
skubal_scaled['pitch_number'] = skubal_pitch_number.values


In [None]:
sale_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632 entries, 0 to 1631
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   release_speed      1632 non-null   float64
 1   release_pos_x      1632 non-null   float64
 2   release_pos_y      1632 non-null   float64
 3   release_pos_z      1632 non-null   float64
 4   release_spin_rate  1632 non-null   float64
 5   spin_axis          1632 non-null   float64
 6   pfx_x              1632 non-null   float64
 7   pfx_z              1632 non-null   float64
 8   plate_x            1632 non-null   float64
 9   plate_z            1632 non-null   float64
 10  sz_top             1632 non-null   float64
 11  sz_bot             1632 non-null   float64
 12  balls              1632 non-null   float64
 13  strikes            1632 non-null   float64
 14  outs_when_up       1632 non-null   float64
 15  inning             1632 non-null   float64
 16  pitch_type_FF      1632 

In [None]:
skubal_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3098 entries, 0 to 3097
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   release_speed      3098 non-null   float64
 1   release_pos_x      3098 non-null   float64
 2   release_pos_y      3098 non-null   float64
 3   release_pos_z      3098 non-null   float64
 4   release_spin_rate  3098 non-null   float64
 5   spin_axis          3098 non-null   float64
 6   pfx_x              3098 non-null   float64
 7   pfx_z              3098 non-null   float64
 8   plate_x            3098 non-null   float64
 9   plate_z            3098 non-null   float64
 10  sz_top             3098 non-null   float64
 11  sz_bot             3098 non-null   float64
 12  balls              3098 non-null   float64
 13  strikes            3098 non-null   float64
 14  outs_when_up       3098 non-null   float64
 15  inning             3098 non-null   float64
 16  pitch_type_FF      3098 

## Final Important Step

Each sequence (1, 2, 3, 4) need a unique identifier for modeling. So, I need to add a column, seq_id, that does this.

In [58]:
def add_seq_id(data):
  seq_id = 1
  data['seq_id'] = 0

  for i in range(len(data)):
    if i % 4 == 0:
      data.loc[i, 'seq_id'] = seq_id
      seq_id += 1
    else:
      data.loc[i, 'seq_id'] = seq_id - 1
  return data

sale_scaled = add_seq_id(sale_scaled)
skubal_scaled = add_seq_id(skubal_scaled)

# What's Next?

Yay! The data is finally ready to be modeled! We encoded the variables, "dealt" with outliers, and normalized the data. I want to note that I choose to not perform Principal Component Analysis. With only 19 predictor variables, PCA is not necessary.

Now that the data is ready...


*   RNN Modeling
  * Hyperparameter Tuning
  * Model Validation and Evaluation
  * Model Comparison



In [60]:
## Printing the readied data to .csvs

sale_scaled.to_csv('sale_modeling.csv', index=False)
skubal_scaled.to_csv('skubal_modeling.csv', index=False)