This notebook works with pitch and batted-ball variables for every ball in play in the
2021 MLB season. The data were gathered from BaseballSavant.com. It subsets the home runs and standardizes their numeric columns.

## 1. Importing data and libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.preprocessing import StandardScaler

In [None]:
# Folder that holds the input and output pickle files. Set the
# BASEBALL_DATA_DIR environment variable to point at your own copy of the
# data; when it is unset the original hard-coded location is used, so
# existing setups keep working unchanged.
path = os.environ.get('BASEBALL_DATA_DIR', r'/Users/yourname/Datasets')

In [None]:
# Load the balls-in-play dataset from the data folder.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle(os.path.join(path, 'baseball.pkl'))

In [None]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,events,zone,stand,p_throws,...,batter_home_away,contact,runner_1b,runner_2b,runner_3b,scoring_play,Latitude,Longitude,park_city,park_state
0,FC,2021-04-30,82.7,"Altuve, Jose",514888,642232,field_out,4.0,R,L,...,away,Under,,,,False,27.768284,-82.653961,St. Petersburg,Florida
1,FC,2021-04-30,82.4,"Maldonado, Martín",455117,642232,field_out,8.0,R,L,...,away,Flare/Burner,,,,False,27.768284,-82.653961,St. Petersburg,Florida
2,CH,2021-04-30,83.8,"Kiermaier, Kevin",595281,621121,field_out,5.0,L,R,...,home,Under,,,,False,27.768284,-82.653961,St. Petersburg,Florida
3,FC,2021-04-30,82.7,"Straw, Myles",664702,642232,field_out,13.0,R,L,...,away,Under,,,,False,27.768284,-82.653961,St. Petersburg,Florida
4,SL,2021-04-30,88.0,"Díaz, Yandy",650490,621121,field_out,8.0,R,R,...,home,Barrel,,1.0,,False,27.768284,-82.653961,St. Petersburg,Florida


In [None]:
# Promote the existing row index to an explicit 'id' column so every record
# carries its own identifier as regular data.
# The guard keeps this cell safe to re-run: without it, a second execution
# would insert a fresh 'index' column and rename it to a duplicate 'id'.
if 'id' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'id'})
df.head()

Unnamed: 0,id,pitch_type,game_date,release_speed,player_name,batter,pitcher,events,zone,stand,...,batter_home_away,contact,runner_1b,runner_2b,runner_3b,scoring_play,Latitude,Longitude,park_city,park_state
0,0,FC,2021-04-30,82.7,"Altuve, Jose",514888,642232,field_out,4.0,R,...,away,Under,,,,False,27.768284,-82.653961,St. Petersburg,Florida
1,1,FC,2021-04-30,82.4,"Maldonado, Martín",455117,642232,field_out,8.0,R,...,away,Flare/Burner,,,,False,27.768284,-82.653961,St. Petersburg,Florida
2,2,CH,2021-04-30,83.8,"Kiermaier, Kevin",595281,621121,field_out,5.0,L,...,home,Under,,,,False,27.768284,-82.653961,St. Petersburg,Florida
3,3,FC,2021-04-30,82.7,"Straw, Myles",664702,642232,field_out,13.0,R,...,away,Under,,,,False,27.768284,-82.653961,St. Petersburg,Florida
4,4,SL,2021-04-30,88.0,"Díaz, Yandy",650490,621121,field_out,8.0,R,...,home,Barrel,,1.0,,False,27.768284,-82.653961,St. Petersburg,Florida


In [None]:
# Summarise every column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121707 entries, 0 to 121706
Data columns (total 58 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     121707 non-null  int64  
 1   pitch_type             121654 non-null  object 
 2   game_date              121707 non-null  object 
 3   release_speed          121653 non-null  float64
 4   player_name            121707 non-null  object 
 5   batter                 121707 non-null  int64  
 6   pitcher                121707 non-null  int64  
 7   events                 121707 non-null  object 
 8   zone                   121654 non-null  float64
 9   stand                  121707 non-null  object 
 10  p_throws               121707 non-null  object 
 11  home_team              121707 non-null  object 
 12  away_team              121707 non-null  object 
 13  hit_location           115340 non-null  float64
 14  bb_type                121703 non-nu

## 2. Scaling

### 2.1 Subset data

In [None]:
# Numeric columns carried forward into the scaling step, laid out one
# name-prefix family per line. The order matters: it becomes the column
# order of the subset and of the scaled frame.

columns = [
    'release_speed',
    'pfx_x', 'pfx_z',
    'plate_x', 'plate_z',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'hit_distance_sc',
    'launch_speed', 'launch_angle',
]

In [None]:
# Subset to home runs only, keeping just the numeric columns listed in
# `columns`. Rows and columns are selected in one .loc call rather than by
# chaining df[columns].loc[...], which would first build a full-length copy
# of those columns and only then filter it. The trailing .copy() makes df2
# an independent frame, so changing it later cannot affect (or warn about) df.

df2 = df.loc[df['events'] == 'home_run', columns].copy()

In [None]:
# Preview the home-run subset; the row labels are those of the matching rows in df.
df2.head()

Unnamed: 0,release_speed,pfx_x,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,az,hit_distance_sc,launch_speed,launch_angle
20,77.1,1.42,0.72,0.16,1.99,-8.067947,-111.990784,-1.318431,13.510652,22.445496,-26.092514,405.0,103.0,27.0
56,88.6,0.14,0.82,0.71,2.28,4.478044,-129.057787,-3.210782,0.625105,26.799068,-22.55628,431.0,109.6,35.0
59,89.0,1.05,0.85,0.24,2.55,-3.577797,-129.55196,-4.374543,12.599927,28.755035,-21.928535,382.0,103.9,22.0
105,89.9,1.33,1.07,0.73,2.65,-5.732619,-130.785103,-4.905889,16.517801,29.189072,-19.022148,466.0,112.8,35.0
137,96.2,-1.36,0.97,-0.26,2.61,5.752009,-139.900294,-5.508058,-19.151922,33.500467,-18.35968,417.0,105.4,29.0


### 2.2 Check for (and handle) missing values

In [None]:
# Count the missing values in each column of the home-run subset.
df2.isnull().sum()

release_speed       9
pfx_x               9
pfx_z               9
plate_x             9
plate_z             9
vx0                 9
vy0                 9
vz0                 9
ax                  9
ay                  9
az                  9
hit_distance_sc    11
launch_speed       11
launch_angle       11
dtype: int64

In [None]:
# Keep only the rows in which every selected column is populated; a row with
# at least one NaN is discarded.

df2 = df2.dropna(axis='index', how='any')

In [None]:
# Check how many complete rows (and columns) remain after dropping NaNs.
df2.shape

(5933, 14)

### 2.3 Scaling

In [None]:
# Build the scaler from sklearn.preprocessing. StandardScaler standardises
# each column independently: it subtracts the column mean and divides by the
# column standard deviation, so every scaled variable ends up with mean 0 and
# standard deviation 1. It only centres and rescales; it does not itself
# require the data to be normally distributed.
# The arguments below are the library defaults, written out for clarity.

scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Fit the scaler on the home-run subset and wrap the standardised values in a
# new DataFrame. Column labels are taken from df2; the row labels are NOT
# carried over, so df_scaled gets a fresh 0-based index in the same row order.

df_scaled = pd.DataFrame(data=scaler.fit_transform(df2), columns=df2.columns)
df_scaled.head()

Unnamed: 0,release_speed,pfx_x,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,az,hit_distance_sc,launch_speed,launch_angle
0,-1.977271,1.869974,-0.073341,0.393492,-0.98044,-1.779509,2.009087,0.992534,1.536201,-1.0311,-0.466783,0.158925,-0.323204,-0.326891
1,-0.052922,0.312912,0.071211,1.730794,-0.394123,0.447199,0.034001,0.28412,0.266127,0.067151,-0.051368,1.162874,1.16234,1.199178
2,0.014012,1.419886,0.114577,0.588009,0.151758,-0.982581,-0.023188,-0.151541,1.446435,0.560572,0.022375,-0.729184,-0.120629,-1.280684
3,0.164613,1.760493,0.432592,1.779424,0.353936,-1.365026,-0.165894,-0.350454,1.832603,0.670064,0.363799,2.514343,1.882604,1.199178
4,1.218822,-1.511771,0.28804,-0.62772,0.273065,0.673307,-1.220753,-0.57588,-1.683211,1.757675,0.441622,0.622286,0.216994,0.054626


In [None]:
# Compare the original vs. scaled data: df2 holds the raw (unscaled) values
# and still carries its original row labels (20, 56, 59, ...), whereas
# df_scaled is indexed 0, 1, 2, ... with the rows in the same order.

df2.head()

Unnamed: 0,release_speed,pfx_x,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,az,hit_distance_sc,launch_speed,launch_angle
20,77.1,1.42,0.72,0.16,1.99,-8.067947,-111.990784,-1.318431,13.510652,22.445496,-26.092514,405.0,103.0,27.0
56,88.6,0.14,0.82,0.71,2.28,4.478044,-129.057787,-3.210782,0.625105,26.799068,-22.55628,431.0,109.6,35.0
59,89.0,1.05,0.85,0.24,2.55,-3.577797,-129.55196,-4.374543,12.599927,28.755035,-21.928535,382.0,103.9,22.0
105,89.9,1.33,1.07,0.73,2.65,-5.732619,-130.785103,-4.905889,16.517801,29.189072,-19.022148,466.0,112.8,35.0
137,96.2,-1.36,0.97,-0.26,2.61,5.752009,-139.900294,-5.508058,-19.151922,33.500467,-18.35968,417.0,105.4,29.0


In [None]:
# Save the scaled home-run data as a pickle in the same folder the input was read from.
df_scaled.to_pickle(os.path.join(path, 'baseball_scaled.pkl'))