In [1]:
## Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm # linear regression two ways
from sklearn.model_selection import train_test_split # model selection
import plotly.graph_objects as go
import plotly.figure_factory as ff
import pylab as p
import mpl_toolkits.mplot3d.axes3d as p3
import plotly.express as px

In [2]:
# import data
pitching_data = pd.read_csv('pitching_statcast_data.csv')

In [3]:
# For Lefties, multiply their horizontal attributes by -1
horizontal_attributes = ['offspeed_avg_break_x', 'fastball_avg_break_x', 'breaking_avg_break_x', 'ff_avg_break_x', 'sl_avg_break_x', 'ch_avg_break_x', 'cu_avg_break_x', 'si_avg_break_x', 'fc_avg_break_x', 'fs_avg_break_x']

def sign_of_break_x(hand):
    
    if hand == 'R':
        return 1
    return -1

for attribute in horizontal_attributes:
    pitching_data[attribute] = pitching_data['pitch_hand'].map(sign_of_break_x) * pitching_data[attribute]

In [15]:
# created a dataframe with just the columns we wanted to use in it 
pd.set_option('mode.chained_assignment', None)

pitchers = pitching_data[['last_name',
                          ' first_name',
                          'year',
                          'offspeed_avg_spin',
                          'offspeed_avg_break_x',
                          'offspeed_avg_break_z',
                          'offspeed_avg_speed',
                          'breaking_avg_spin',
                          'breaking_avg_break_x',
                          'breaking_avg_break_z',
                          'breaking_avg_speed',
                          'fastball_avg_spin',
                          'fastball_avg_break_x',
                          'fastball_avg_break_z',
                          'fastball_avg_speed',
                          'sl_avg_spin',
                          'sl_avg_break_x',
                          'sl_avg_break_z',
                          'sl_avg_speed',
                          'n_sl_formatted',
                          'cu_avg_spin',
                          'cu_avg_break_x',
                          'cu_avg_break_z',
                          'cu_avg_speed',
                          'n_cukc_formatted',
                          'f_strike_percent',
                          'z_swing_miss_percent']]
pitchers['percent_offspeed_thrown'] = pitching_data[['n_offspeed_formatted']]
pitchers['percent_breaking_thrown'] = pitching_data[['n_breaking_formatted']]
pitchers['percent_fastball_thrown'] = pitching_data[['n_fastball_formatted']]
pitchers['delta'] = pitching_data['fastball_avg_speed'] - pitching_data['offspeed_avg_speed']
pitchers = pitchers.rename(columns = {
         
         'offspeed_avg_break_x' : 'offspeed_break_horizontal',
         'offspeed_avg_break_z' : 'offspeed_break_vertical',
         'offspeed_avg_speed' : 'offspeed_speed',
         'offspeed_avg_spin' : 'offspeed_spin',
         'breaking_avg_spin' : 'breaking_spin',
         'breaking_avg_break_x' : 'breaking_break_horizontal',
         'breaking_avg_break_z' : 'breaking_break_vertical',
         'breaking_avg_speed' : 'breaking_speed',
         'fastball_avg_spin' : 'fastball_spin',
         'fastball_avg_break_x' : 'fastball_break_horizontal',
         'fastball_avg_break_z' : 'fastball_break_vertical',
         'fastball_avg_speed' : 'fastball_speed',
         'f_strike_percent' : '1st_pitch_strike_percent',
         'z_contact_percent': 'z_contact_percent',
         'sl_avg_spin' : 'slider_spin',
         'sl_avg_break_x' : 'slider_break_horizontal',
         'sl_avg_break_z' : 'slider_break_vertical',
         'sl_avg_speed' : 'slider_speed',
         'n_sl_formatted' : 'percent_slider_thrown',
         'cu_avg_spin' : 'curveball_spin',
         'cu_avg_break_x' : 'curveball_break_horizontal',
         'cu_avg_break_z' : 'curveball_break_vertical',
         'cu_avg_speed' : 'curveball_speed',
         'n_cukc_formatted' : 'percent_curveball_thrown',
        'z_swing_miss_percent': 'z_contact_percent'
    
        })
pitchers['z_contact_percent'] = 100 - pitchers['z_contact_percent']
pitchers['WHIP'] = (pitching_data['p_total_hits'] + pitching_data['p_walk'])/pitching_data['p_formatted_ip']
pitchers['ERA'] = pitching_data['p_earned_run'] / (pitching_data['p_formatted_ip']/9)

In [5]:
# normalize each attribute
normalized_pitchers = pitchers.copy()
for col_name in pitchers.columns[3:]:
    if (col_name != 'z_contact_percent' and col_name != 'WHIP' and col_name != 'ERA'):
        normalized_pitchers[col_name] = (pitchers[col_name] - pitchers[col_name].min()) / (pitchers[col_name].max() - pitchers[col_name].min()) 

In [6]:
# Final Clean Dataframe
normalized_pitchers.head()

Unnamed: 0,last_name,first_name,year,offspeed_spin,offspeed_break_horizontal,offspeed_break_vertical,offspeed_speed,breaking_spin,breaking_break_horizontal,breaking_break_vertical,...,curveball_speed,percent_curveball_thrown,1st_pitch_strike_percent,z_contact_percent,percent_offspeed_thrown,percent_breaking_thrown,percent_fastball_thrown,delta,WHIP,ERA
0,Goody,Nick,2020,,,,,0.606111,0.527273,0.676471,...,,,0.518605,-20.8,,0.615385,0.383784,,2.0,9.0
1,Widener,Taylor,2020,0.521635,0.245283,0.65666,0.723602,0.615739,0.478788,0.716263,...,,,0.565116,-28.7,0.168269,0.224269,0.622703,0.368794,1.3,4.5
2,Urquidy,Jose,2020,0.572115,0.226415,0.669794,0.661491,0.708665,0.715152,0.456747,...,0.570248,0.196013,0.688372,-13.1,0.246394,0.268689,0.508108,0.390071,1.027397,2.773973
3,Tropeano,Nick,2020,0.459615,0.347709,0.493433,0.559006,0.521557,0.345455,0.583045,...,,,0.469767,-17.3,0.401442,0.383532,0.252973,0.421986,1.184211,1.184211
4,Thorpe,Lewis,2020,0.396635,0.342318,0.64728,0.599379,0.532022,0.515152,0.574394,...,0.451791,0.139535,0.674419,-14.2,0.167067,0.417118,0.431351,0.340426,2.111801,6.149068
