In [1]:
# dependencies
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np

In [2]:
# read in batter dataset
batter = pd.read_csv('data/batter_clean.csv')

# create dummy variables from string values
model_data = pd.get_dummies(batter, columns = ['p_throws', 'pitch_type'])

# drop NA values
model_data.dropna(inplace = True)

# reset index
model_data.reset_index(inplace = True, drop = True)

# set target variable
target = model_data['events']

# drop target variable from model data
model_data.drop(columns = ['events'], inplace = True)

# preview data
model_data.head()

Unnamed: 0,release_pos_x,release_pos_z,release_speed,effective_speed,release_spin_rate,release_extension,plate_x,plate_z,balls,strikes,...,pitch_type_CS,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FS,pitch_type_FT,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL
0,-2.42,3.73,86.2,87.0,2439.0,6.8,0.06,3.04,1,0,...,0,0,0,1,0,0,0,0,0,0
1,-2.15,3.59,86.8,87.2,2209.0,6.8,1.51,2.48,0,0,...,0,0,0,0,0,0,0,0,1,0
2,-2.88,5.37,95.5,96.0,2035.0,6.5,-0.85,2.0,2,2,...,0,0,0,0,0,0,0,0,1,0
3,-2.86,5.38,93.3,93.6,2081.0,6.5,-0.81,1.68,2,2,...,0,0,0,0,0,0,0,0,1,0
4,-2.86,5.3,89.4,90.2,2257.0,6.6,-0.14,3.18,2,2,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# split data into train & test sets, construct model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(model_data, target, random_state = 44)

# run model using grid search parameters
batter_model = RandomForestClassifier(max_depth = 10, min_samples_leaf = 1, 
                                      min_samples_split = 5, n_estimators = 300,
                                      random_state = 44)
batter_model = batter_model.fit(X_train, y_train)
batter_model.score(X_test, y_test)

0.6988210075026795

In [4]:
# read in pitcher dataset
pitcher = pd.read_csv('data/pitcher_clean.csv')

# create dummy variables from string values
sim_data = pd.get_dummies(pitcher, columns = ['p_throws', 'pitch_type'])

# drop NA values
sim_data.dropna(inplace = True)

# reset index
sim_data.reset_index(inplace = True, drop = True)

# drop target variable from sim data
sim_data.drop(columns = ['events'], inplace = True)

# make sure both dataframes have the same features
feature_difference = set(model_data) - set(sim_data)

feature_difference_df = pd.DataFrame(data = np.zeros((sim_data.shape[0], len(feature_difference))),
                                     columns = list(feature_difference))

sim_data = sim_data.join(feature_difference_df)

# preview data
sim_data.head()

Unnamed: 0,release_pos_x,release_pos_z,release_speed,effective_speed,release_spin_rate,release_extension,plate_x,plate_z,balls,strikes,...,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_CS,pitch_type_KN,pitch_type_SL,pitch_type_CU,p_throws_L,pitch_type_FS
0,-3.25,5.58,86.6,86.8,1523.0,6.7,-0.46,1.74,1,1,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,-3.32,5.56,86.2,86.5,1505.0,6.8,-0.85,0.58,0,1,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.94,5.93,95.1,95.0,2406.0,6.8,-0.18,3.34,0,0,...,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.92,5.98,93.6,94.1,2307.0,6.9,-0.42,2.18,1,1,...,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.99,5.97,93.2,93.0,2423.0,6.7,0.24,2.09,1,0,...,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# check class output order for prediction values
print(np.unique(target))

['ball' 'double' 'hit_by_pitch' 'home_run' 'out' 'single' 'strike']


In [6]:
# empty lists to store data
ball = []
double = []
hbp = []
hr = []
out = []
single = []
strike = []

# iterate through predictions, append outcome probabilities to dictionary
for pitch in batter_model.predict_proba(sim_data):
    ball.append(pitch[0])
    double.append(pitch[1])
    hbp.append(pitch[2])
    hr.append(pitch[3])
    out.append(pitch[4])
    single.append(pitch[5])
    strike.append(pitch[6])
    
# assemble df
outcome_probabilities = pd.DataFrame({
    'ball': ball,
    'double': double,
    'hit_by_pitch': hbp,
    'home_run': hr,
    'out': out,
    'single': single,
    'strike': strike,
})

In [7]:
# create weighted pitch value
outcome_probabilities['wPitch'] = (outcome_probabilities['ball'] * 0.075) \
+ (outcome_probabilities['double'] * 1.238) \
+ (outcome_probabilities['hit_by_pitch'] * 0.728) \
+ (outcome_probabilities['home_run'] * 1.979) \
+ (outcome_probabilities['out'] * 0) \
+ (outcome_probabilities['single'] * 0.883) \
+ (outcome_probabilities['strike'] * -0.115)

outcome_probabilities.head(10)

Unnamed: 0,ball,double,hit_by_pitch,home_run,out,single,strike,wPitch
0,0.193368,0.014497,2.8e-05,0.01732,0.129859,0.043281,0.601647,0.035774
1,0.716416,0.001939,1.6e-05,0.003573,0.032539,0.017158,0.22836,0.052103
2,0.14923,0.011431,0.000705,0.006599,0.090849,0.021007,0.720179,-0.025355
3,0.100111,0.030381,0.000126,0.021331,0.158026,0.088799,0.601227,0.096693
4,0.126738,0.012179,0.000744,0.01175,0.069572,0.028719,0.750298,-0.012547
5,0.188778,0.006682,0.0004,0.01744,0.076042,0.018087,0.692571,-0.006439
6,0.143405,0.036407,1.1e-05,0.015694,0.137409,0.056663,0.61041,0.066731
7,0.612152,0.002368,1.6e-05,0.005083,0.055998,0.026526,0.297858,0.048083
8,0.561145,0.001036,1.4e-05,0.00013,0.030045,0.027461,0.38017,0.024163
9,0.121541,0.031215,0.000196,0.022154,0.225397,0.072009,0.527488,0.094668


In [12]:
# summarize data to estimate performance
pitches = len(outcome_probabilities)
balls = outcome_probabilities['ball'].sum()
strikes = outcome_probabilities['strike'].sum()
outs = outcome_probabilities['out'].sum()
singles = outcome_probabilities['single'].sum()
doubles = outcome_probabilities['double'].sum()
homers = outcome_probabilities['home_run'].sum()
avg_pitch = outcome_probabilities['wPitch'].mean()

print(f'Batter vs. Pitcher Simulation: \n\
------------------------------ \n\
Pitches: {pitches} \n\
Balls: {balls} \n\
Strikes: {strikes} \n\
Outs: {outs} \n\
Singles: {singles} \n\
Doubles: {doubles} \n\
Home Runs: {homers} \n\
Average Pitch Value: {avg_pitch}')

Batter vs. Pitcher Simulation: 
------------------------------ 
Pitches: 1880 
Balls: 628.9649558527772 
Strikes: 919.3170120382355 
Outs: 201.29939692212855 
Singles: 79.6608977631609 
Doubles: 26.676357311593165 
Home Runs: 22.866455556046404 
Average Pitch Value: 0.048379786703941934
