In [53]:
import pandas as pd
import numpy
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve


# ! pip install statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf


import warnings
warnings.filterwarnings('ignore')



# 1. Data Cleaning

In [27]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,PID,INNING_KEY,BATTER_IN_INNING_KEY,PITCH_NUMBER,OUT_KEY,BALLS,STRIKES,IS_RUNNER_ON_1B,IS_RUNNER_ON_2B,IS_RUNNER_ON_3B,...,HORIZONTAL_BREAK,SPIN_RATE_ABSOLUTE,RELEASE_SPEED,RELEASE_SIDE,RELEASE_HEIGHT,RELEASE_EXTENSION,HORIZONTAL_APPROACH_ANGLE,VERTICAL_APPROACH_ANGLE,PLATE_X,PLATE_Z
0,197,1,1,1,0,0,0,0,0,0,...,-12.3603,2402.459961,100.764999,-2.42717,6.0855,6.46,1.61299,-5.76003,0.197045,1.91282
1,348,1,2,1,1,0,0,0,0,0,...,-10.9503,2273.459961,100.431,-2.52253,5.97044,6.56,0.631016,-4.6099,-0.943079,2.87744
2,1072,1,2,2,1,1,0,0,0,0,...,4.35335,2314.459961,89.848999,-2.64242,5.8941,6.71,3.33048,-6.76551,0.012025,2.16833
3,420,1,3,1,1,0,0,1,0,0,...,4.27254,2340.459961,89.810997,-2.52429,5.90717,6.55,3.80423,-6.87764,0.584736,2.11319
4,198,1,3,2,1,0,1,1,0,0,...,1.62418,2445.459961,89.014397,-2.68096,6.10905,6.37,2.38521,-5.89879,-0.634335,3.45222


In [28]:
data.shape

(9889, 26)

In [29]:
data.columns

Index(['PID', 'INNING_KEY', 'BATTER_IN_INNING_KEY', 'PITCH_NUMBER', 'OUT_KEY',
       'BALLS', 'STRIKES', 'IS_RUNNER_ON_1B', 'IS_RUNNER_ON_2B',
       'IS_RUNNER_ON_3B', 'PITCHER_KEY', 'THROW_SIDE_KEY',
       'PITCH_TYPE_TRACKED_KEY', 'EVENT_RESULT_KEY', 'PITCH_RESULT_KEY',
       'INDUCED_VERTICAL_BREAK', 'HORIZONTAL_BREAK', 'SPIN_RATE_ABSOLUTE',
       'RELEASE_SPEED', 'RELEASE_SIDE', 'RELEASE_HEIGHT', 'RELEASE_EXTENSION',
       'HORIZONTAL_APPROACH_ANGLE', 'VERTICAL_APPROACH_ANGLE', 'PLATE_X',
       'PLATE_Z'],
      dtype='object')

In [30]:
data.PITCHER_KEY.value_counts()

PITCHER_KEY
668933    1187
671096     776
682227     724
596133     676
668881     516
594902     494
666157     463
571656     434
664747     427
518585     413
664139     401
608371     394
571882     387
622065     334
643361     255
650960     186
570666     184
665665     176
686651     171
674285     148
668984     144
656818     135
594580     106
683175      95
680689      94
611093      91
686730      87
641427      65
592741      42
664028      42
592527      41
621219      40
669270      37
622088      36
691094      34
571912      29
613564      25
Name: count, dtype: int64

In [31]:
data.columns[data.isna().any()]

Index(['EVENT_RESULT_KEY'], dtype='object')

### Data does not have any NA values besides in the column EVENT_RESULT_KEY. Now we attempt to remove outliers or pitches that were not tracked correctly. These will like likely be in the pitch data such as break and spin rate.

In [32]:
data.drop(columns = ['INDUCED_VERTICAL_BREAK', 'HORIZONTAL_BREAK', 'SPIN_RATE_ABSOLUTE','RELEASE_SPEED']).describe()

Unnamed: 0,PID,INNING_KEY,BATTER_IN_INNING_KEY,PITCH_NUMBER,OUT_KEY,BALLS,STRIKES,IS_RUNNER_ON_1B,IS_RUNNER_ON_2B,IS_RUNNER_ON_3B,PITCHER_KEY,RELEASE_SIDE,RELEASE_HEIGHT,RELEASE_EXTENSION,HORIZONTAL_APPROACH_ANGLE,VERTICAL_APPROACH_ANGLE,PLATE_X,PLATE_Z
count,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0,9889.0
mean,12609.406007,5.007786,2.935079,2.878653,0.997674,0.882293,0.87481,0.309839,0.174638,0.102033,637412.410254,-0.668325,5.825911,6.226203,0.768435,-6.415098,0.072584,2.33881
std,7802.242707,2.660815,1.741784,1.714279,0.822356,0.965159,0.822483,0.462451,0.379677,0.302706,44958.635446,1.758907,0.46013,0.56198,2.27013,1.831539,0.822927,0.975443
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,518585.0,-3.30577,4.33006,3.79,-8.81383,-13.8425,-3.58865,-1.89267
25%,6004.0,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,596133.0,-1.80515,5.65929,5.78,-0.463435,-7.7,-0.479877,1.68921
50%,12590.0,5.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,664747.0,-1.31615,5.83533,6.16,0.898488,-6.3654,0.073093,2.34019
75%,19511.0,7.0,4.0,4.0,2.0,2.0,2.0,1.0,0.0,0.0,668933.0,0.550795,6.10905,6.63,2.29852,-5.02905,0.621404,3.00133
max,28755.0,11.0,14.0,14.0,2.0,3.0,2.0,1.0,1.0,1.0,691094.0,4.02639,7.09096,8.04,7.15821,-1.1854,3.98316,6.07933


In [33]:
data[['INDUCED_VERTICAL_BREAK', 'HORIZONTAL_BREAK', 'SPIN_RATE_ABSOLUTE','RELEASE_SPEED']].describe()

Unnamed: 0,INDUCED_VERTICAL_BREAK,HORIZONTAL_BREAK,SPIN_RATE_ABSOLUTE,RELEASE_SPEED
count,9889.0,9889.0,9889.0,9889.0
mean,6.937282,-0.370263,2293.930972,89.144199
std,7.749274,11.163954,377.813504,5.925928
min,-21.982201,-25.363899,524.210999,54.441101
25%,1.59364,-9.9556,2131.350098,84.752502
50%,7.06533,-0.769889,2322.459961,90.093803
75%,13.751,8.78778,2502.469971,93.8209
max,22.7673,26.991199,3314.679932,102.327003


### First looking at release speed, the max value seems accurate but the minimum value of 54 is likely not valueable for our analysis. This is likely a position player pitching and we would like to remove this from our data.

In [34]:
data.RELEASE_SPEED.mean() - 3*data.RELEASE_SPEED.std()

71.36641531699583

In [35]:
data_velo = data[['INNING_KEY','RELEASE_SPEED']]
data_velo[data_velo.RELEASE_SPEED<71]

Unnamed: 0,INNING_KEY,RELEASE_SPEED
1398,9,57.4496
1399,9,62.946301
1400,9,54.7971
1401,9,61.536701
1402,9,59.368599
1403,9,62.2136
1404,9,56.6446
1405,9,59.9897
1406,9,61.851299
1407,9,65.429703


### Using the methodology of considering 3 standard deviations less than the mean as outliers, we find this minimum value to be 71. However, there are some curveballs in the MLB that are thrown below 75 mph such as Zack Greinkes eephus, so we would like to include these pitches. If we change the threshold to 68 mph, we only get pitches that were thrown in the 9th inning which will be position players pitching in blowouts

In [36]:
data_cleaned = data[data.RELEASE_SPEED>65]

### Look at the horizontal and vertical break variables now

### For horizontal break, it will be a bit harder to use the three standard deviation method since the data is not normalized for handedness. For this variable, we will keep all observations. 

### For vertical break, these numbers seem reasonable so we keep them as well. 

### Next, look at the different pitch types

In [37]:
data_cleaned.PITCH_TYPE_TRACKED_KEY.value_counts()

PITCH_TYPE_TRACKED_KEY
FB    3272
SW    1716
CF    1307
CH    1198
SL     909
SI     815
CB     469
SF     173
UN       4
Name: count, dtype: int64

In [38]:
data_cleaned = data_cleaned.loc[~data_cleaned['PITCH_TYPE_TRACKED_KEY'].isin(['UN', 'KN'])]

### Get rid of rows that contain unknown pitches and knuckleballs since they arent very common and we cannot gather enough information about them

# 2. Feature Engineering

### For runners on base, knowing which base is occupied is likely not important. We would just like to know if there is anyone on base or if it is bases empty so that we can know if the pitcher is pitching from the stretch or the wind up

In [39]:
data_cleaned['Runner_On'] = data_cleaned.apply(lambda row: 1 if ((row['IS_RUNNER_ON_1B']==1) or (row['IS_RUNNER_ON_2B']==1) or (row['IS_RUNNER_ON_3B']==1)) else 0, axis = 1)

### Create a new categorical variable that represents the count type (Full, 2 strike, ahead, neutral)

In [40]:
def pitch_category(row):
    if ((row['BALLS'] == 3) and (row['STRIKES'] == 2)):
        return "Full"
    elif (((row['BALLS'] == 2) and (row['STRIKES'] == 0)) or ((row['BALLS'] == 3) and (row['STRIKES'] == 0)) or ((row['BALLS'] == 3) and (row['STRIKES'] == 1))) :
        return "Behind"
    elif (((row['BALLS'] == 0) and (row['STRIKES'] == 2)) or ((row['BALLS'] == 1) and (row['STRIKES'] == 2)) or ((row['BALLS'] == 2) and (row['STRIKES'] == 2))):
        return "2-Strikes"
    else:
        return "Neutral"
    
data_cleaned['Pitch_Category'] = data_cleaned.apply(pitch_category,axis = 1)

In [41]:
data_cleaned.drop(columns = ["BALLS","STRIKES","IS_RUNNER_ON_1B","IS_RUNNER_ON_2B","IS_RUNNER_ON_3B"],inplace=True)

In [42]:
data_cleaned.columns

Index(['PID', 'INNING_KEY', 'BATTER_IN_INNING_KEY', 'PITCH_NUMBER', 'OUT_KEY',
       'PITCHER_KEY', 'THROW_SIDE_KEY', 'PITCH_TYPE_TRACKED_KEY',
       'EVENT_RESULT_KEY', 'PITCH_RESULT_KEY', 'INDUCED_VERTICAL_BREAK',
       'HORIZONTAL_BREAK', 'SPIN_RATE_ABSOLUTE', 'RELEASE_SPEED',
       'RELEASE_SIDE', 'RELEASE_HEIGHT', 'RELEASE_EXTENSION',
       'HORIZONTAL_APPROACH_ANGLE', 'VERTICAL_APPROACH_ANGLE', 'PLATE_X',
       'PLATE_Z', 'Runner_On', 'Pitch_Category'],
      dtype='object')

In [43]:
data_cleaned[['PITCH_TYPE_TRACKED_KEY','THROW_SIDE_KEY','HORIZONTAL_BREAK']].tail(30)

Unnamed: 0,PITCH_TYPE_TRACKED_KEY,THROW_SIDE_KEY,HORIZONTAL_BREAK
9859,FB,R,-7.68155
9860,CH,R,-14.1931
9861,CH,R,-15.329
9862,FB,R,-5.03375
9863,FB,R,-3.96831
9864,CH,R,-16.392401
9865,CF,R,2.83171
9866,CH,R,-14.0531
9867,FB,R,-7.43657
9868,CH,R,-14.5779


### Normalize Horizontal Break so that a negative value means armside run and a positive value means gloveside cut 

In [44]:
def normalize_horizontal_break(row):
    if row["THROW_SIDE_KEY"] == "L":
        return row["HORIZONTAL_BREAK"] * -1
    else:
        return row["HORIZONTAL_BREAK"]
    
data_cleaned["HORIZONTAL_BREAK_NORMALIZED"] = data_cleaned.apply(normalize_horizontal_break,axis=1)

### Create a new variable that is the difference between mean spin rate grouped by pitcher and pitch type and the actual spin rate value. Also create a variable for std based on the same groupings. Do this for spin rate, induced vertical break, and induced horizontal break.

In [45]:
mean_and_std = data_cleaned.groupby(['PITCHER_KEY', 'PITCH_TYPE_TRACKED_KEY'])[['SPIN_RATE_ABSOLUTE','INDUCED_VERTICAL_BREAK', 'HORIZONTAL_BREAK_NORMALIZED']].agg(['mean','std']).reset_index()
mean_and_std.columns = ['_'.join(col).strip('_') for col in mean_and_std.columns.values]
data_cleaned = data_cleaned.merge(mean_and_std, on=['PITCHER_KEY', 'PITCH_TYPE_TRACKED_KEY'])

# # Calculating the difference over the mean spin rate by pitcher and pitch type
data_cleaned['DIFF_MEAN_SPIN_RATE'] = abs(data_cleaned['SPIN_RATE_ABSOLUTE'] - data_cleaned['SPIN_RATE_ABSOLUTE_mean'])

data_cleaned['VERTICAL_BREAK_DIFF'] = abs(data_cleaned['INDUCED_VERTICAL_BREAK'] - data_cleaned['INDUCED_VERTICAL_BREAK_mean'])
data_cleaned['HORIZONTAL_BREAK_DIFF'] = abs(data_cleaned['HORIZONTAL_BREAK_NORMALIZED'] - data_cleaned['HORIZONTAL_BREAK_NORMALIZED_mean'])

# 4. Naive Approach

### The naive approach would be to use this DIFF_MEAN_SPIN_RATE,VERTICAL_BREAK_DIFF, and HORIZONTAL_BREAK_DIFF directly to create a probability value. This can be done by normalizing the values by subtracting each value by the min value and dividing by the difference between min and max value. This will give us a value between 0 and 1 for each of the three differences. If we take the average of all 3 values, we can have our newly crated probabibility variable

In [46]:
min_and_max = data_cleaned.groupby(['PITCHER_KEY', 'PITCH_TYPE_TRACKED_KEY'])[['DIFF_MEAN_SPIN_RATE','VERTICAL_BREAK_DIFF', 'HORIZONTAL_BREAK_DIFF']].agg(['min','max']).reset_index()
min_and_max.columns = ['_'.join(col).strip('_') for col in min_and_max.columns.values]
data_cleaned = data_cleaned.merge(min_and_max, on=['PITCHER_KEY', 'PITCH_TYPE_TRACKED_KEY'])

data_cleaned['NORMAlIZED_SPIN_RATE_DIFF'] = (data_cleaned['DIFF_MEAN_SPIN_RATE'] - data_cleaned['DIFF_MEAN_SPIN_RATE_min']) / (data_cleaned['DIFF_MEAN_SPIN_RATE_max'] - data_cleaned['DIFF_MEAN_SPIN_RATE_min']) 

data_cleaned['NORMAlIZED_VERTICAL_BREAK_DIFF'] = (data_cleaned['VERTICAL_BREAK_DIFF'] - data_cleaned['VERTICAL_BREAK_DIFF_min']) / (data_cleaned['VERTICAL_BREAK_DIFF_max'] - data_cleaned['VERTICAL_BREAK_DIFF_min']) 

data_cleaned['NORMAlIZED_HORIZONTAL_BREAK_DIFF'] = (data_cleaned['HORIZONTAL_BREAK_DIFF'] - data_cleaned['HORIZONTAL_BREAK_DIFF_min']) / (data_cleaned['HORIZONTAL_BREAK_DIFF_max'] - data_cleaned['HORIZONTAL_BREAK_DIFF_min']) 


In [49]:
data_cleaned['NAIVE_PROBABILITY'] = (data_cleaned['NORMAlIZED_SPIN_RATE_DIFF'] + data_cleaned['NORMAlIZED_VERTICAL_BREAK_DIFF'] + data_cleaned['NORMAlIZED_HORIZONTAL_BREAK_DIFF']) / 3

# 5. Define Target Variable

### We will define a pitch affected by humidity as a pitch that has a spin rate that is more than one standard deviation less or greater than than the mean. We include pitches one standard deviation less because less spin rate means the pitcher does not have a good grip on the ball. This loss of grip can be attributted to humidity/the ball being slippery. We include pitches one standard deviation greater than the mean because when the air is humid, the air density is lower and therefore it allows spin rates to be faster according to this article: https://sabr.org/journal/article/how-climate-change-will-affect-baseball/

### This one standard deviation value is arbitrary but since around 68% of data falls within 1 standard deviation of the mean, this gives us a relatively balanced dataset to work with. 

In [50]:
data_cleaned['DEWPOINT_AFFECTED'] = (abs(data_cleaned['DIFF_MEAN_SPIN_RATE']) > data_cleaned['SPIN_RATE_ABSOLUTE_std']).astype(int)
data_cleaned['DEWPOINT_AFFECTED'].value_counts(normalize=True)

DEWPOINT_AFFECTED
0    0.706157
1    0.293843
Name: proportion, dtype: float64

### As mentioned, the dataset is relatively balanced with 70% not being affected by humidity and 30% being affected. 

### We can now test the accuracy of our naive method compared to our newly created DEWPOINT_AFFECTED variable

In [51]:
data_cleaned['NAIVE_PREDICTION'] = data_cleaned['NAIVE_PROBABILITY'].apply(lambda x: 1 if x>0.5 else 0)

In [54]:
accuracy_score(data_cleaned['NAIVE_PREDICTION'],data_cleaned['DEWPOINT_AFFECTED'])

0.7355715589816412

### Accuracy score is not a great way to evaluate any model we choose since the data is imbalanced

### I imagine that the next steps of a project like this would be to see which pitchers are affected most by humidity. This would help with important decisions such as lining up the rotation so pitchers who do worse in humidity do not have to pitch in those conditions or making sure that rookies who are affected by humidity are not called up to pitch in conditions where they are less likely to succeed since that will negatively impact their confidence. In this case it is better to be safe than sorry, so we would like as many pitches as possible that are actually affected by humidity to be classified as such. In this case Recall would be more important and we would not care about false possitives as much. 

### Another next step would be to determine which pitchers struggle in humid conditions in order to help them make adjustments. In this case, we would want to keep false positives low so that we only try to help pitchers who actually need it since making too many adjustments to a pitcher may be problematic. In this case, we would prioritize Precision


In [None]:
def precision_recall_curve(probabilities,actual):
    

# 5. Initial Feature Selection

In [302]:
data_cleaned.columns

Index(['PID', 'INNING_KEY', 'BATTER_IN_INNING_KEY', 'PITCH_NUMBER', 'OUT_KEY',
       'PITCHER_KEY', 'THROW_SIDE_KEY', 'PITCH_TYPE_TRACKED_KEY',
       'EVENT_RESULT_KEY', 'PITCH_RESULT_KEY', 'INDUCED_VERTICAL_BREAK',
       'HORIZONTAL_BREAK', 'SPIN_RATE_ABSOLUTE', 'RELEASE_SPEED',
       'RELEASE_SIDE', 'RELEASE_HEIGHT', 'RELEASE_EXTENSION',
       'HORIZONTAL_APPROACH_ANGLE', 'VERTICAL_APPROACH_ANGLE', 'PLATE_X',
       'PLATE_Z', 'Runner_On', 'Pitch_Category', 'HORIZONTAL_BREAK_NORMALIZED',
       'SPIN_RATE_ABSOLUTE_mean', 'SPIN_RATE_ABSOLUTE_std',
       'INDUCED_VERTICAL_BREAK_mean', 'INDUCED_VERTICAL_BREAK_std',
       'HORIZONTAL_BREAK_NORMALIZED_mean', 'HORIZONTAL_BREAK_NORMALIZED_std',
       'DIFF_MEAN_SPIN_RATE', 'VERTICAL_BREAK_DIFF', 'HORIZONTAL_BREAK_DIFF',
       'DEWPOINT_AFFECTED'],
      dtype='object')

### Ideally here we would have pitcher fatigue to use as a feature. INNING_KEY could possibly be used but in the later innings we do not know if the pitcher is a starter pitching deep into the game or a relief pitcher, so it would be best not to include it. 

### BATTER_IN_INNING_KEY could show pitcher fatigue so we opt to include it, the same can be said for PITCH_NUMBER

### OUT_KEY can be converted to categorical and included

### PITCHER_KEY and PITCH_TYPE_TRACKED_KEY are important since they tell a lot about what the pitch characterstics should be. 

### The remaining variables below can be tested to see which are valuable for predictions

In [303]:
X_all = data_cleaned[['BATTER_IN_INNING_KEY', 'PITCH_NUMBER', 'OUT_KEY', 'PITCHER_KEY', 'THROW_SIDE_KEY', 'PITCH_TYPE_TRACKED_KEY',
                      'PITCH_RESULT_KEY', 'RELEASE_SPEED','RELEASE_SIDE', 'RELEASE_HEIGHT', 'RELEASE_EXTENSION','HORIZONTAL_APPROACH_ANGLE', 'VERTICAL_APPROACH_ANGLE', 'PLATE_X',
                      'PLATE_Z', 'Runner_On', 'Pitch_Category', 'VERTICAL_BREAK_DIFF','HORIZONTAL_BREAK_DIFF']]

y = data_cleaned['DEWPOINT_AFFECTED']

# 5. Encode and Standardize the Data, then run RandomForestClassifier for feature importance

In [304]:
numeric_features = ['BATTER_IN_INNING_KEY', 'PITCH_NUMBER', 'RELEASE_SPEED', 'RELEASE_HEIGHT', 'RELEASE_EXTENSION','HORIZONTAL_APPROACH_ANGLE', 'VERTICAL_APPROACH_ANGLE', 'PLATE_X',
                    'PLATE_Z', 'VERTICAL_BREAK_DIFF','HORIZONTAL_BREAK_DIFF']

categorical_features = ['OUT_KEY','PITCHER_KEY','THROW_SIDE_KEY', 'PITCH_TYPE_TRACKED_KEY', 'PITCH_RESULT_KEY','RELEASE_SIDE','Runner_On', 'Pitch_Category']

# make sure the features we want to be categorical are string data types
for col in categorical_features:
    X_all[col] = X_all[col].astype(str)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.3, random_state=42)

# create the transformer pipeline that uses StandardScaler to standardize the numeric variables and uses one hot encoding
# to encode the categorical variables 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)    

# Crea a pipeline with preprocessor and a RandomForest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', rf)])

# Fit the model
pipeline.fit(X_train, y_train)

In [305]:
# ! pip install --upgrade scikit-learn

feature_importances = pipeline.named_steps['classifier'].feature_importances_

# Gettfeature names from the preprocessor, including all one hot encoded names
ohe_feature_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(input_features=categorical_features)
all_feature_names = numeric_features + list(ohe_feature_names)

# Creating a DataFrame for feature importances
feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_importance_df.head(15)  # Display top 10 important features

Unnamed: 0,Feature,Importance
10,HORIZONTAL_BREAK_DIFF,0.044443
9,VERTICAL_BREAK_DIFF,0.044035
2,RELEASE_SPEED,0.043821
6,VERTICAL_APPROACH_ANGLE,0.043817
5,HORIZONTAL_APPROACH_ANGLE,0.043812
8,PLATE_Z,0.04361
3,RELEASE_HEIGHT,0.042709
7,PLATE_X,0.041864
4,RELEASE_EXTENSION,0.041198
1,PITCH_NUMBER,0.023824


### We will use an importane threshold of 0.04 and include all variables more important than this. 

# 6. Modeling

## Random Forrest Classifier

In [306]:
X = X_all[list(feature_importance_df.Feature)[:9]]

scaler = StandardScaler()

# Fit and transform the data
X_scaled = scaler.fit_transform(X)

X_train_model, X_test_model, y_train_model, y_test_model = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

rf.fit(X_train_model,y_train_model)

In [307]:

y_pred = rf.predict(X_test_model)
accuracy = accuracy_score(y_test_model, y_pred)
accuracy

0.6889790398918187

In [308]:
y_test_model[:20]

8723    0
500     0
5076    0
872     1
8726    0
5559    0
7403    1
5159    0
1078    0
8563    0
1111    0
3379    0
2392    1
9423    0
7529    0
9047    1
5190    0
106     0
7149    0
33      1
Name: DEWPOINT_AFFECTED, dtype: int64

In [309]:
list(y_pred)[:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## Logistic Regression Model

In [310]:

# Create and fit the model
logistic_model = LogisticRegression(random_state = 42)
logistic_model.fit(X_train_model,y_train_model)

# Predict and evaluate
y_pred_logistic = logistic_model.predict(X_test_model)
accuracy_logistic = accuracy_score(y_test_model, y_pred_logistic)
print("Accuracy:", accuracy_logistic)

Accuracy: 0.6987829614604463


### Linear Mixed Model

In [311]:
list(feature_importance_df.Feature)[:9]

['HORIZONTAL_BREAK_DIFF',
 'VERTICAL_BREAK_DIFF',
 'RELEASE_SPEED',
 'VERTICAL_APPROACH_ANGLE',
 'HORIZONTAL_APPROACH_ANGLE',
 'PLATE_Z',
 'RELEASE_HEIGHT',
 'PLATE_X',
 'RELEASE_EXTENSION']

In [314]:


model_formula = "DEWPOINT_AFFECTED ~ HORIZONTAL_BREAK_DIFF + VERTICAL_BREAK_DIFF + RELEASE_SPEED + HORIZONTAL_APPROACH_ANGLE + VERTICAL_APPROACH_ANGLE + RELEASE_HEIGHT"

md = sm.MixedLM.from_formula(model_formula, groups="PITCHER_KEY", data=data_cleaned)
mdf = md.fit()

# Output the summary of themodel
print(mdf.summary())

data_new = data_cleaned.copy() 

data_new['predicted_prob'] = mdf.predict(data_new)

# Converting probabilities to class labels based on a 0.5 threshold
data_new['predicted_label'] = np.where(data_new['predicted_prob'] >= 0.5, 1, 0)

# Calculating accuracy
accuracy = accuracy_score(data_new['DEWPOINT_AFFECTED'], data_new['predicted_label'])
print(f"Accuracy: {accuracy}")

               Mixed Linear Model Regression Results
Model:               MixedLM  Dependent Variable:  DEWPOINT_AFFECTED
No. Observations:    9859     Method:              REML             
No. Groups:          37       Scale:               0.2063           
Min. group size:     4        Log-Likelihood:      -6249.4558       
Max. group size:     1187     Converged:           Yes              
Mean group size:     266.5                                          
--------------------------------------------------------------------
                          Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------------------
Intercept                  0.298    0.153  1.948 0.051 -0.002  0.598
HORIZONTAL_BREAK_DIFF      0.011    0.003  3.875 0.000  0.005  0.016
VERTICAL_BREAK_DIFF        0.010    0.003  3.116 0.002  0.004  0.015
RELEASE_SPEED              0.001    0.001  0.789 0.430 -0.002  0.004
HORIZONTAL_APPROACH_ANGLE -0.003    0.003 -1.035 0

In [313]:
data_new

Unnamed: 0,PID,INNING_KEY,BATTER_IN_INNING_KEY,PITCH_NUMBER,OUT_KEY,PITCHER_KEY,THROW_SIDE_KEY,PITCH_TYPE_TRACKED_KEY,EVENT_RESULT_KEY,PITCH_RESULT_KEY,...,INDUCED_VERTICAL_BREAK_mean,INDUCED_VERTICAL_BREAK_std,HORIZONTAL_BREAK_NORMALIZED_mean,HORIZONTAL_BREAK_NORMALIZED_std,DIFF_MEAN_SPIN_RATE,VERTICAL_BREAK_DIFF,HORIZONTAL_BREAK_DIFF,DEWPOINT_AFFECTED,predicted_prob,predicted_label
0,197,1,1,1,0,668881,R,FB,field_out,InPlay,...,15.591399,1.802321,-11.883276,2.586075,39.861242,1.049199,0.477024,0,0.185046,0
1,348,1,2,1,1,668881,R,FB,,BallCalled,...,15.591399,1.802321,-11.883276,2.586075,89.138758,1.337598,0.932976,1,0.380899,0
2,328,1,3,3,1,668881,R,FB,,FoulBall,...,15.591399,1.802321,-11.883276,2.586075,25.861242,2.051601,3.136424,0,0.123992,0
3,422,1,3,4,1,668881,R,FB,,BallCalled,...,15.591399,1.802321,-11.883276,2.586075,46.861242,2.942802,2.024224,0,0.208423,0
4,1073,1,3,6,1,668881,R,FB,,FoulBall,...,15.591399,1.802321,-11.883276,2.586075,50.138758,2.728200,1.534424,0,0.227638,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9854,21551,4,5,1,1,680689,R,CB,,StrikeCalled,...,-11.159134,4.332970,5.411148,3.496917,78.222222,1.906966,1.144428,0,0.308062,0
9855,24018,4,7,3,2,680689,R,CB,,StrikeCalled,...,-11.159134,4.332970,5.411148,3.496917,40.222222,1.589585,2.602748,0,0.153492,0
9856,22724,5,1,1,0,680689,R,CB,,StrikeCalled,...,-11.159134,4.332970,5.411148,3.496917,59.222222,5.055804,7.398652,0,0.213183,0
9857,28629,5,2,11,1,680689,R,CB,,FoulBall,...,-11.159134,4.332970,5.411148,3.496917,29.777778,0.280066,1.506412,0,0.113965,0


### Conclusions, we can use the results to determine which players are affected by humidity and which are not. One way to do this would be by looking at the distributions based on velocity 

### Assumptions, any change in spin_rate/ movement is directly caused by dewpoint and not a result of a pitcher slipping or other circumstances. 