# Preferred Results Classification - Data Cleaning

### Imports

In [1]:
import pandas as pd
import numpy as np

### Read in Statcast Data

In [2]:
# Load the raw pitch-level export into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="all_raw_uncleaned_pitches.csv")

### Drop All of the Unnecessary Columns

In [3]:
# Feature columns kept from the raw data (one entry per line for easier diffing).
inputs = [
    "player_name",
    "pitch_type",
    "p_throws",
    "release_pos_x",
    "release_pos_z",
    "release_extension",
    "release_speed",
    "effective_speed",
    "release_spin_rate",
    "spin_axis",
    "pfx_x",
    "pfx_z",
    "plate_x",
    "plate_z",
    "batter",
]
# Outcome columns used later to derive the classification label.
outputs = [
    "launch_speed",
    "description",
]

In [4]:
# Keep only the feature and outcome columns, in that order.
df = df.loc[:, [*inputs, *outputs]]

### Drop Rows with NaN Values

In [5]:
# drop_duplicates returns a new DataFrame; the result has to be assigned back,
# otherwise the call has no effect on `df`.
# After that, drop every row that is missing any input feature.
df = df.drop_duplicates().dropna(subset=inputs)

# Remove rows whose outcome columns contradict each other:
#   * description is "hit_into_play" but no launch_speed was recorded
#   * a launch_speed was recorded but the description is missing
# The two conditions cannot both hold for one row, so a single combined
# filter removes the same rows as dropping them one condition at a time.
df = df[
    ~(
        ((df["description"] == "hit_into_play") & df["launch_speed"].isna())
        | (df["description"].isna() & (df["launch_speed"] >= 0.0))
    )
].reset_index(drop=True)

### Feature Engineering

##### Adding Pitch Totals

In [6]:
# Number of pitches thrown by each pitcher.
df.loc[df.index, "total_pitches"] = (
    df.groupby("player_name")["player_name"].transform("count")
)
# Number of pitches of each pitch type thrown by each pitcher.
df.loc[df.index, "pitch_type_total"] = (
    df.groupby(["player_name", "pitch_type"])["pitch_type"].transform("count")
)
# Share of the pitcher's pitches that are of this row's pitch type.
df.loc[df.index, "pitch_type_percentage"] = df["pitch_type_total"].div(df["total_pitches"])
# Largest pitch-type share per pitcher, i.e. the usage of the most-thrown pitch.
# pitch_type_percentage is constant within a (pitcher, pitch type) group, so the
# per-pitcher max can be taken from it directly.
df.loc[df.index, "primary_pitch_percentage"] = (
    df.groupby("player_name")["pitch_type_percentage"].transform("max")
)

##### Adding Batter Metrics

In [7]:
# Batter-level statistics, keyed by the "batter" column.
batter_df = pd.read_csv("stats (2).csv")
# Left join: every pitch row is kept; pitches whose batter has no match in
# batter_df get NaN in the batter columns.
df = pd.merge(df, batter_df, on="batter", how="left")

##### Adds Vertical and Horizontal Approach Angle

- The equation for VAA is: $$ \arctan\left(\frac{\text{Release Position Height} - \text{Plate Height}}{60.5 - \text{Release Extension}}\right) $$
- The equation for HAA is: $$ \arctan\left(\frac{\text{Release Position (X-axis)} - \text{Plate X-axis Location}}{60.5 - \text{Release Extension}}\right) $$

In [8]:
# Approach angles: arctan of the (release - plate) offset over the distance
# (60.5 - release_extension). np.arctan returns the angle in radians.
df = df.assign(
    vertical_approach_angle=lambda frame: np.arctan(
        (frame["release_pos_z"] - frame["plate_z"]) / (60.5 - frame["release_extension"])
    ),
    horizontal_approach_angle=lambda frame: np.arctan(
        (frame["release_pos_x"] - frame["plate_x"]) / (60.5 - frame["release_extension"])
    ),
)

##### Adds the Release Speed, Spin Axis, and Horizontal & Vertical Movement Differences Input Variables
- These variables track the difference in pitch speed, spin axis, and horizontal & vertical movement from the pitcher's current pitch to the pitcher's most thrown pitch.

In [9]:
differences = ["release_speed", "spin_axis", "pfx_x", "pfx_z"]

# True on rows thrown with the pitcher's most-used pitch type.
is_primary_pitch = df["pitch_type_percentage"] == df["primary_pitch_percentage"]

for difference in differences:
    # Mean of this metric for each (pitcher, pitch type) pair.
    pitch_type_mean = df.groupby(["player_name", "pitch_type"])[difference].transform("mean")

    # Broadcast the primary pitch's mean to every row of the pitcher.
    # Non-primary rows are blanked with NaN rather than 0: the groupby max
    # skips NaN, so a negative primary mean (possible for signed columns such
    # as the movement metrics) is kept instead of being replaced by 0.
    primary_mean = (
        pitch_type_mean.where(is_primary_pitch)
        .groupby(df["player_name"])
        .transform("max")
    )

    # Difference between the primary pitch's mean and this pitch's value;
    # defined as exactly 0 on the primary pitch's own rows.
    # NOTE(review): spin_axis looks like an angle, and a plain subtraction does
    # not wrap around - confirm this is the intended definition.
    df["{}_difference".format(difference)] = (primary_mean - df[difference]).mask(is_primary_pitch, 0)

##### Adds the "Preferred Results" Output Variable

In [10]:
# Label starts at 0 for every pitch.
df.loc[df.index, "preferred_results"] = 0
# Flip to 1 for called/swinging strikes and for balls put in play with a
# launch speed below 80.
df.loc[
    df["description"].isin(["called_strike", "swinging_strike", "swinging_strike_blocked"])
    | ((df["description"] == "hit_into_play") & (df["launch_speed"] < 80)),
    "preferred_results",
] = 1

##### One Hot Encodes Handedness and Pitch Type

In [11]:
# Fold the "SC" pitch type into "CH" before encoding.
df["pitch_type"] = df["pitch_type"].replace({"SC": "CH"})
# One-hot encode pitch type and throwing hand; drop_first removes the first
# level of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(data=df, columns=["pitch_type", "p_throws"], drop_first=True)

### Drop Unneeded Columns

In [12]:
# Remove the raw outcome columns, then replace every remaining NaN with -1.
df = df.drop(columns=["launch_speed", "description"]).fillna(-1)

### Store the DataFrame in a Feather File

In [13]:
# Persist the cleaned DataFrame to "cleaned_data.feather" (path relative to the working directory).
df.to_feather("cleaned_data.feather")