<a href="https://colab.research.google.com/github/skbetz54/MLB_Statcast_Project/blob/main/Statcast_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Statcast Project - Data Cleaning

The purpose of this Colab notebook is to provide a separate notebook for the necessary cleaning of the dataset used in the "Statcast - Classification" notebook. The raw files imported into the "Statcast - Classification" notebook are simply the original CSV files (from the paths in this notebook) after they have been run through all of this notebook's cells.

We will be cleaning two data files: one for the 2019 Baltimore Orioles' season and one for the 2019 Los Angeles Angels' season. 

## **2019 Baltimore Orioles**

In [None]:
import pandas as pd
# Raw GitHub file where the 2019 Orioles data is stored
url = "https://raw.githubusercontent.com/skbetz54/MLB_Statcast_Project/main/Data/baltimore_2019.csv"

df = pd.read_csv(url, header=0)

# Dropping deprecated columns and other columns that hold no relevant information
df = df.drop(columns=['spin_rate_deprecated','break_angle_deprecated','break_length_deprecated','umpire','pitcher.1','fielder_2.1','fielder_3','fielder_4','fielder_5','fielder_6',
              'fielder_7','fielder_8','fielder_9', 'tfs_deprecated','tfs_zulu_deprecated', 'fielder_2','post_away_score','post_home_score','spin_dir',
              "of_fielding_alignment","if_fielding_alignment","post_fld_score","post_bat_score","fld_score","bat_score","on_1b","on_2b","on_3b","game_year","home_team","away_team"])

# The df['events'] outcomes which will remain in the dataframe
events_types = ['single','double','triple','home_run','strikeout','field_out','field_error','fielders_choice_out','force_out','sac_bunt','grounded_into_double_play',
                'strikeout_double_play','sac_fly','double_play','fielders_choice','sac_fly_double_play','other_out']
# The df['events'] outcomes which will be called a hit
hit_types = ['single','double','triple','home_run']

# Keep only rows that have a pitch_type and whose event is one of events_types.
# isin() is False for a null event, so rows from the middle of an at bat (no "event" has occurred yet)
# are removed by the same mask. One boolean mask replaces the earlier loop that dropped rows one at a
# time with inplace=True, which was slow and mutated a filtered slice of the original frame.
keep_rows = df['events'].isin(events_types) & df['pitch_type'].notnull()

# drop=True discards the old row labels instead of adding them back as an "index" column
df = df[keep_rows].reset_index(drop=True)

# Creating an "opp_hand" variable to determine if the batter and pitcher use the same hand
# 0 means batter and pitcher use the same hand (both left or both right), 1 means they use opposite hands.
df['opp_hand'] = (df['stand'] != df['p_throws']).astype('int64')

In [None]:
# Using sklearn.OneHotEncoder to change the "pitch_type" categorical variable to a workable format
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

# Fitting the encoder to our "pitch_type" variable; .toarray() turns the sparse result into a dense array
encoded_pitch_types = enc.fit_transform(df[['pitch_type']]).toarray()

# Column labels are "x0_<pitch type>", the names enc.get_feature_names() used to return.
# That method was removed in scikit-learn 1.2, so the labels are built from enc.categories_
# (one array of categories per encoded column), which works on old and new versions alike.
enc_df = pd.DataFrame(
    encoded_pitch_types,
    index=df.index,  # same row labels as df, so the join below lines up row for row
    columns=['x0_' + str(pitch) for pitch in enc.categories_[0]],
)

# Combining the encoded dataframe back into the original
df = df.join(enc_df)

In [None]:
# Target variable "HitClf": 1 when the row's event is one of hit_types, 0 for every other outcome.
# The membership test is done on the whole column at once; .tolist() keeps hitval a plain list of ints.
hitval = df['events'].isin(hit_types).astype(int).tolist()

df['HitClf'] = hitval

In [None]:
# There are many descriptor columns in the dataset that give no information and cannot be used in modeling.
# This cell keeps only the columns whose dtype is not "object"; every object-dtype column is dropped.
# df itself is overwritten, so the dropped columns are not available to any later cell.
# NOTE(review): presumably this also removes the raw text columns such as 'events' and 'pitch_type' — confirm that is intended.
df = df.select_dtypes(exclude = 'object') 

In [None]:
# Write the cleaned Orioles data to "baltimore_cleaned.csv".
# The previous hard-coded absolute Windows path only works on a machine with that exact folder layout,
# so the target directory is now configurable: set the STATCAST_OUTPUT_DIR environment variable to
# choose it, otherwise the file is written to the current working directory.
# The row index is still written as the first column (to_csv default), as before.
import os
from pathlib import Path

output_dir = Path(os.environ.get('STATCAST_OUTPUT_DIR', '.'))
output_dir.mkdir(parents=True, exist_ok=True)  # create the directory if it does not exist yet
df.to_csv(output_dir / 'baltimore_cleaned.csv')

## **2019 Los Angeles Angels**
This involves the same steps as the Orioles dataset, since both files are obtained from the same source.

In [None]:
import pandas as pd
# Raw GitHub file where the 2019 Angels data is stored
url = "https://raw.githubusercontent.com/skbetz54/MLB_Statcast_Project/main/Data/LA_2019.csv"

df = pd.read_csv(url, header=0)

# Dropping deprecated columns and other columns that hold no relevant information
df = df.drop(columns=['spin_rate_deprecated','break_angle_deprecated','break_length_deprecated','umpire','pitcher.1','fielder_2.1','fielder_3','fielder_4','fielder_5','fielder_6',
              'fielder_7','fielder_8','fielder_9', 'tfs_deprecated','tfs_zulu_deprecated', 'fielder_2','post_away_score','post_home_score','spin_dir',
              "of_fielding_alignment","if_fielding_alignment","post_fld_score","post_bat_score","fld_score","bat_score","on_1b","on_2b","on_3b","game_year","home_team","away_team"])

# The df['events'] outcomes which will remain in the dataframe
events_types = ['single','double','triple','home_run','strikeout','field_out','field_error','fielders_choice_out','force_out','sac_bunt','grounded_into_double_play',
                'strikeout_double_play','sac_fly','double_play','fielders_choice','sac_fly_double_play','other_out']
# The df['events'] outcomes which will be called a hit
hit_types = ['single','double','triple','home_run']

# Keep only rows that have a pitch_type and whose event is one of events_types.
# isin() is False for a null event, so rows from the middle of an at bat (no "event" has occurred yet)
# are removed by the same mask. One boolean mask replaces the earlier loop that dropped rows one at a
# time with inplace=True, which was slow and mutated a filtered slice of the original frame.
keep_rows = df['events'].isin(events_types) & df['pitch_type'].notnull()

# drop=True discards the old row labels instead of adding them back as an "index" column
df = df[keep_rows].reset_index(drop=True)

# Creating an "opp_hand" variable to determine if the batter and pitcher use the same hand
# 0 means batter and pitcher use the same hand (both left or both right), 1 means they use opposite hands.
df['opp_hand'] = (df['stand'] != df['p_throws']).astype('int64')

In [None]:
# Using sklearn.OneHotEncoder to change the "pitch_type" categorical variable to a workable format
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

# Fitting the encoder to our "pitch_type" variable; .toarray() turns the sparse result into a dense array
encoded_pitch_types = enc.fit_transform(df[['pitch_type']]).toarray()

# Column labels are "x0_<pitch type>", the names enc.get_feature_names() used to return.
# That method was removed in scikit-learn 1.2, so the labels are built from enc.categories_
# (one array of categories per encoded column), which works on old and new versions alike.
enc_df = pd.DataFrame(
    encoded_pitch_types,
    index=df.index,  # same row labels as df, so the join below lines up row for row
    columns=['x0_' + str(pitch) for pitch in enc.categories_[0]],
)

# Combining the encoded dataframe back into the original
df = df.join(enc_df)

In [None]:
 # Creating our target variable, "HitClf", which will be 1 if the outcome is a hit and 0 otherwise
# Membership in hit_types is tested on the whole 'events' column at once;
# .tolist() keeps hitval a plain list of 0/1 ints before it is stored as the "HitClf" column.
hitval = df['events'].isin(hit_types).astype(int).tolist()

df['HitClf'] = hitval

In [None]:
# The export of the cleaned Angels data is currently disabled (the call below is commented out).
# NOTE(review): unlike the Orioles section, no cell in this section drops the object-dtype columns
# before this point — confirm whether that step should be applied before re-enabling the export.
# df.to_csv(r'C:\Users\Owner\Documents\Grad School\DATA 602\Final Project\LA_2019_cleaned.csv')