In [4]:
# Standard library
import sys
import os
import warnings
# Third-party
# NOTE(review): pd / np are not referenced by any cell visible in this notebook.
import pandas as pd
import numpy as np
# Suppress FutureWarning messages for every cell that runs after this one
warnings.filterwarnings("ignore", category=FutureWarning)

# Make the project's helper modules importable. '../src' is resolved against the
# current working directory, so this assumes the kernel was started in the
# notebook's own folder.
sys.path.append(os.path.abspath('../src'))

# Cleaning helpers from the project's data_cleaning module (expected under ../src)
from data_cleaning import combine_pitcher_data, drop_null_columns, handle_missing_efficient, rename_columns, combine_pitcher_data_testing


In [None]:
# Build the working frame for the first (2021-24) half of this notebook.
# Per the original note, the helper combines all csv files from the raw/pitcher
# directory (its source is not visible from this notebook).
test_data = combine_pitcher_data()

In [3]:
# Check dimensions of the combined data as (rows, columns)
test_data.shape

(3073582, 113)

In [4]:
# List every column that holds no data at all (null in every row)
missing_cols = (
    test_data
    .isna()                           # True wherever a value is missing
    .all(axis=0)                      # per column: True only if every row is missing
    .loc[lambda is_empty: is_empty]   # keep just the entirely-null columns
    .index
    .tolist()
)
missing_cols

['spin_dir',
 'spin_rate_deprecated',
 'break_angle_deprecated',
 'break_length_deprecated',
 'tfs_deprecated',
 'tfs_zulu_deprecated',
 'umpire',
 'sv_id']

In [5]:
# Drop columns via the project helper, then re-check the dimensions.
# NOTE(review): the original note says "completely null" columns are dropped, but the
# preceding cell lists 8 all-null columns while the column count falls from 113 to 103
# (10 removed) - confirm what else drop_null_columns removes.
# test_data is rebound here, so re-running the cell above afterwards inspects the
# already-reduced frame rather than the raw one.
test_data = drop_null_columns(test_data)
test_data.shape

(3073582, 103)

In [None]:
# Optional checkpoint (currently disabled): uncomment to save the combined,
# null-column-dropped 2021-24 frame to a csv file before missing values are handled.
# test_data.to_csv('../data/processed/pitcher_combined_21-24.csv', index=False)

In [6]:
# Run the project's missing-value handler on the reduced frame
fill_missing = handle_missing_efficient(test_data)

# Count nulls per column and display only the columns that still have any;
# an empty Series means nothing is left unfilled.
missing_values = fill_missing.isna().sum()
missing_values.loc[missing_values.gt(0)]

Series([], dtype: int64)

In [7]:
# Apply the project's column-renaming helper to the filled frame; the result is
# the final cleaned 2021-24 dataset that gets written to disk below.
cleaned_df = rename_columns(fill_missing)

In [8]:
# Write the cleaned 2021-24 frame to disk.
# to_csv does not create missing folders, so make sure the target directory exists
# first; otherwise a fresh checkout without data/processed fails on this cell.
cleaned_path_21_24 = '../data/processed/pitcher_cleaned_21-24.csv'
os.makedirs(os.path.dirname(cleaned_path_21_24), exist_ok=True)
# index=False: do not write the DataFrame's row index as an extra column
cleaned_df.to_csv(cleaned_path_21_24, index=False)

In [9]:
# Sanity check: filling and renaming should leave the dimensions unchanged
# (same rows/columns as right after the null-column drop).
cleaned_df.shape

(3073582, 103)

In [10]:
# Preview the first five rows of the cleaned frame.
# NOTE(review): `events` shows the literal string 'N/a' on some rows - presumably a
# fill value written by handle_missing_efficient; confirm downstream code expects it.
cleaned_df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,zone,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,FF,2021-04-30,95.1,-2.62,6.42,"Alcala, Jorge",656811,660896,field_out,6.0,...,1,3,3.0,4.0,2.0,1.0,1.15,0.75,-0.75,40.6
1,FF,2021-04-30,96.3,-2.58,6.5,"Alcala, Jorge",593643,660896,field_out,2.0,...,1,0,3.0,2.0,2.0,1.0,1.07,0.76,0.76,43.0
2,FF,2021-04-30,94.9,-2.58,6.44,"Alcala, Jorge",593643,660896,N/a,12.0,...,1,0,3.0,2.0,2.0,1.0,1.19,0.87,0.87,42.0
3,FF,2021-04-30,94.7,-2.68,6.38,"Alcala, Jorge",593643,660896,N/a,9.0,...,1,0,3.0,2.0,2.0,1.0,1.23,0.98,0.98,39.9
4,FF,2021-04-30,93.8,-2.71,6.3,"Alcala, Jorge",595956,660896,field_out,9.0,...,1,0,3.0,4.0,2.0,1.0,1.31,0.77,0.77,39.8


# 2025 season: same cleaning pipeline as above

In [5]:
# Build the 2025 working frame with the project's "testing" loader (its source is
# not visible from this notebook) and check its dimensions as (rows, columns).
test_data_2025 = combine_pitcher_data_testing()
test_data_2025.shape

(173056, 113)

In [6]:
# List every 2025 column that holds no data at all (null in every row)
missing_cols = (
    test_data_2025
    .isna()                           # True wherever a value is missing
    .all(axis=0)                      # per column: True only if every row is missing
    .loc[lambda is_empty: is_empty]   # keep just the entirely-null columns
    .index
    .tolist()
)
missing_cols

['spin_dir',
 'spin_rate_deprecated',
 'break_angle_deprecated',
 'break_length_deprecated',
 'tfs_deprecated',
 'tfs_zulu_deprecated',
 'umpire',
 'sv_id']

In [7]:
# Drop columns via the project helper, then re-check the dimensions.
# NOTE(review): the original note says "completely null" columns are dropped, but the
# preceding cell lists 8 all-null columns while the column count falls from 113 to 103
# (10 removed) - confirm what else drop_null_columns removes.
# test_data_2025 is rebound here, so re-running the cell above afterwards inspects
# the already-reduced frame rather than the raw one.
test_data_2025 = drop_null_columns(test_data_2025)
test_data_2025.shape

(173056, 103)

In [None]:
# Optional checkpoint (currently disabled): uncomment to save the combined,
# null-column-dropped 2025 frame to a csv file before missing values are handled.
# Fixed a stale copy-paste: the disabled line previously referenced `test_data` and the
# 21-24 path, which would have written to the 21-24 combined file if uncommented.
# test_data_2025.to_csv('../data/processed/pitcher_combined_2025.csv', index=False)

In [8]:
# Run the project's missing-value handler on the reduced 2025 frame
fill_missing_2025 = handle_missing_efficient(test_data_2025)

# Count nulls per column and display only the columns that still have any;
# an empty Series means nothing is left unfilled.
missing_values_2025 = fill_missing_2025.isna().sum()
missing_values_2025.loc[missing_values_2025.gt(0)]

Series([], dtype: int64)

In [9]:
# Apply the project's column-renaming helper to the filled 2025 frame; the result
# is the final cleaned 2025 dataset that gets written to disk below.
cleaned_df_2025 = rename_columns(fill_missing_2025)

In [10]:
# Write the cleaned 2025 frame to disk.
# to_csv does not create missing folders, so make sure the target directory exists
# first; otherwise a fresh checkout without data/processed fails on this cell.
cleaned_path_2025 = '../data/processed/pitcher_cleaned_2025.csv'
os.makedirs(os.path.dirname(cleaned_path_2025), exist_ok=True)
# index=False: do not write the DataFrame's row index as an extra column
cleaned_df_2025.to_csv(cleaned_path_2025, index=False)

In [11]:
# Sanity check: filling and renaming should leave the dimensions unchanged
# (same rows/columns as right after the null-column drop).
cleaned_df_2025.shape

(173056, 103)

In [12]:
# Preview the first five rows of the cleaned 2025 frame.
# NOTE(review): in this preview arm_angle and the four *_days_* columns are 0.0 on
# every displayed row - presumably fill values from handle_missing_efficient rather
# than real measurements; confirm before using them as features.
cleaned_df_2025.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,zone,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,ST,2025-04-29,75.1,2.69,5.01,"Cosgrove, Tom",643396,676680,field_out,13.0,...,1,3,0.0,0.0,0.0,0.0,3.97,-1.19,1.19,0.0
1,SI,2025-04-29,85.1,2.21,5.25,"Cosgrove, Tom",643396,676680,N/a,3.0,...,1,3,0.0,0.0,0.0,0.0,2.61,1.01,-1.01,0.0
2,ST,2025-04-29,76.1,2.65,4.95,"Cosgrove, Tom",643396,676680,N/a,13.0,...,1,3,0.0,0.0,0.0,0.0,3.94,-1.36,1.36,0.0
3,ST,2025-04-29,76.5,2.53,5.06,"Cosgrove, Tom",643396,676680,N/a,2.0,...,1,3,0.0,0.0,0.0,0.0,3.73,-1.18,1.18,0.0
4,FF,2025-04-29,89.8,2.26,5.4,"Cosgrove, Tom",669707,676680,field_out,5.0,...,1,3,0.0,0.0,0.0,0.0,1.65,0.56,-0.56,0.0
