In [1]:
import polars as pl
import pandas as pd
import numpy as np
import time
import datetime

# Let polars print up to 200 columns and 200 rows when a frame is displayed.
# Each Config setter returns the Config class, so the two calls can be chained.
pl.Config.set_tbl_cols(200).set_tbl_rows(200)

polars.config.Config

## Load Data


In [2]:
# Read the three parquet inputs: the Statcast frame, the cleaned defensive
# stats, and the cleaned park factors. Paths are relative to the notebook.
main_df = pl.read_parquet('../raw_data/ballasted_statcast_data.parquet')
def_df = pl.read_parquet('../clean_data/clean_defensive_stats.parquet')
park_df = pl.read_parquet('../clean_data/clean_park_factors.parquet')

### Filter to the first three innings and keep only the columns needed for modeling

In [3]:
# Keep rows with inning <= 3 and project down to the columns used later in the
# notebook. `game_year` is exposed as `year` so it can serve as a join key.
model_df = main_df.filter(pl.col('inning') <= 3).select(
    # Game / matchup identifiers
    'game_pk', 'batter', 'pitcher', 'is_batter_home', 'home_team', 'game_date',
    pl.col('game_year').alias('year'),
    # fielder_2 through fielder_9 (player ids used for the defense join)
    *[f'fielder_{pos}' for pos in range(2, 10)],
    # Outcome label and platoon flag
    'pa_outcome_category', 'is_platoon_adv',
    # Pitcher daily inputs
    'pitcher_avg_a_daily_input',
    'pitcher_k_pct_a_daily_input', 'pitcher_bb_pct_a_daily_input',
    'pitcher_hbp_pct_a_daily_input', 'pitcher_1b_pct_a_daily_input',
    'pitcher_2b_pct_a_daily_input', 'pitcher_3b_pct_a_daily_input',
    'pitcher_hr_pct_a_daily_input', 'pitcher_non_k_out_pct_a_daily_input',
    # Batter daily inputs
    'batter_avg_daily_input',
    'batter_k_pct_daily_input', 'batter_bb_pct_daily_input',
    'batter_hbp_pct_daily_input', 'batter_1b_pct_daily_input',
    'batter_2b_pct_daily_input', 'batter_3b_pct_daily_input',
    'batter_hr_pct_daily_input', 'batter_non_k_out_pct_daily_input',
)

## Join in Defense

In [4]:
# Trim the defensive table to the join keys (player_id, year) and the single
# value column that gets attached to each fielder.
def_df = def_df.select(
    ['player_id', 'year', 'outs_above_average_per_inning']
)

In [5]:
# One output column per fielder slot: f2_oaa_per_inning ... f9_oaa_per_inning.
oaa_cols_list = [f"f{pos}_oaa_per_inning" for pos in range(2, 10)]

pa_df_with_defense = model_df.clone()
for pos, oaa_col in zip(range(2, 10), oaa_cols_list):
    # Left-join the defensive table on (fielder id, year) for this slot.
    # NOTE(review): if def_df ever holds more than one row per
    # (player_id, year), this join duplicates rows — confirm upstream uniqueness.
    joined = pa_df_with_defense.join(
        def_df,
        how="left",
        left_on=[f"fielder_{pos}", "year"],
        right_on=["player_id", "year"],
    )
    # Give the joined value its slot-specific name; fielders with no matching
    # defensive row get 0 instead of null.
    pa_df_with_defense = (
        joined
        .rename({"outs_above_average_per_inning": oaa_col})
        .with_columns(pl.col(oaa_col).fill_null(0))
    )

# Team defense input is the row-wise sum of the eight per-fielder columns.
pa_df_with_defense = pa_df_with_defense.with_columns(
    team_defense_oaa_input=pl.sum_horizontal(oaa_cols_list)
)

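## Add in Park Factors

Park factors are joined with a one-season lag: the factor recorded for year `Y` is attached to games played in year `Y + 1`.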

In [6]:
# Team abbreviation -> club display name. The abbreviations are matched against
# `home_team`, the display names against `name_display_club` in the park data.
team_mapping = dict(
    BOS='Red Sox',
    MIN='Twins',
    LAD='Dodgers',
    CLE='Guardians',
    SD='Padres',
    KC='Royals',
    CWS='White Sox',
    WSH='Nationals',
    TB='Rays',
    AZ='D-backs',
    MIL='Brewers',
    STL='Cardinals',
    DET='Tigers',
    TEX='Rangers',
    OAK='Athletics',
    ATL='Braves',
    PHI='Phillies',
    CIN='Reds',
    NYY='Yankees',
    SEA='Mariners',
    NYM='Mets',
    LAA='Angels',
    MIA='Marlins',
    SF='Giants',
    PIT='Pirates',
    COL='Rockies',
    HOU='Astros',
    BAL='Orioles',
    TOR='Blue Jays',
    CHC='Cubs',
)

# Two-column lookup frame built from team_mapping: one row per team, pairing
# the club display name with its abbreviation.
mapping_df = pl.DataFrame({
    "team_name": [*team_mapping.values()],
    "team_abbr": [*team_mapping.keys()],
})

In [7]:
# Attach a team abbreviation to each park-factor row by matching the club
# display name, then drop rows whose club name has no entry in mapping_df.
df_with_abbr = park_df.join(
    mapping_df,
    how="left",
    left_on="name_display_club",
    right_on="team_name",
).drop_nulls(subset="team_abbr")

In [8]:
# Build the park-factor lookup keyed by (team_abbr, year_to_join).
# `year_to_join` is the source year plus one, so a factor recorded for year Y
# is attached to games whose `year` is Y + 1.
df_park_factors_final = (
    df_with_abbr
    .select(
        "team_abbr",  # matched against the `home_team` abbreviation
        (pl.col("year") + 1).alias("year_to_join"),
        # Missing factors default to 100.0 (presumably the neutral value — confirm).
        pl.col("park_factor").fill_null(100.0).alias("park_factor_input"),
    )
    .unique()
    .sort("team_abbr", "year_to_join")
)



# Attach the park factor for each home team / year, project down to the
# modeling columns, and keep only the 2023 and 2024 rows. The select omits the
# individual fielder ids and per-fielder OAA columns; only the summed
# team_defense_oaa_input is carried forward.
final_model_df = (
    pa_df_with_defense
    .join(
        df_park_factors_final,
        how="left",
        left_on=["home_team", "year"],            # keys on the PA side
        right_on=["team_abbr", "year_to_join"],   # keys on the park-factor side
    )
    .select(
        # Identifiers and context
        'game_pk', 'batter', 'pitcher', 'home_team', 'is_batter_home',
        'game_date', 'year',
        # Outcome label and platoon flag
        'pa_outcome_category', 'is_platoon_adv',
        # Pitcher daily inputs
        'pitcher_avg_a_daily_input',
        'pitcher_k_pct_a_daily_input', 'pitcher_bb_pct_a_daily_input',
        'pitcher_hbp_pct_a_daily_input', 'pitcher_1b_pct_a_daily_input',
        'pitcher_2b_pct_a_daily_input', 'pitcher_3b_pct_a_daily_input',
        'pitcher_hr_pct_a_daily_input', 'pitcher_non_k_out_pct_a_daily_input',
        # Batter daily inputs
        'batter_avg_daily_input',
        'batter_k_pct_daily_input', 'batter_bb_pct_daily_input',
        'batter_hbp_pct_daily_input', 'batter_1b_pct_daily_input',
        'batter_2b_pct_daily_input', 'batter_3b_pct_daily_input',
        'batter_hr_pct_daily_input', 'batter_non_k_out_pct_daily_input',
        # Environment inputs
        'team_defense_oaa_input', 'park_factor_input',
    )
    .filter(pl.col('year').is_in([2023, 2024]))
)

In [9]:
# Persist the assembled modeling table as parquet.
final_model_df.write_parquet('../raw_data/final_model_data.parquet')

In [10]:
# Cell: Import the scaler dependencies (StandardScaler for fitting, joblib for
# saving). The scaler itself is constructed and fitted in a later cell.
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# top import cell so all dependencies are visible in one place.
from sklearn.preprocessing import StandardScaler
import joblib

print("Creating PA Outcome Scaler...")

Creating PA Outcome Scaler...


In [11]:

# Rate stats that exist for both the pitcher and the batter, in the order the
# scaler will see them.
_RATE_STATS = ("k", "bb", "hbp", "1b", "2b", "3b", "hr", "non_k_out")

# Columns fed to the scaler. Order matters: the scaler is fitted on a plain
# NumPy array, so columns are identified purely by position.
CONTINUOUS_COLS = [
    *(f"pitcher_{stat}_pct_a_daily_input" for stat in _RATE_STATS),
    *(f"batter_{stat}_pct_daily_input" for stat in _RATE_STATS),
    "team_defense_oaa_input",
    "park_factor_input",
]

# Echo the list so the exact column order is visible in the notebook output.
print(f"Continuous columns to scale: {len(CONTINUOUS_COLS)}")
print("\n".join(f"  - {col}" for col in CONTINUOUS_COLS))

Continuous columns to scale: 18
  - pitcher_k_pct_a_daily_input
  - pitcher_bb_pct_a_daily_input
  - pitcher_hbp_pct_a_daily_input
  - pitcher_1b_pct_a_daily_input
  - pitcher_2b_pct_a_daily_input
  - pitcher_3b_pct_a_daily_input
  - pitcher_hr_pct_a_daily_input
  - pitcher_non_k_out_pct_a_daily_input
  - batter_k_pct_daily_input
  - batter_bb_pct_daily_input
  - batter_hbp_pct_daily_input
  - batter_1b_pct_daily_input
  - batter_2b_pct_daily_input
  - batter_3b_pct_daily_input
  - batter_hr_pct_daily_input
  - batter_non_k_out_pct_daily_input
  - team_defense_oaa_input
  - park_factor_input


In [15]:
# Cell: Build the feature matrix the scaler is fitted on
# Drop every row that has a null in any of the continuous columns.
complete_records = final_model_df.drop_nulls(subset=CONTINUOUS_COLS)

print(f"Complete records: {complete_records.shape[0]} out of {final_model_df.shape[0]}")

# Convert the continuous columns to a NumPy array; column order follows
# CONTINUOUS_COLS.
continuous_data = complete_records.select(CONTINUOUS_COLS).to_numpy()
print(f"Continuous data shape: {continuous_data.shape}")

Complete records: 186072 out of 186072
Continuous data shape: (186072, 18)


In [16]:
# Cell: Fit the scaler
# `fit` returns the estimator itself, so construction and fitting are chained.
scaler = StandardScaler().fit(continuous_data)

# Report what was learned, one value per entry in CONTINUOUS_COLS.
print(f"✅ Scaler fitted on {scaler.n_features_in_} features")
print(f"Feature names: {CONTINUOUS_COLS}")
print(f"Scaler means: {scaler.mean_}")
print(f"Scaler scales: {scaler.scale_}")

✅ Scaler fitted on 18 features
Feature names: ['pitcher_k_pct_a_daily_input', 'pitcher_bb_pct_a_daily_input', 'pitcher_hbp_pct_a_daily_input', 'pitcher_1b_pct_a_daily_input', 'pitcher_2b_pct_a_daily_input', 'pitcher_3b_pct_a_daily_input', 'pitcher_hr_pct_a_daily_input', 'pitcher_non_k_out_pct_a_daily_input', 'batter_k_pct_daily_input', 'batter_bb_pct_daily_input', 'batter_hbp_pct_daily_input', 'batter_1b_pct_daily_input', 'batter_2b_pct_daily_input', 'batter_3b_pct_daily_input', 'batter_hr_pct_daily_input', 'batter_non_k_out_pct_daily_input', 'team_defense_oaa_input', 'park_factor_input']
Scaler means: [2.40808837e-01 8.34081787e-02 1.16119733e-02 1.39100213e-01
 4.30171619e-02 3.55229409e-03 3.01620115e-02 4.48339330e-01
 2.28338439e-01 8.28743676e-02 1.15886796e-02 1.41599251e-01
 4.39812902e-02 3.68337551e-03 3.18720145e-02 4.56062582e-01
 4.11058935e-03 9.99770895e+01]
Scaler scales: [4.27639109e-02 1.61286486e-02 2.90194866e-03 8.97054946e-03
 2.55876492e-03 6.45479383e-04 2.49247

In [None]:
# Cell: Validate the scaler
# `continuous_data` is a NumPy array (it is built with `.to_numpy()`), so the
# first 1000 rows are taken by slicing and per-feature statistics are computed
# along axis 0. DataFrame-style calls such as `.head()` / `.mean().values`
# do not exist on an ndarray.
sample_data = continuous_data[:1000]
scaled_sample = scaler.transform(sample_data)

print("Validation:")
print(f"Original sample mean: {sample_data.mean(axis=0)}")
print(f"Scaled sample mean: {scaled_sample.mean(axis=0)}")
print(f"Scaled sample std: {scaled_sample.std(axis=0)}")

# The sample is only a slice of the data the scaler was fitted on, so its
# scaled mean/std should be close to (not exactly) 0 and 1; hence the loose
# tolerance of 0.1.
sample_is_centered = np.allclose(scaled_sample.mean(axis=0), 0, atol=0.1)
sample_is_unit_scale = np.allclose(scaled_sample.std(axis=0), 1, atol=0.1)
if sample_is_centered and sample_is_unit_scale:
    print("✅ Scaler validation passed!")
else:
    print("⚠️ Scaler validation warning")

Validation:
Original sample mean: [2.38299334e-01 8.36736782e-02 1.14551050e-02 1.39667682e-01
 4.29537275e-02 3.57531625e-03 3.02073240e-02 4.50167833e-01
 2.29849702e-01 8.30106512e-02 1.15709229e-02 1.41134388e-01
 4.39945933e-02 3.72153470e-03 3.25607672e-02 4.54157441e-01
 4.20261729e-03 1.00098000e+02]
Scaled sample mean: [-0.05868273  0.01646136 -0.05405619  0.0632591  -0.02479102  0.03566677
  0.01817975  0.04323643  0.02941644  0.00635108 -0.00313079 -0.02662414
  0.00469705  0.03584435  0.06361949 -0.03772059  0.006683    0.0363977 ]
Scaled sample std: [1.03258068 0.99640896 0.96160399 1.02337304 0.96412319 0.960351
 1.03449505 1.02433627 1.02887893 1.01133291 0.99312093 1.0195874
 1.03530477 1.0128513  1.02513215 1.03439185 0.96066283 0.95632846]
✅ Scaler validation passed!


In [17]:
# Cell: Save the scaler
# Serialize the fitted scaler with joblib and report where it was written.
output_path = "../baseball_simulator/pa_outcome_scaler.joblib"
joblib.dump(scaler, output_path)
print(f"✅ Scaler saved to: {output_path}")

✅ Scaler saved to: ../baseball_simulator/pa_outcome_scaler.joblib
