In [5]:
# Create PA Outcome Scaler from Baseball Simulator Data
# This notebook recreates the pa_outcome_scaler.joblib file using your data processing pipeline

import logging
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import StandardScaler
import joblib
from pathlib import Path

import sys
import pathlib

sys.path.append(str(pathlib.Path(pathlib.Path.cwd()).parent))

# Import your modules (adjust paths as needed)
import baseball_simulator.config as config
import baseball_simulator.data_processor as data_processor
import baseball_simulator.storage as storage
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("Starting PA Outcome Scaler Creation Process...")


Starting PA Outcome Scaler Creation Process...


In [6]:
# Step 1: Load the historical PA data with helpers
print("\n=== Step 1: Loading Historical PA Data ===")
try:
    # Load the processed historical data
    historical_pa_helpers_path = f"{config.BASE_FILE_PATH}historical_pa_data_with_helpers.parquet"
    df_historical = pl.read_parquet(historical_pa_helpers_path)
    print(f"Loaded {df_historical.shape[0]} historical PA records")
    print(f"Columns available: {df_historical.columns}")
except Exception as e:
    print(f"Error loading historical data: {e}")
    print("Please ensure the historical_pa_data_with_helpers.parquet file exists")
    raise


=== Step 1: Loading Historical PA Data ===
Loaded 6602 historical PA records
Columns available: ['game_pk', 'at_bat_number', 'pitch_number', 'batter', 'pitcher', 'events', 'stand', 'p_throws', 'inning_topbot', 'home_team', 'away_team', 'game_date', 'game_type', 'bb_type', 'balls', 'strikes', 'outs_when_up', 'inning', 'game_year', 'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'pa_outcome_category', 'is_pa', 'is_ab', 'is_hit', 'is_k', 'is_bb', 'is_hbp', 'is_1b', 'is_2b', 'is_3b', 'is_hr', 'is_out']


In [7]:
# Step 2: Process the data to get daily stats (like in your pipeline)
print("\n=== Step 2: Processing Data for Daily Stats ===")

# Calculate daily totals
print("Calculating batter daily totals...")
df_batter_daily = data_processor.calculate_batter_daily_totals(df_historical)

print("Calculating pitcher daily totals...")
df_pitcher_daily = data_processor.calculate_pitcher_daily_totals(df_historical)

# Calculate cumulative stats
print("Calculating cumulative batter stats...")
df_batter_daily = data_processor.calculate_cumulative_batter_stats(df_batter_daily)

print("Calculating cumulative pitcher stats...")
df_pitcher_daily = data_processor.calculate_cumulative_pitcher_stats(df_pitcher_daily)

# Apply ballast and calculate final rolling stats
print("Applying ballast and calculating final rolling stats...")
df_batter_daily_final = data_processor.calculate_ballasted_batter_stats(
    df_batter_daily,
    lg_avgs=config.LEAGUE_AVG_RATES,
    ballast_weights=config.BALLAST_WEIGHTS
)

df_pitcher_daily_final = data_processor.calculate_ballasted_pitcher_stats(
    df_pitcher_daily,
    lg_avgs=config.LEAGUE_AVG_RATES,
    ballast_weights=config.BALLAST_WEIGHTS
)

print(f"Processed batter stats shape: {df_batter_daily_final.shape}")
print(f"Processed pitcher stats shape: {df_pitcher_daily_final.shape}")


=== Step 2: Processing Data for Daily Stats ===
Calculating batter daily totals...
Calculating pitcher daily totals...
Calculating cumulative batter stats...
Calculating cumulative pitcher stats...
Applying ballast and calculating final rolling stats...
Processed batter stats shape: (1778, 31)
Processed pitcher stats shape: (754, 31)


In [8]:
# Step 3: Join the stats back to the main dataframe
print("\n=== Step 3: Joining Stats Back to Main DataFrame ===")

# Get relevant columns
batter_cols = data_processor.get_cols_to_join(df_batter_daily_final, "batter")
pitcher_cols = data_processor.get_cols_to_join(df_pitcher_daily_final, "pitcher")

print(f"Batter columns to join: {batter_cols}")
print(f"Pitcher columns to join: {pitcher_cols}")

# Filter dataframes to only include relevant columns
batter_stats_to_join = data_processor.select_subset_of_cols(
    df_batter_daily_final, "batter", batter_cols
)
pitcher_stats_to_join = data_processor.select_subset_of_cols(
    df_pitcher_daily_final, "pitcher", pitcher_cols
)

# Join everything together
print("Joining daily stats back to original dataframe...")
main_df = data_processor.join_together_final_df(
    df_historical, 
    batter_stats_to_join, 
    pitcher_stats_to_join
)

print(f"Final joined dataframe shape: {main_df.shape}")
print(f"Final columns: {main_df.columns}")


=== Step 3: Joining Stats Back to Main DataFrame ===
Batter columns to join: ['batter_avg_daily_input', 'batter_k_pct_daily_input', 'batter_bb_pct_daily_input', 'batter_hbp_pct_daily_input', 'batter_1b_pct_daily_input', 'batter_2b_pct_daily_input', 'batter_3b_pct_daily_input', 'batter_hr_pct_daily_input', 'batter_non_k_out_pct_daily_input']
Pitcher columns to join: ['pitcher_avg_a_daily_input', 'pitcher_k_pct_a_daily_input', 'pitcher_bb_pct_a_daily_input', 'pitcher_hbp_pct_a_daily_input', 'pitcher_1b_pct_a_daily_input', 'pitcher_2b_pct_a_daily_input', 'pitcher_3b_pct_a_daily_input', 'pitcher_hr_pct_a_daily_input', 'pitcher_non_k_out_pct_a_daily_input']
Joining daily stats back to original dataframe...
Final joined dataframe shape: (6602, 59)
Final columns: ['game_pk', 'at_bat_number', 'pitch_number', 'batter', 'pitcher', 'events', 'stand', 'p_throws', 'inning_topbot', 'home_team', 'away_team', 'game_date', 'game_type', 'bb_type', 'balls', 'strikes', 'outs_when_up', 'inning', 'game_yea

In [9]:
# Step 4: Load additional context data
print("\n=== Step 4: Loading Context Data ===")

# Load park factors and defensive stats
try:
    park_factors_df = storage.load_dataframe("park_factors.parquet")
    player_defense_df = storage.load_dataframe("defensive_stats.parquet")
    
    if park_factors_df is None or player_defense_df is None:
        print("Warning: Park factors or defensive stats not found. Using defaults.")
        # Create minimal context data
        park_factors_df = pl.DataFrame({
            "venue_id": [1],
            "year": [2024],
            "park_factor": [100.0]
        })
        player_defense_df = pl.DataFrame({
            "player_id": [1],
            "year": [2024],
            "cumulative_oaa_prior": [0.0]
        })
    
    print(f"Park factors shape: {park_factors_df.shape}")
    print(f"Defensive stats shape: {player_defense_df.shape}")
    
except Exception as e:
    print(f"Error loading context data: {e}")
    print("Creating default context data...")
    
    # Create minimal context data
    park_factors_df = pl.DataFrame({
        "venue_id": [1],
        "year": [2024], 
        "park_factor": [100.0]
    })
    player_defense_df = pl.DataFrame({
        "player_id": [1],
        "year": [2024],
        "cumulative_oaa_prior": [0.0]
    })

2025-06-03 10:30:26,670 - INFO - storage - Attempting to load DataFrame from: ..\clean_data\park_factors.parquet
2025-06-03 10:30:26,679 - INFO - storage - Successfully loaded DataFrame from ..\clean_data\park_factors.parquet
2025-06-03 10:30:26,680 - INFO - storage - Attempting to load DataFrame from: ..\clean_data\defensive_stats.parquet
2025-06-03 10:30:26,689 - INFO - storage - Successfully loaded DataFrame from ..\clean_data\defensive_stats.parquet



=== Step 4: Loading Context Data ===
Park factors shape: (245, 6)
Defensive stats shape: (1305, 7)


In [10]:
# Step 5: Extract features for scaling
print("\n=== Step 5: Extracting Features for Scaler ===")

# Filter for complete records (no nulls in predictor columns)
print("Filtering for complete records...")

# Check which predictor columns exist in the dataframe
available_predictors = [col for col in config.PREDICTOR_COLS if col in main_df.columns]
missing_predictors = [col for col in config.PREDICTOR_COLS if col not in main_df.columns]

print(f"Available predictor columns ({len(available_predictors)}): {available_predictors}")
if missing_predictors:
    print(f"Missing predictor columns ({len(missing_predictors)}): {missing_predictors}")

# Filter for non-null values in available predictors
complete_records = main_df.filter(
    pl.all_horizontal([pl.col(col).is_not_null() for col in available_predictors])
)

print(f"Complete records: {complete_records.shape[0]} out of {main_df.shape[0]}")

# Extract continuous features that need scaling
continuous_cols_available = [col for col in config.CONTINUOUS_COLS if col in complete_records.columns]
print(f"Available continuous columns for scaling: {continuous_cols_available}")

if not continuous_cols_available:
    raise ValueError("No continuous columns available for scaling!")

# Convert to pandas for sklearn compatibility
continuous_data = complete_records.select(continuous_cols_available).to_pandas()

print(f"Continuous data shape for scaling: {continuous_data.shape}")
print(f"Sample statistics:")
print(continuous_data.describe())


=== Step 5: Extracting Features for Scaler ===
Filtering for complete records...
Available predictor columns (18): ['is_platoon_adv', 'is_batter_home', 'pitcher_k_pct_a_daily_input', 'pitcher_bb_pct_a_daily_input', 'pitcher_hbp_pct_a_daily_input', 'pitcher_1b_pct_a_daily_input', 'pitcher_2b_pct_a_daily_input', 'pitcher_3b_pct_a_daily_input', 'pitcher_hr_pct_a_daily_input', 'pitcher_non_k_out_pct_a_daily_input', 'batter_k_pct_daily_input', 'batter_bb_pct_daily_input', 'batter_hbp_pct_daily_input', 'batter_1b_pct_daily_input', 'batter_2b_pct_daily_input', 'batter_3b_pct_daily_input', 'batter_hr_pct_daily_input', 'batter_non_k_out_pct_daily_input']
Missing predictor columns (2): ['team_defense_oaa_input', 'park_factor_input']
Complete records: 6602 out of 6602
Available continuous columns for scaling: ['pitcher_k_pct_a_daily_input', 'pitcher_bb_pct_a_daily_input', 'pitcher_hbp_pct_a_daily_input', 'pitcher_1b_pct_a_daily_input', 'pitcher_2b_pct_a_daily_input', 'pitcher_3b_pct_a_daily_inpu

In [11]:
# Step 6: Create and fit the scaler
print("\n=== Step 6: Creating and Fitting Scaler ===")

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the continuous features
print("Fitting scaler on continuous features...")
scaler.fit(continuous_data)

print(f"Scaler fitted on {scaler.n_features_in_} features")
print(f"Feature names: {continuous_cols_available}")
print(f"Scaler means: {scaler.mean_}")
print(f"Scaler scales: {scaler.scale_}")


=== Step 6: Creating and Fitting Scaler ===
Fitting scaler on continuous features...
Scaler fitted on 16 features
Feature names: ['pitcher_k_pct_a_daily_input', 'pitcher_bb_pct_a_daily_input', 'pitcher_hbp_pct_a_daily_input', 'pitcher_1b_pct_a_daily_input', 'pitcher_2b_pct_a_daily_input', 'pitcher_3b_pct_a_daily_input', 'pitcher_hr_pct_a_daily_input', 'pitcher_non_k_out_pct_a_daily_input', 'batter_k_pct_daily_input', 'batter_bb_pct_daily_input', 'batter_hbp_pct_daily_input', 'batter_1b_pct_daily_input', 'batter_2b_pct_daily_input', 'batter_3b_pct_daily_input', 'batter_hr_pct_daily_input', 'batter_non_k_out_pct_daily_input']
Scaler means: [0.22749002 0.0811467  0.01146754 0.14016253 0.04339129 0.00358652
 0.03077919 0.46197621 0.22596394 0.08090378 0.01146677 0.14038083
 0.04347932 0.00359581 0.03066114 0.4635484 ]
Scaler scales: [0.01957183 0.00519156 0.00059544 0.00179956 0.00048111 0.00015615
 0.00052788 0.01905089 0.01906724 0.00747773 0.00146211 0.00372492
 0.00043823 0.00013743 0

In [12]:
# Step 7: Validate the scaler
print("\n=== Step 7: Validating Scaler ===")

# Transform a sample to verify it works
sample_data = continuous_data.head(100)
scaled_sample = scaler.transform(sample_data)

print(f"Original sample mean: {sample_data.mean().values}")
print(f"Original sample std: {sample_data.std().values}")
print(f"Scaled sample mean: {scaled_sample.mean(axis=0)}")
print(f"Scaled sample std: {scaled_sample.std(axis=0)}")

# Check that scaled data has mean ~0 and std ~1
if np.allclose(scaled_sample.mean(axis=0), 0, atol=0.1) and np.allclose(scaled_sample.std(axis=0), 1, atol=0.1):
    print("✅ Scaler validation passed!")
else:
    print("⚠️ Scaler validation warning: scaled data doesn't have expected mean=0, std=1")


=== Step 7: Validating Scaler ===
Original sample mean: [0.22422898 0.08150506 0.01145745 0.14002373 0.04339922 0.00360153
 0.03084647 0.46493756 0.23124658 0.08164525 0.01152847 0.13988365
 0.04347314 0.00359445 0.03039332 0.45823515]
Original sample std: [0.01857172 0.00517772 0.00054216 0.00170021 0.00046254 0.0001743
 0.00050426 0.01827484 0.02033855 0.00935148 0.00151079 0.00330594
 0.0004698  0.00013451 0.00273265 0.02126235]
Scaled sample mean: [-0.16661926  0.06902758 -0.01695018 -0.07713022  0.01647445  0.09618859
  0.12746368  0.15544399  0.27705314  0.09915699  0.04219674 -0.13347466
 -0.01411788 -0.0099025  -0.0850315  -0.28316982]
Scaled sample std: [0.94414428 0.9923364  0.90594937 0.94005678 0.9565816  1.11065041
 0.95047889 0.95445625 1.06132847 1.24430849 1.02811679 0.88307174
 1.06665608 0.97383187 0.8632451  1.12749713]


In [None]:
# Step 8: Save the scaler
print("\n=== Step 8: Saving Scaler ===")

# Create output path
output_path = "baseball_simulator/pa_outcome_scaler.joblib"

# Save the scaler
joblib.dump(scaler, output_path)
print(f"✅ Scaler saved to: {output_path}")

In [None]:
# Step 9: Test loading the saved scaler
print("\n=== Step 9: Testing Saved Scaler ===")

# Load the scaler back and test it
loaded_scaler = joblib.load(output_path)
test_transform = loaded_scaler.transform(sample_data.head(5))

print(f"✅ Successfully loaded and tested scaler from {output_path}")
print(f"Test transform shape: {test_transform.shape}")

In [None]:
# Step 10: Summary
print("\n=== Summary ===")
print(f"✅ Successfully created pa_outcome_scaler.joblib")
print(f"📍 Location: {output_path}")
print(f"📊 Trained on {continuous_data.shape[0]} samples")
print(f"🔢 Features ({len(continuous_cols_available)}): {continuous_cols_available}")
print(f"📈 Feature means: {scaler.mean_.round(4)}")
print(f"📉 Feature scales: {scaler.scale_.round(4)}")

print("\n🎉 Scaler creation process completed successfully!")