In [2]:
import pymc as pm
import arviz as az
import numpy as np
import polars as pl # Or pandas
import joblib
import random
import pytensor.tensor as pt # For softmax if calculating manually
import numpy as np
import random
import polars as pl # Assuming you might use Polars Series sometimes
import pandas as pd # For DataFrame creation in helper
from sklearn.preprocessing import StandardScaler # Needed for helper
import joblib # Needed for helper
import arviz as az # Needed for helper


# Colab-only: expose Google Drive under /content/drive so the model artifacts can be read.
from google.colab import drive
drive.mount('/content/drive')

# Raise polars' table display limits (columns / rows) to 200 for printed frames.
pl.Config.set_tbl_cols(200)
pl.Config.set_tbl_rows(200)

# Directory prefix for every artifact loaded below. It is concatenated directly
# with file names in f-strings, so the trailing slash is required.
FILE_PATH = '/content/drive/My Drive/Betting Models/mlb/hits_model/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Model & Scaler

In [3]:
# Fitted model's InferenceData, read back from the NetCDF file under FILE_PATH.
idata = az.from_netcdf(f"{FILE_PATH}multi_outcome_model.nc")

# Scaler that was fitted during training.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts you produced yourself.
scaler_filename = f"{FILE_PATH}pa_outcome_scaler.joblib"
scaler = joblib.load(scaler_filename)

# Outcome code -> label; the codes are the positions in this list (0..7).
# Must be the same mapping that was used in training.
outcome_labels = dict(enumerate([
    "Out_In_Play", "Single", "Double", "Triple", "HomeRun",
    "Strikeout", "Walk", "HBP",
]))
n_categories = len(outcome_labels) # Should match model dimension

# Binary indicator columns; these are passed to the model unscaled.
categorical_cols = ['is_platoon_adv', 'is_batter_home']

# Every predictor column, names matching the DataFrame columns exactly.
# The average-based inputs ('pitcher_avg_a_daily_input', 'batter_avg_daily_input')
# are currently disabled and therefore not listed.
predictor_cols = [
    *categorical_cols,
    # Pitcher rate stats
    'pitcher_k_pct_a_daily_input',
    'pitcher_bb_pct_a_daily_input',
    'pitcher_hbp_pct_a_daily_input',
    'pitcher_1b_pct_a_daily_input',
    'pitcher_2b_pct_a_daily_input',
    'pitcher_3b_pct_a_daily_input',
    'pitcher_hr_pct_a_daily_input',
    'pitcher_non_k_out_pct_a_daily_input',
    # Batter rate stats
    'batter_k_pct_daily_input',
    'batter_bb_pct_daily_input',
    'batter_hbp_pct_daily_input',
    'batter_1b_pct_daily_input',
    'batter_2b_pct_daily_input',
    'batter_3b_pct_daily_input',
    'batter_hr_pct_daily_input',
    'batter_non_k_out_pct_daily_input',
    # Context
    'team_defense_oaa_input',
    'park_factor_input',
]

# Continuous columns (the ones that get scaled) are everything after the leading
# categorical entries. Their order has to match the column order the scaler was
# fitted with, so keep any additions to predictor_cols in sync with training.
continuous_cols = predictor_cols[len(categorical_cols):]

## Create Function to Predict Probabilities

In [None]:
# --- Helper Function: Predict PA Outcome Probabilities (Using Mean Posterior) ---
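# Strict variant: raises ValueError when the posterior parameter shapes do not match.
# NOTE: the next cell defines a function with the same name, which replaces this one when run.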
def predict_pa_outcome_probs(pa_inputs_dict, idata, scaler, predictor_cols, continuous_cols, categorical_cols, n_categories):
    """
    Outcome probabilities for one plate appearance from posterior-mean parameters (strict variant).

    ``intercepts`` and ``betas`` from ``idata.posterior`` are averaged over the
    ``chain`` and ``draw`` dimensions and used as point estimates in a softmax.

    Args:
        pa_inputs_dict (dict): One value per name in ``predictor_cols``.
        idata: Object exposing ``posterior["intercepts"]`` and ``posterior["betas"]``.
        scaler: Fitted scaler; only ``transform`` (and ``n_features_in_`` for
            diagnostics) is used.
        predictor_cols (list): All predictor names.
        continuous_cols (list): Names passed through ``scaler.transform``; names
            absent from ``predictor_cols`` are ignored.
        categorical_cols (list): Names appended unscaled after the scaled block;
            names absent from ``predictor_cols`` are ignored.
        n_categories (int): Number of outcome categories.

    Returns:
        np.ndarray: Probability vector of shape ``(n_categories,)``.

    Raises:
        KeyError: A predictor is missing from ``pa_inputs_dict``.
        ValueError: The mean intercepts are not ``(n_categories,)`` or the mean
            betas are not ``(n_predictors, n_categories)``.
    """
    # 1. One-row matrix of raw inputs, laid out in predictor_cols order.
    try:
        ordered_values = [pa_inputs_dict[name] for name in predictor_cols]
        row = np.array(ordered_values).reshape(1, -1)
    except KeyError as e:
        print(f"Error: Missing key {e} in pa_inputs_dict")
        print(f"Required keys: {predictor_cols}")
        raise

    # A named frame makes it easy to pull column subsets out by name.
    feature_df = pd.DataFrame(row, columns=predictor_cols)
    known_columns = feature_df.columns

    # 2. Scale the continuous subset with the already-fitted scaler (transform only).
    usable_continuous = [name for name in continuous_cols if name in known_columns]
    raw_continuous = feature_df[usable_continuous].values
    try:
        scaled_block = scaler.transform(raw_continuous)
    except Exception as e:
        print(f"Error scaling data: {e}")
        print(f"Input data shape for scaling: {raw_continuous.shape}")
        print(f"Scaler expects {scaler.n_features_in_} features.")
        # continuous_cols has to match the scaler's fitted columns, in order
        raise

    # 3. Design row = [scaled continuous | unscaled categorical].
    # NOTE(review): this layout must match the column order used when the model
    # was trained -- confirm against the training notebook.
    usable_categorical = [name for name in categorical_cols if name in known_columns]
    categorical_block = feature_df[usable_categorical].values
    try:
        X_new = np.concatenate([scaled_block, categorical_block], axis=1)
    except ValueError as e:
        print(f"Error concatenating features: {e}")
        print(f"Scaled continuous shape: {scaled_block.shape}")
        print(f"Categorical shape: {categorical_block.shape}")
        raise

    n_predictors = X_new.shape[1]

    # 4. Point estimates: posterior mean over chains and draws.
    try:
        posterior = idata.posterior
        mean_intercepts = posterior["intercepts"].mean(dim=("chain", "draw")).values
        mean_betas = posterior["betas"].mean(dim=("chain", "draw")).values
    except Exception as e:
        print(f"Error accessing posterior samples in idata: {e}")
        print("Make sure 'intercepts' and 'betas' were saved correctly in idata.")
        raise

    # This variant requires the FULL parameter arrays (reference category included)
    # and refuses to guess how to rebuild them otherwise.
    expected_intercept_shape = (n_categories,)
    expected_beta_shape = (n_predictors, n_categories)
    if mean_intercepts.shape != expected_intercept_shape:
        raise ValueError(f"Mean intercepts shape mismatch. Expected {expected_intercept_shape}, got {mean_intercepts.shape}. Reconstruction might be needed.")
    if mean_betas.shape != expected_beta_shape:
        raise ValueError(f"Mean betas shape mismatch. Expected {expected_beta_shape}, got {mean_betas.shape}. Reconstruction might be needed.")

    # 5. Linear predictor, shape (1, n_categories).
    mu_mean = mean_intercepts + X_new @ mean_betas

    # 6. Softmax; subtracting the row maximum keeps exp() from overflowing.
    shifted = mu_mean - np.max(mu_mean, axis=1, keepdims=True)
    weights = np.exp(shifted)
    p_vector_mean = weights / np.sum(weights, axis=1, keepdims=True)

    # Fallback renormalisation if the total drifted away from 1.
    total = np.sum(p_vector_mean)
    if not np.isclose(total, 1.0):
        print(f"Warning: Probabilities do not sum to 1: {total}")
        p_vector_mean = p_vector_mean / np.sum(p_vector_mean)

    return p_vector_mean.flatten()






In [4]:
# --- Helper Function: Predict PA Outcome Probabilities (Using Mean Posterior) ---
# Needs idata, scaler, predictor_cols, etc., defined in the main scope or passed
# This version uses the mean of the posterior parameters for simplicity

def predict_pa_outcome_probs(pa_inputs_dict, idata, scaler, predictor_cols, continuous_cols, categorical_cols, n_categories):
    """
    Outcome probabilities for one plate appearance from posterior-mean parameters.

    ``intercepts`` and ``betas`` from ``idata.posterior`` are averaged over the
    ``chain`` and ``draw`` dimensions and fed through a softmax. If those arrays
    hold only ``n_categories - 1`` categories, an all-zero reference category is
    appended as the LAST category before the softmax.

    Args:
        pa_inputs_dict (dict): One value per name in ``predictor_cols``.
        idata (arviz.InferenceData): Fitted model inference data.
        scaler (sklearn.preprocessing.StandardScaler): Fitted scaler object.
        predictor_cols (list): All predictor column names.
        continuous_cols (list): Columns passed through ``scaler.transform``.
        categorical_cols (list): Categorical/binary columns, appended unscaled.
        n_categories (int): Number of outcome categories.

    Returns:
        np.ndarray: Probability vector (sums to 1), shape ``(n_categories,)``.

    Raises:
        KeyError: A predictor is missing from ``pa_inputs_dict``.
        ValueError: Parameter shapes match neither the full nor the
            reference-category-dropped layout.
    """
    # 1. One-row matrix of raw inputs, laid out in predictor_cols order.
    try:
        ordered_values = [pa_inputs_dict[name] for name in predictor_cols]
        row = np.array(ordered_values).reshape(1, -1)
    except KeyError as e:
        print(f"Error: Missing key {e} in pa_inputs_dict")
        print(f"Required keys: {predictor_cols}")
        raise

    # Named columns make the continuous/categorical split a simple lookup.
    input_df = pd.DataFrame(row, columns=predictor_cols)

    # 2. Scale continuous inputs with the fitted scaler (transform, never fit_transform).
    raw_continuous = input_df[continuous_cols].values
    try:
        scaled_block = scaler.transform(raw_continuous)
    except Exception as e:
        print(f"Error scaling data: {e}")
        print(f"Input data shape for scaling: {raw_continuous.shape}")
        print(f"Scaler expects {scaler.n_features_in_} features.")
        raise

    # 3. Design row = [scaled continuous | unscaled categorical].
    # NOTE(review): this layout must match the column order used in training -- confirm.
    X_new = np.concatenate([scaled_block, input_df[categorical_cols].values], axis=1)
    n_predictors = X_new.shape[1]

    # 4. Point estimates: posterior mean over chains and draws.
    try:
        posterior = idata.posterior
        mean_intercepts = posterior["intercepts"].mean(dim=("chain", "draw")).values
        mean_betas = posterior["betas"].mean(dim=("chain", "draw")).values
    except Exception as e:
        print(f"Error accessing posterior samples in idata: {e}")
        print("Make sure 'intercepts' and 'betas' were saved correctly.")
        raise

    full_intercept_shape = (n_categories,)
    full_beta_shape = (n_predictors, n_categories)

    # Intercepts: accept the full layout, or pad a zero reference category at the end.
    if mean_intercepts.shape != full_intercept_shape:
        print(f"Warning: Mean intercepts shape mismatch. Expected {full_intercept_shape}, got {mean_intercepts.shape}")
        if mean_intercepts.shape != (n_categories - 1,):
            raise ValueError("Cannot resolve intercept shape mismatch.")
        mean_intercepts = np.concatenate([mean_intercepts, [0.0]])

    # Betas: same rule, padding one all-zero column for the reference category.
    if mean_betas.shape != full_beta_shape:
        print(f"Warning: Mean betas shape mismatch. Expected {full_beta_shape}, got {mean_betas.shape}")
        if mean_betas.shape != (n_predictors, n_categories - 1):
            raise ValueError("Cannot resolve beta shape mismatch.")
        reference_column = np.zeros((n_predictors, 1))
        mean_betas = np.concatenate([mean_betas, reference_column], axis=1)

    # 5. Linear predictor, shape (1, n_categories).
    mu_mean = mean_intercepts + X_new @ mean_betas

    # 6. Softmax; subtracting the row maximum keeps exp() from overflowing.
    shifted = mu_mean - np.max(mu_mean, axis=1, keepdims=True)
    weights = np.exp(shifted)
    p_vector_mean = weights / np.sum(weights, axis=1, keepdims=True)

    # Fallback renormalisation if the total drifted away from 1.
    total = np.sum(p_vector_mean)
    if not np.isclose(total, 1.0):
        print(f"Warning: Probabilities do not sum to 1: {total}")
        p_vector_mean = p_vector_mean / np.sum(p_vector_mean)

    return p_vector_mean.flatten() # single probability vector

## Create Function to Simulate One Inning

In [5]:
# --- Core Inning Simulation Function (Includes GIDP etc.) ---
# Plays one half-inning, plate appearance by plate appearance, until three outs are recorded.
def simulate_single_inning(inning_num, is_top_inning, lineup, start_batter_idx,
                           pitcher_inputs, game_context, # game_context has park, BOTH team defenses
                           idata, scaler, outcome_labels,
                           predictor_cols, continuous_cols, categorical_cols,
                           n_categories, league_avg_rates):
    """
    Simulate one half-inning with simplified base running.

    Each plate appearance draws an outcome from the probability vector returned
    by ``predict_pa_outcome_probs`` and applies fixed base-running rules until
    three outs are recorded.

    Args:
        inning_num: Inning number; accepted for the caller's convenience, not
            used by the simulation itself.
        is_top_inning (bool): True when the away team bats (home team fields).
        lineup (sequence of dict): Per-batter input dicts; ``'stand'`` is read to
            derive the platoon flag.
        start_batter_idx (int): Lineup index of the first batter of the inning.
        pitcher_inputs (dict): Pitcher inputs; ``'p_throws'`` is read to derive
            the platoon flag.
        game_context (dict): Must contain ``'park_factor_input'``,
            ``'home_team_defense_rating'`` and ``'away_team_defense_rating'``.
        idata, scaler, predictor_cols, continuous_cols, categorical_cols,
        n_categories: Forwarded unchanged to ``predict_pa_outcome_probs``.
        outcome_labels (dict): Outcome code -> label. The labels handled are
            "Strikeout", "Walk", "HBP", "Single", "Double", "Triple", "HomeRun"
            and "Out_In_Play"; any other label leaves the game state untouched.
        league_avg_rates (dict): Requires ``"rate_1st_to_3rd_on_single"`` and
            ``"rate_score_from_1st_on_double"``; ``"gidp_effective_rate"`` is
            optional and defaults to 0.065.

    Returns:
        tuple: ``(hits, runs, walks, next_batter_idx)`` where ``next_batter_idx``
        is the lineup index due up in this team's next inning.

    Raises:
        ValueError: If ``lineup`` is empty.
    """
    lineup_len = len(lineup)
    if lineup_len == 0:
        raise ValueError("lineup must contain at least one batter")

    outs = 0
    hits = 0
    runs = 0
    walks = 0
    bases = [0, 0, 0] # [1st, 2nd, 3rd]; 0=empty, 1=runner present
    current_batter_idx = start_batter_idx

    # The fielding side (and hence the defense rating) is the team NOT batting.
    if is_top_inning: # Away team batting, Home team fielding
        fielding_team_defense_rating = game_context['home_team_defense_rating']
        is_batter_home = 0
    else: # Home team batting, Away team fielding
        fielding_team_defense_rating = game_context['away_team_defense_rating']
        is_batter_home = 1

    inning_context_pa = {
        'park_factor_input': game_context['park_factor_input'],
        'team_defense_oaa_input': fielding_team_defense_rating,
        'is_batter_home': is_batter_home
    }

    # Loop-invariant: the candidate outcome codes never change within an inning.
    possible_outcomes = list(outcome_labels.keys())

    while outs < 3:
        current_batter_inputs = lineup[current_batter_idx % lineup_len]

        # Platoon advantage = batter and pitcher are opposite-handed.
        try:
            batter_stand = current_batter_inputs['stand']
            pitcher_throws = pitcher_inputs['p_throws']
            is_platoon = 1 if (batter_stand == 'L' and pitcher_throws == 'R') or \
                              (batter_stand == 'R' and pitcher_throws == 'L') else 0
        except KeyError as e:
            print(f"Warning: Missing 'stand' or 'p_throws' in inputs: {e}. Assuming no platoon advantage.")
            is_platoon = 0

        pa_inputs = {
            **current_batter_inputs,
            **pitcher_inputs,
            **inning_context_pa,
            'is_platoon_adv': is_platoon
        }
        # Drop anything that is neither a predictor nor a handedness field.
        pa_inputs = {k: v for k, v in pa_inputs.items() if k in predictor_cols or k in ['stand', 'p_throws']}

        outcome_probs = predict_pa_outcome_probs(
            pa_inputs, idata, scaler, predictor_cols,
            continuous_cols, categorical_cols, n_categories
        )

        simulated_outcome_code = np.random.choice(possible_outcomes, p=outcome_probs)
        outcome_label = outcome_labels[simulated_outcome_code]

        new_bases = list(bases)
        runs_this_pa = 0
        pa_hit = 0
        pa_walk = 0

        # Outs *before* this PA is resolved, needed for the double-play check.
        outs_before_pa = outs

        if outcome_label == "Strikeout":
            outs += 1

        elif outcome_label in ("Walk", "HBP"):
            # Only walks are tallied; a hit-by-pitch moves runners the same way.
            if outcome_label == "Walk":
                pa_walk = 1
            # Runners advance only when forced by the runner behind them.
            if bases[0] == 1:
                if bases[1] == 1:
                    if bases[2] == 1:
                        runs_this_pa += 1
                    new_bases[2] = 1
                new_bases[1] = 1
            new_bases[0] = 1

        elif outcome_label == "Single":
            pa_hit = 1
            # Runners on 2nd and 3rd always score on a single.
            runs_this_pa += bases[1] + bases[2]
            new_bases = [1, 0, 0] # Batter to 1st
            # Runner on 1st goes to 3rd with the league-average rate, else to 2nd.
            # (The original referenced an undefined name here and raised NameError.)
            if bases[0] == 1:
                if random.random() < league_avg_rates["rate_1st_to_3rd_on_single"]:
                    new_bases[2] = 1
                else:
                    new_bases[1] = 1

        elif outcome_label == "Double":
            pa_hit = 1
            # Runners on 2nd and 3rd always score on a double.
            runs_this_pa += bases[1] + bases[2]
            new_bases = [0, 1, 0] # Batter to 2nd
            # Runner on 1st scores with the league-average rate, else stops at 3rd.
            if bases[0] == 1:
                if random.random() < league_avg_rates["rate_score_from_1st_on_double"]:
                    runs_this_pa += 1
                else:
                    new_bases[2] = 1

        elif outcome_label == "Triple":
            pa_hit = 1
            runs_this_pa += sum(bases) # All runners score
            new_bases = [0, 0, 1] # Batter to 3rd

        elif outcome_label == "HomeRun":
            pa_hit = 1
            runs_this_pa += 1 + sum(bases)
            new_bases = [0, 0, 0]

        elif outcome_label == "Out_In_Play":
            outs += 1
            # Double play needs a runner on 1st and fewer than 2 outs *before* this PA.
            is_gidp_opportunity = (bases[0] == 1 and outs_before_pa < 2)
            # Batted-ball type isn't predicted, so a pre-calculated effective rate is used.
            adjusted_gidp_rate = league_avg_rates.get("gidp_effective_rate", 0.065)
            is_double_play = is_gidp_opportunity and random.random() < adjusted_gidp_rate
            if is_double_play:
                # outs_before_pa < 2 guarantees this is at most the third out.
                outs += 1

            new_bases = [0, 0, 0] # Batter is out
            # Runner from 1st: retired at 2nd on a double play, otherwise advances.
            if bases[0] == 1 and not is_double_play:
                new_bases[1] = 1
            if bases[1] == 1:
                # Runner from 2nd takes 3rd and pushes any runner already there home.
                # The run counts only if this play did not end the inning.
                # (Previously the runner from 3rd was dropped without scoring.)
                new_bases[2] = 1
                if bases[2] == 1 and outs < 3:
                    runs_this_pa += 1
            elif bases[2] == 1:
                new_bases[2] = 1 # Runner on 3rd holds when not pushed

        # Update inning totals and base state
        runs += runs_this_pa
        hits += pa_hit
        walks += pa_walk
        bases = new_bases

        # --- Optional: Secondary Events Logic here ---

        # Next batter is due up, whether in this inning or the team's next one.
        current_batter_idx += 1

    # Inning Over
    return hits, runs, walks, (current_batter_idx % lineup_len)

In [3]:
# --- Function to Simulate First 3 Innings ---
def simulate_first_three_innings(
    home_lineup, away_lineup, home_pitcher_inputs, away_pitcher_inputs,
    game_context, # Dict with park_factor_input, home_team_defense_rating, away_team_defense_rating
    idata, scaler, outcome_labels, predictor_cols, continuous_cols,
    categorical_cols, n_categories, league_avg_rates):
    """
    Simulates the first 3 innings of a game.

    Each inning plays the top half (away lineup vs. the home pitcher) and then
    the bottom half (home lineup vs. the away pitcher) via
    ``simulate_single_inning``. Each team's batting order carries over from one
    inning to the next.

    Args:
        home_lineup, away_lineup: Per-batter input dicts for each team.
        home_pitcher_inputs, away_pitcher_inputs: Inputs for each team's pitcher.
        game_context (dict): Passed through unchanged to ``simulate_single_inning``,
            which reads ``park_factor_input``, ``home_team_defense_rating`` and
            ``away_team_defense_rating`` and picks the fielding side itself.
        idata, scaler, outcome_labels, predictor_cols, continuous_cols,
        categorical_cols, n_categories, league_avg_rates: Forwarded unchanged.

    Returns:
        dict: Results containing hits, runs, walks per team per inning.
              Example: {'inning_1': {'away': {'H':1,'R':0,'BB':0}, 'home': {'H':0,'R':0,'BB':1}}, ...}
    """
    results = {}
    away_batter_idx = 0
    home_batter_idx = 0

    for inning in range(1, 4): # Innings 1, 2, 3
        # --- Top of Inning: away bats against the home pitcher ---
        # The full game_context is handed over: simulate_single_inning looks up
        # both teams' defense ratings by key, so a pre-built per-half-inning dict
        # without those keys would make it raise KeyError.
        away_hits, away_runs, away_walks, away_batter_idx = simulate_single_inning(
            inning, True, away_lineup, away_batter_idx, home_pitcher_inputs,
            game_context, idata, scaler, outcome_labels,
            predictor_cols, continuous_cols, categorical_cols, n_categories, league_avg_rates
        )

        # --- Bottom of Inning: home bats against the away pitcher ---
        home_hits, home_runs, home_walks, home_batter_idx = simulate_single_inning(
            inning, False, home_lineup, home_batter_idx, away_pitcher_inputs,
            game_context, idata, scaler, outcome_labels,
            predictor_cols, continuous_cols, categorical_cols, n_categories, league_avg_rates
        )

        results[f'inning_{inning}'] = {
            'away': {'H': away_hits, 'R': away_runs, 'BB': away_walks},
            'home': {'H': home_hits, 'R': home_runs, 'BB': home_walks},
        }

    return results

SyntaxError: ':' expected after dictionary key (<ipython-input-3-ee256b6228c6>, line 6)

## Run Simulations

In [None]:
#--- Run Multiple Simulations ---
# NOTE(review): home_lineup, away_lineup, home_pitcher_inputs, away_pitcher_inputs,
# game_context and league_avg_rates are not defined in any cell shown above; they
# must exist in the kernel before this cell runs. No random seed is set either,
# so repeated runs give different results.
num_total_simulations = 10000
print(f"Running {num_total_simulations} game simulations...")

# One first-three-innings result dict per simulated game.
all_results = []
for _ in range(num_total_simulations):
    sim_result = simulate_first_three_innings(
        home_lineup=home_lineup,
        away_lineup=away_lineup,
        home_pitcher_inputs=home_pitcher_inputs,
        away_pitcher_inputs=away_pitcher_inputs,
        game_context=game_context,
        idata=idata,
        scaler=scaler,
        outcome_labels=outcome_labels,
        predictor_cols=predictor_cols,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        n_categories=n_categories,
        league_avg_rates=league_avg_rates,
    )
    all_results.append(sim_result)

In [6]:
import polars as pl
import collections # Needed for example data generation
import random      # Needed for example data generation
import json        # For printing example data

# --- Example final_probabilities dictionary ---
# Hard-coded placeholder values: this literal is NOT computed from all_results.
# Layout: inning key ('inning_N') -> team ('away'/'home') -> stat -> count bin
# ('0'..'4', '5+') -> probability. Each stat's bins sum to 1.
# NOTE(review): the stats here are 'H', 'BB' and 'HR', whereas
# simulate_first_three_innings reports 'H', 'R' and 'BB' -- reconcile before
# replacing this example with tallied simulation output.
final_probabilities = {
  "inning_1": {
    "away": {
      "H": {"0": 0.60, "1": 0.25, "2": 0.10, "3": 0.03, "4": 0.01, "5+": 0.01},
      "BB": {"0": 0.70, "1": 0.20, "2": 0.05, "3": 0.02, "4": 0.01, "5+": 0.02},
      "HR": {"0": 0.90, "1": 0.07, "2": 0.02, "3": 0.01, "4": 0.00, "5+": 0.00}
    },
    "home": { # example values for the home half of inning 1
      "H": {"0": 0.55, "1": 0.28, "2": 0.12, "3": 0.03, "4": 0.01, "5+": 0.01},
      "BB": {"0": 0.65, "1": 0.22, "2": 0.07, "3": 0.03, "4": 0.02, "5+": 0.01},
      "HR": {"0": 0.88, "1": 0.08, "2": 0.02, "3": 0.01, "4": 0.01, "5+": 0.00}
    }
  },
  "inning_2": { # example values for inning 2
      "away": {
          "H": {"0": 0.62, "1": 0.24, "2": 0.09, "3": 0.03, "4": 0.01, "5+": 0.01},
          "BB": {"0": 0.72, "1": 0.19, "2": 0.05, "3": 0.02, "4": 0.01, "5+": 0.01},
          "HR": {"0": 0.91, "1": 0.06, "2": 0.02, "3": 0.01, "4": 0.00, "5+": 0.00}
        },
      "home": {
          "H": {"0": 0.57, "1": 0.27, "2": 0.11, "3": 0.03, "4": 0.01, "5+": 0.01},
          "BB": {"0": 0.67, "1": 0.21, "2": 0.07, "3": 0.03, "4": 0.01, "5+": 0.01},
          "HR": {"0": 0.89, "1": 0.07, "2": 0.02, "3": 0.01, "4": 0.01, "5+": 0.00}
        }
  },
   "inning_3": { # example values for inning 3
      "away": {
          "H": {"0": 0.61, "1": 0.26, "2": 0.08, "3": 0.03, "4": 0.01, "5+": 0.01},
          "BB": {"0": 0.71, "1": 0.20, "2": 0.05, "3": 0.02, "4": 0.01, "5+": 0.01},
          "HR": {"0": 0.92, "1": 0.05, "2": 0.02, "3": 0.01, "4": 0.00, "5+": 0.00}
        },
      "home": {
          "H": {"0": 0.56, "1": 0.29, "2": 0.10, "3": 0.03, "4": 0.01, "5+": 0.01},
          "BB": {"0": 0.66, "1": 0.23, "2": 0.06, "3": 0.03, "4": 0.01, "5+": 0.01},
          "HR": {"0": 0.90, "1": 0.07, "2": 0.02, "3": 0.01, "4": 0.00, "5+": 0.00}
        }
   }
}
# --- End Example Data ---


# --- Flatten the nested dictionary into one record per (inning, team, stat, bin) ---
data_for_df = []
for inning_key, per_team in final_probabilities.items():
    # Keys look like 'inning_X'; the piece after the first underscore is the number.
    try:
        inning_num = int(inning_key.split('_')[1])
    except (IndexError, ValueError):
        print(f"Warning: Could not parse inning number from key '{inning_key}'. Skipping.")
        continue

    # team is 'away'/'home', stat is e.g. 'H'/'BB'/'HR', bin label is '0'..'5+'.
    data_for_df.extend(
        {
            "inning": inning_num,
            "team": team_name,
            "stat": stat_name,
            "number": bin_label, # bin label stays a string ('0', '1', ..., '5+')
            "probability": bin_probability,
        }
        for team_name, per_stat in per_team.items()
        for stat_name, per_bin in per_stat.items()
        for bin_label, bin_probability in per_bin.items()
    )

# --- Build the Polars DataFrame ---
# Explicit column types: categoricals for the low-cardinality labels, strings
# for the bin labels because '5+' is not numeric.
schema = dict(
    inning=pl.Int64,
    team=pl.Categorical,
    stat=pl.Categorical,
    number=pl.Utf8,
    probability=pl.Float64,
)

if not data_for_df:
    # Nothing was flattened, so there is no frame to build.
    print("\nNo data processed to create the DataFrame.")
else:
    df_probabilities = pl.DataFrame(data_for_df, schema=schema)

    print("\nFinal Probability DataFrame:")
    # Sorted only for display; df_probabilities itself keeps insertion order.
    print(df_probabilities.sort(["inning", "team", "stat", "number"]))


Final Probability DataFrame:
shape: (108, 5)
┌────────┬──────┬──────┬────────┬─────────────┐
│ inning ┆ team ┆ stat ┆ number ┆ probability │
│ ---    ┆ ---  ┆ ---  ┆ ---    ┆ ---         │
│ i64    ┆ cat  ┆ cat  ┆ str    ┆ f64         │
╞════════╪══════╪══════╪════════╪═════════════╡
│ 1      ┆ away ┆ H    ┆ 0      ┆ 0.6         │
│ 1      ┆ away ┆ H    ┆ 1      ┆ 0.25        │
│ 1      ┆ away ┆ H    ┆ 2      ┆ 0.1         │
│ 1      ┆ away ┆ H    ┆ 3      ┆ 0.03        │
│ 1      ┆ away ┆ H    ┆ 4      ┆ 0.01        │
│ 1      ┆ away ┆ H    ┆ 5+     ┆ 0.01        │
│ 1      ┆ away ┆ BB   ┆ 0      ┆ 0.7         │
│ 1      ┆ away ┆ BB   ┆ 1      ┆ 0.2         │
│ 1      ┆ away ┆ BB   ┆ 2      ┆ 0.05        │
│ 1      ┆ away ┆ BB   ┆ 3      ┆ 0.02        │
│ 1      ┆ away ┆ BB   ┆ 4      ┆ 0.01        │
│ 1      ┆ away ┆ BB   ┆ 5+     ┆ 0.02        │
│ 1      ┆ away ┆ HR   ┆ 0      ┆ 0.9         │
│ 1      ┆ away ┆ HR   ┆ 1      ┆ 0.07        │
│ 1      ┆ away ┆ HR   ┆ 2      ┆ 0.02    

## Analyze Results