In [2]:
import pymc as pm
import numpy as np
import arviz as az
import pandas as pd
import pytensor.tensor as pt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
df = pd.read_csv('master_df.csv')

def american_to_implied_prob(odds):
    """Convert American odds to implied probability (with vig)."""
    return 100 / (odds + 100) if odds > 0 else -odds / (-odds + 100)

def add_fair_implied_probs(df):
    odds_columns = [
        'betmgm_opening_homeOdds', 'betmgm_opening_awayOdds',
        'fanduel_opening_homeOdds', 'fanduel_opening_awayOdds',
        'caesars_opening_homeOdds', 'caesars_opening_awayOdds',
        'bet365_opening_homeOdds', 'bet365_opening_awayOdds',
        'draftkings_opening_homeOdds', 'draftkings_opening_awayOdds'
    ]
    
    for i in range(0, len(odds_columns), 2):
        home_col = odds_columns[i]
        away_col = odds_columns[i+1]

        home_prob = df[home_col].apply(american_to_implied_prob)
        away_prob = df[away_col].apply(american_to_implied_prob)

        total_prob = home_prob + away_prob

        # Use .loc to avoid SettingWithCopyWarning
        df.loc[:, home_col.replace('Odds', 'FairProb')] = home_prob / total_prob
        df.loc[:, away_col.replace('Odds', 'FairProb')] = away_prob / total_prob

    return df

odds_df = add_fair_implied_probs(df)

def create_beta_prior(df):
    fair_prob_cols = [
        'betmgm_opening_homeFairProb',
        'fanduel_opening_homeFairProb',
        'caesars_opening_homeFairProb',
        'bet365_opening_homeFairProb',
        'draftkings_opening_homeFairProb'
    ]

    # Alpha: sum of home fair probs
    df['beta_alpha'] = df[fair_prob_cols].sum(axis=1)
    # Beta: sum of (1 - home fair probs)
    df['beta_beta'] = len(fair_prob_cols) - df['beta_alpha']

    # Optional: posterior mean for reference
    df['beta_mean'] = df['beta_alpha'] / (df['beta_alpha'] + df['beta_beta'])

    return df

beta_priors = create_beta_prior(odds_df)
beta_priors

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,betmgm_opening_homeOdds,betmgm_opening_awayOdds,...,fanduel_opening_awayFairProb,caesars_opening_homeFairProb,caesars_opening_awayFairProb,bet365_opening_homeFairProb,bet365_opening_awayFairProb,draftkings_opening_homeFairProb,draftkings_opening_awayFairProb,beta_alpha,beta_beta,beta_mean
0,259014,2022-05-21T00:10:00+00:00,Milwaukee,Washington,Eric Lauer,Erick Fedde,7,0,-190.0,155.0,...,0.369898,0.625202,0.374798,0.637105,0.362895,0.611511,0.388489,3.129484,1.870516,0.625897
1,259007,2022-05-21T00:10:00+00:00,Houston,Texas,Cristian Javier,Martin Perez,0,3,-185.0,150.0,...,0.369898,0.596849,0.403151,0.611511,0.388489,0.596273,0.403727,3.053464,1.946536,0.610693
2,259026,2022-05-21T17:05:00+00:00,NY Yankees,Chi. White Sox,Nestor Cortes,Dallas Keuchel,7,5,-225.0,180.0,...,0.338696,0.666598,0.333402,0.658120,0.341880,0.588212,0.411788,3.233919,1.766081,0.646784
3,259023,2022-05-21T19:07:00+00:00,Toronto,Cincinnati,Alek Manoah,Hunter Greene,3,1,-250.0,200.0,...,0.314607,0.693737,0.306263,0.694656,0.305344,0.658120,0.341880,3.413725,1.586275,0.682745
4,265633,2022-05-21T19:10:00+00:00,Colorado,NY Mets,German Marquez,Carlos Carrasco,1,5,-105.0,-115.0,...,0.515397,0.467532,0.532468,0.467532,0.532468,0.467532,0.532468,2.376366,2.623634,0.475273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,301021,2024-08-21T00:10:00+00:00,Kansas City,LA Angels,Cole Ragans,Tyler Anderson,5,9,-210.0,170.0,...,0.369898,0.634351,0.365649,0.630102,0.369898,0.644729,0.355271,3.185807,1.814193,0.637161
2347,301022,2024-08-21T00:10:00+00:00,Houston,Boston,Ronel Blanco,Nick Pivetta,5,6,-135.0,110.0,...,0.463039,0.542744,0.457256,0.542744,0.457256,0.542744,0.457256,2.711961,2.288039,0.542392
2348,301034,2024-08-21T01:40:00+00:00,San Diego,Minnesota,Martin Perez,Bailey Ober,7,5,100.0,-120.0,...,0.508909,0.478261,0.521739,0.478261,0.521739,0.473913,0.526087,2.399787,2.600213,0.479957
2349,301025,2024-08-21T01:45:00+00:00,San Francisco,Chi. White Sox,Robbie Ray,Davis Martin,4,1,-250.0,200.0,...,0.299838,0.671390,0.328610,0.669007,0.330993,0.675031,0.324969,3.397409,1.602591,0.679482


In [9]:
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
from sklearn.model_selection import train_test_split

# ------------------------------
# Data Preparation
# ------------------------------
df = beta_priors  # Assume this is already loaded

# Ensure numeric types
df['alpha'] = pd.to_numeric(df['alpha'], errors='coerce')
df['beta'] = pd.to_numeric(df['beta'], errors='coerce')
df['gamma'] = pd.to_numeric(df['gamma'], errors='coerce')
df['X'] = pd.to_numeric(df['X'], errors='coerce')
df['beta_alpha'] = pd.to_numeric(df['beta_alpha'], errors='coerce')
df['beta_beta'] = pd.to_numeric(df['beta_beta'], errors='coerce')

# Filter valid rows
df_clean = df.dropna(subset=['alpha', 'beta', 'gamma', 'X', 'beta_alpha', 'beta_beta']).copy()
df_clean = df_clean.reset_index(drop=True)

# ------------------------------
# Train/Validation Split
# ------------------------------
train_df = df_clean.sample(frac=1, random_state=42).reset_index(drop=True)


# ------------------------------
# Extract Train Data
# ------------------------------
alpha_data = np.clip(train_df['alpha'].values, 1e-6, None)
beta_data = np.clip(train_df['beta'].values, 1e-6, None)
gamma_data = np.clip(train_df['gamma'].values, 0.2, 5)
X_data = train_df['X'].astype(int).values

beta_alpha_raw = train_df['beta_alpha'].values
beta_beta_raw = train_df['beta_beta'].values

# ------------------------------
# PyMC Model with Sportsbook Blending (Weight = 1.0)
# ------------------------------
w = 1.0

with pm.Model() as model:
    r1 = pm.Uniform('r1', lower=-1, upper=4)
    r2 = pm.Uniform('r2', lower=-4, upper=4)
    r3 = pm.Uniform('r3', lower=-1, upper=4)
    delta = pm.Uniform('delta', lower=0, upper=4)

    log_m = pm.Normal("log_m", mu=2.5, sigma=1.0)
    m = pm.Deterministic("m", pm.math.minimum(pm.math.exp(log_m), 50.0))

    lambda_ = alpha_data ** r1 * beta_data ** r2 * gamma_data ** r3
    lambda_delta = lambda_ * delta

    a_model = m * lambda_delta
    b_model = m

    scale_alpha = pm.Deterministic("scale_alpha", pm.math.mean(a_model) / np.mean(beta_alpha_raw))
    scale_beta = pm.Deterministic("scale_beta", pm.math.mean(b_model) / np.mean(beta_beta_raw))

    a_total = a_model + w * scale_alpha * beta_alpha_raw
    b_total = b_model + w * scale_beta * beta_beta_raw

    p_s = pm.Beta('p_s', alpha=a_total, beta=b_total, shape=len(X_data))
    X_obs = pm.Bernoulli('X_obs', p=p_s, observed=X_data)

    trace = pm.sample(
        draws=1000,
        tune=1000,
        target_accept=0.97,
        cores=4,
        random_seed=42,
        return_inferencedata=True,
        idata_kwargs={"log_likelihood": True}
    )

# ------------------------------
# Prediction Function
# ------------------------------
def predict_win_probability_from_trace(trace, alpha, beta, gamma):
    posterior = trace.posterior
    r1 = posterior['r1'].stack(sample=("chain", "draw")).values
    r2 = posterior['r2'].stack(sample=("chain", "draw")).values
    r3 = posterior['r3'].stack(sample=("chain", "draw")).values
    delta = posterior['delta'].stack(sample=("chain", "draw")).values
    m = posterior['m'].stack(sample=("chain", "draw")).values

    n_samples = len(r1)
    n_games = len(alpha)

    alpha_exp = np.tile(alpha, (n_samples, 1))
    beta_exp = np.tile(beta, (n_samples, 1))
    gamma_exp = np.tile(gamma, (n_samples, 1))

    lambda_ = (alpha_exp ** r1[:, None]) * (beta_exp ** r2[:, None]) * (gamma_exp ** r3[:, None])
    lambda_delta = lambda_ * delta[:, None]

    a = m[:, None] * lambda_delta
    b = m[:, None]

    p_samples = np.random.beta(a, b)
    p_mean = p_samples.mean(axis=0)

    return p_mean, p_samples

# ------------------------------
# Validation Scoring
# ------------------------------
val_alpha = np.clip(val_df['alpha'].values, 1e-6, None)
val_beta = np.clip(val_df['beta'].values, 1e-6, None)
val_gamma = np.clip(val_df['gamma'].values, 0.2, 5)
val_X = val_df['X'].astype(int).values

p_mean, p_samples = predict_win_probability_from_trace(trace, val_alpha, val_beta, val_gamma)

p_hard = (p_mean > 0.5).astype(int)
score_hard = 1 - np.mean((p_hard - val_X) ** 2)
score_soft = 1 - np.mean((p_mean - val_X) ** 2)

print(f"→ Final Model (weight={w}) | Hard Score: {score_hard:.4f} | Soft Score: {score_soft:.4f}")


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [r1, r2, r3, delta, log_m, p_s]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 80 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


→ Final Model (weight=1.0) | Hard Score: 0.5433 | Soft Score: 0.7517


In [None]:
t

Validation Score (Hard): 0.5576923076923077
Validation Score (Soft): nan


NOW LET'S SEE HOW WE DID

In [21]:
filtered_test_data = test_data[
    (test_data['homeTeam'] == 'LA Dodgers') | (test_data['awayTeam'] == 'LA Dodgers')
].copy()

filtered_test_data

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,awayTeam_batting_avg,homePitcher_normalized,awayPitcher_normalized,game_year,homePitcher_era,awayPitcher_era,alpha,beta,gamma,X
0,341271,2025-03-29 02:10:00+00:00,LA Dodgers,Detroit,Yoshinobu Yamamoto,Jack Flaherty,8,5,-198,178,...,0.264706,Yoshinobu Yamamoto,Jack Flaherty,2025,2.7,3.18,inf,0.835089,1.177778,1
15,341396,2025-04-07 22:45:00+00:00,Washington,LA Dodgers,MacKenzie Gore,Dustin May,6,4,134,-150,...,0.225397,Mackenzie Gore,Dustin May,2025,2.45,0.0,0.428021,0.959267,0.0,1
23,341421,2025-04-09 20:05:00+00:00,Washington,LA Dodgers,Jake Irvin,Landon Knack,5,6,156,-11000,...,0.231169,Jake Irvin,Landon Knack,2025,5.4,0.0,0.715409,1.00035,0.0,0
29,341450,2025-04-12 02:10:00+00:00,LA Dodgers,Chi. Cubs,Yoshinobu Yamamoto,Matthew Boyd,3,0,-176,170,...,0.25,Yoshinobu Yamamoto,Matthew Boyd,2025,1.23,1.59,1.231317,0.921052,1.292683,1
38,341490,2025-04-15 02:10:00+00:00,LA Dodgers,Colorado,Dustin May,Antonio Senzatela,5,3,-340,290,...,0.217742,Dustin May,Antonio Senzatela,2025,1.06,5.89,3.125,1.03354,5.556604,1
66,341614,2025-04-23 23:00:00+00:00,Chi. Cubs,LA Dodgers,Matthew Boyd,Ben Casparius,7,6,-110,-160,...,0.227334,Matthew Boyd,Ben Casparius,2025,2.01,4.11,0.832857,1.1335,2.044776,1
85,341679,2025-04-29 02:10:00+00:00,LA Dodgers,Miami,Dustin May,Edward Cabrera,7,6,-295,125,...,0.255615,Dustin May,Edward Cabrera,2025,3.95,7.23,1.778667,0.932269,1.83038,1
96,341727,2025-05-02 23:15:00+00:00,Atlanta,LA Dodgers,Grant Holmes,Yoshinobu Yamamoto,1,2,110,-166,...,0.257722,Grant Holmes,Yoshinobu Yamamoto,2025,4.5,1.06,0.67052,0.951878,0.235556,0


In [15]:
((test_data['alpha'] == 0) | (test_data['beta'] == 0) | (test_data['gamma'] == 0)).sum()


8

In [19]:
import numpy as np
import pandas as pd

# Replace with your actual column names
columns_to_check = ['alpha', 'beta', 'gamma']

# Create a mask for problematic values
mask = (
    (test_data[columns_to_check] == 0) |  # check for zero
    test_data[columns_to_check].isna() |  # check for NaN/None
    ~np.isfinite(test_data[columns_to_check])  # check for inf/-inf
)

# Any column has an issue in a row?
bad_rows = mask.any(axis=1)

# Get the indexes of those rows, sorted
bad_indexes = test_data[bad_rows].index.sort_values()

# Print or return
print("Indexes with 0, inf, or None in specified columns:", bad_indexes.tolist())


Indexes with 0, inf, or None in specified columns: [0, 2, 7, 9, 11, 15, 16, 20, 23, 26, 36, 67]


In [20]:
dodgers_mask = (test_data['homeTeam'] == 'LA Dodgers') | (test_data['awayTeam'] == 'LA Dodgers')
dodgers_indexes = test_data[dodgers_mask].index

print("Indexes with LA Dodgers as home or away team:", dodgers_indexes.tolist())


Indexes with LA Dodgers as home or away team: [0, 15, 23, 29, 38, 66, 85, 96]
