# Simulate 2025 Season and Compare

In [1]:
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm

In [2]:
TEAM_ID_MAP = {     # map team id to team name
    108: 'Los Angeles Angels', 109: 'Arizona Diamondbacks', 110: 'Baltimore Orioles',
    111: 'Boston Red Sox', 112: 'Chicago Cubs', 113: 'Cincinnati Reds',
    114: 'Cleveland Guardians', 115: 'Colorado Rockies', 116: 'Detroit Tigers',
    117: 'Houston Astros', 118: 'Kansas City Royals', 119: 'Los Angeles Dodgers',
    120: 'Washington Nationals', 121: 'New York Mets', 133: 'Oakland Athletics',
    134: 'Pittsburgh Pirates', 135: 'San Diego Padres', 136: 'Seattle Mariners',
    137: 'San Francisco Giants', 138: 'St. Louis Cardinals', 139: 'Tampa Bay Rays',
    140: 'Texas Rangers', 141: 'Toronto Blue Jays', 142: 'Minnesota Twins',
    143: 'Philadelphia Phillies', 144: 'Atlanta Braves', 145: 'Chicago White Sox',
    146: 'Miami Marlins', 147: 'New York Yankees', 158: 'Milwaukee Brewers'
}

---
## Load Model and Statistics

In [3]:
model = joblib.load('../data/random_forest.pkl')            # Random Forest model
team_stats = pd.read_csv('../data/team_stats.csv')

df = pd.read_csv('../data/mlb_2015_2025_dataset.csv')       # grab only 2025 stats
df['date'] = pd.to_datetime(df['date'])
schedule_2025 = df[df['date'].dt.year == 2025].copy()

if len(schedule_2025) == 0:
    print("Error: No games found for 2025 in the dataset.")
    exit()

print("Preparing 2025 schedule features...")
schedule_2025 = schedule_2025.merge(
    team_stats, left_on='home_team', right_on='team_name', suffixes=('', '_h')
)

schedule_2025 = schedule_2025.merge(
    team_stats, left_on='away_team', right_on='team_name', suffixes=('_h', '_a')
)

# differential stats
schedule_2025['diff_run_diff'] = schedule_2025['run_diff_h'] - schedule_2025['run_diff_a']
schedule_2025['diff_ops'] = schedule_2025['ops_h'] - schedule_2025['ops_a']
schedule_2025['diff_whip'] = schedule_2025['whip_h'] - schedule_2025['whip_a']
schedule_2025['diff_wins_last_10'] = schedule_2025['wins_last_10_h'] - schedule_2025['wins_last_10_a']
schedule_2025['diff_games_last_7'] = schedule_2025['games_last_7_h'] - schedule_2025['games_last_7_a']
schedule_2025['park_factor'] = schedule_2025['park_factor_h']

features = [
    'temp', 'wind_speed', 'diff_run_diff', 'diff_ops',
    'diff_whip', 'diff_wins_last_10', 'diff_games_last_7', 'park_factor'
]

X = schedule_2025[features].fillna(0)
schedule_2025['home_win_prob'] = model.predict_proba(X)[:, 1]

Preparing 2025 schedule features...


---
## Monte Carlo Simulation
100,000 simulations of the 2025 schedule

In [4]:
n_simulations = 100000
sim_results = []

print(f"Simulating the 2025 season {n_simulations} times...")
for i in tqdm(range(n_simulations)):
    draws = np.random.rand(len(schedule_2025))
    schedule_2025['sim_win'] = (schedule_2025['home_win_prob'] > draws).astype(int)

    h_wins = schedule_2025.groupby('home_team')['sim_win'].sum()
    a_wins = schedule_2025.groupby('away_team')['sim_win'].apply(lambda x: (x == 0).sum())

    total_wins = h_wins.add(a_wins, fill_value=0)
    sim_results.append(total_wins)

Simulating the 2025 season 100000 times...


100%|██████████| 100000/100000 [05:17<00:00, 314.80it/s]


In [5]:
results_df = pd.concat(sim_results, axis=1)

summary = pd.DataFrame({
    'Avg_Wins': results_df.mean(axis=1),
    'P10_Wins': results_df.quantile(0.1, axis=1),
    'P90_Wins': results_df.quantile(0.9, axis=1),
})

summary.index = summary.index.map(TEAM_ID_MAP)
summary = summary.sort_values('Avg_Wins', ascending=False)

print("\n--- 2025 MONTE CARLO PROJECTIONS ---")
print(summary.round(1))

summary.to_csv('../data/projections_2025.csv')
print("\nResults saved to 'projections_2025.csv'")


--- 2025 MONTE CARLO PROJECTIONS ---
                       Avg_Wins  P10_Wins  P90_Wins
home_team                                          
Los Angeles Dodgers        99.8      91.0     108.0
New York Yankees           93.7      85.0     102.0
Toronto Blue Jays          92.9      84.0     101.0
Philadelphia Phillies      92.5      84.0     101.0
Chicago Cubs               92.0      84.0     100.0
Detroit Tigers             91.8      84.0     100.0
Seattle Mariners           91.6      83.0     100.0
Milwaukee Brewers          90.4      82.0      99.0
Boston Red Sox             89.0      81.0      97.0
San Diego Padres           87.7      80.0      96.0
Tampa Bay Rays             85.4      77.0      93.0
New York Mets              84.4      76.0      93.0
Cincinnati Reds            84.2      76.0      92.0
Cleveland Guardians        82.1      74.0      90.0
Kansas City Royals         81.6      73.0      90.0
Texas Rangers              80.9      73.0      89.0
Arizona Diamondbacks      

---
## Compare to 2025 Actual

In [12]:
actual_wins = {
    'Toronto Blue Jays': 94, 'New York Yankees': 94, 'Boston Red Sox': 89, 'Tampa Bay Rays': 77, 'Baltimore Orioles': 75,
    'Cleveland Guardians': 88, 'Detroit Tigers': 87, 'Kansas City Royals': 82, 'Minnesota Twins': 70, 'Chicago White Sox': 60,
    'Seattle Mariners': 90, 'Houston Astros': 87, 'Texas Rangers': 81, 'Oakland Athletics': 76, 'Los Angeles Angels': 72,
    'Philadelphia Phillies': 96, 'New York Mets': 83, 'Miami Marlins': 79, 'Atlanta Braves': 76, 'Washington Nationals': 66,
    'Milwaukee Brewers': 97, 'Chicago Cubs': 92, 'Cincinnati Reds': 83, 'St. Louis Cardinals': 78, 'Pittsburgh Pirates': 71,
    'Los Angeles Dodgers': 93, 'San Diego Padres': 90, 'San Francisco Giants': 81, 'Arizona Diamondbacks': 80, 'Colorado Rockies': 43
}

summary['Actual_Wins'] = summary.index.map(actual_wins)
summary['Error'] = summary['Actual_Wins'] - summary['Avg_Wins']
mae = summary['Error'].abs().mean()         # mean absolute error

print(f"Mean Absolute Error: {mae:.2f} wins")
print("\nComparison (Sorted by Actual Wins):")
print(summary[['Avg_Wins', 'Actual_Wins', 'Error']].sort_values(by='Actual_Wins', ascending=False))

Mean Absolute Error: 3.84 wins

Comparison (Sorted by Actual Wins):
                       Avg_Wins  Actual_Wins     Error
home_team                                             
Milwaukee Brewers      90.38104           97   6.61896
Philadelphia Phillies  92.45867           96   3.54133
Toronto Blue Jays      92.94809           94   1.05191
New York Yankees       93.70315           94   0.29685
Los Angeles Dodgers    99.77245           93  -6.77245
Chicago Cubs           92.02355           92  -0.02355
Seattle Mariners       91.59496           90  -1.59496
San Diego Padres       87.69249           90   2.30751
Boston Red Sox         89.04742           89  -0.04742
Cleveland Guardians    82.13259           88   5.86741
Detroit Tigers         91.84497           87  -4.84497
Houston Astros         77.85614           87   9.14386
New York Mets          84.40998           83  -1.40998
Cincinnati Reds        84.17668           83  -1.17668
Kansas City Royals     81.59428           82   0.405