In [1]:
# Monte-Carlo playoff odds
# Generate my own playoff odds

# For now, I'm focusing on the mechanics of the simulation, and less so on the inputs (e.g., the projected team quality)
# So I'm using 538's win probabilities for each game, rather than computing my own

# I'm also using 538's results/schedule data, because it is so easy to use

import pandas as pd
import numpy as np

In [2]:
# Load FiveThirtyEight's MLB Elo file: one row per game of the current season,
# whether it has been played yet or not.
# (A local copy can be read instead: '../data/538/mlb-elo/mlb_elo_latest.csv')
ELO_LATEST_URL = 'https://projects.fivethirtyeight.com/mlb-api/mlb_elo_latest.csv'
gms = pd.read_csv(ELO_LATEST_URL)
gms

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,pitcher1_rgs,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,rating1_post,rating2_post,score1,score2
0,2022-10-05,2022,0,,LAD,COL,1591.773446,1468.916071,0.699597,0.300403,...,,,,,0.721651,0.278349,,,,
1,2022-10-05,2022,0,,SEA,DET,1520.313899,1463.528474,0.614209,0.385791,...,,,,,0.607621,0.392379,,,,
2,2022-10-05,2022,0,,SDP,SFG,1515.888286,1529.360457,0.515146,0.484854,...,,,,,0.568112,0.431888,,,,
3,2022-10-05,2022,0,,NYM,WSN,1532.721422,1438.634452,0.663686,0.336314,...,,,,,0.675042,0.324958,,,,
4,2022-10-05,2022,0,,MIL,ARI,1519.148722,1466.833969,0.608093,0.391907,...,,,,,0.629332,0.370668,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425,2022-04-07,2022,0,,ATL,CIN,1555.630840,1501.967218,0.609942,0.390058,...,58.198554,53.297336,18.664382,15.512738,0.620108,0.379892,1552.570297,1501.193092,3.0,6.0
2426,2022-04-07,2022,0,,WSN,NYM,1476.319846,1495.202033,0.507365,0.492635,...,46.506602,48.182760,-10.890192,-33.183129,0.495889,0.504111,1467.302390,1522.210391,1.0,5.0
2427,2022-04-07,2022,0,,STL,PIT,1524.880454,1456.114951,0.630416,0.369584,...,57.273136,46.669517,27.921385,2.182563,0.650312,0.349688,1503.439418,1444.031029,9.0,0.0
2428,2022-04-07,2022,0,,KCR,CLE,1480.923133,1501.256999,0.505276,0.494724,...,50.288294,59.572636,7.862364,30.139987,0.476089,0.523911,1473.144618,1491.474766,3.0,1.0


In [3]:
gms.columns

Index(['date', 'season', 'neutral', 'playoff', 'team1', 'team2', 'elo1_pre',
       'elo2_pre', 'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post',
       'rating1_pre', 'rating2_pre', 'pitcher1', 'pitcher2', 'pitcher1_rgs',
       'pitcher2_rgs', 'pitcher1_adj', 'pitcher2_adj', 'rating_prob1',
       'rating_prob2', 'rating1_post', 'rating2_post', 'score1', 'score2'],
      dtype='object')

In [4]:
# Partition the schedule on whether a score has been recorded for the game yet
played = gms[gms['score1'].notna()]  # completed games
remain = gms[gms['score1'].isna()]   # games still to be simulated
played.shape, remain.shape

((1299, 26), (1131, 26))

# Define some functions that will be used in the simulation

In [5]:
def compute_standings(gms_played):
    """Build a win/loss table from games that already have a score.

    Parameters
    ----------
    gms_played : DataFrame
        One row per completed game, with columns 'team1', 'team2',
        'score1' and 'score2'.

    Returns
    -------
    DataFrame
        Indexed by team, with integer columns 'W' and 'L'. A team that has
        only won (or only lost) gets 0 in the other column rather than NaN,
        matching what sim_rem_games returns.
    """
    margins = gms_played['score1'] - gms_played['score2']

    # Decide each game once and derive the loser as "the other team", so a single
    # game can never credit the same team with both the win and the loss.
    # (A level score is credited to team2, as the win always was.)
    team1_won = (margins > 0).to_numpy()
    winners = pd.Series(np.where(team1_won, gms_played['team1'], gms_played['team2']))
    losers = pd.Series(np.where(team1_won, gms_played['team2'], gms_played['team1']))

    standings = pd.concat(
        [winners.value_counts().rename('W'), losers.value_counts().rename('L')],
        axis=1,
    )
    # Teams absent from one of the two counts come out of the concat as NaN
    return standings.fillna(0).astype(int)

compute_standings(played)

Unnamed: 0,W,L
NYY,61,25
LAD,56,29
HOU,56,29
NYM,54,33
ATL,52,36
SDP,50,38
MIN,48,40
MIL,48,39
STL,47,42
BOS,47,40


In [6]:
#  Create a data frame with the league/division mappings, to use to determine playoff berths
# Team -> [league, division] lookup, keyed by the team codes used in the games file.
# 'lg' is 'A' or 'N'; 'div' is the league letter followed by E/C/W.
_TEAM_LG_DIV = {
    'SFG': ['N', 'NW'],
    'LAD': ['N', 'NW'],
    'TBD': ['A', 'AE'],
    'MIL': ['N', 'NC'],
    'HOU': ['A', 'AW'],
    'CHW': ['A', 'AC'],
    'BOS': ['A', 'AE'],
    'NYY': ['A', 'AE'],
    'TOR': ['A', 'AE'],
    'OAK': ['A', 'AW'],
    'SEA': ['A', 'AW'],
    'SDP': ['N', 'NW'],
    'ATL': ['N', 'NE'],
    'CIN': ['N', 'NC'],
    'PHI': ['N', 'NE'],
    'STL': ['N', 'NC'],
    'NYM': ['N', 'NE'],
    'ANA': ['A', 'AW'],
    'CLE': ['A', 'AC'],
    'DET': ['A', 'AC'],
    'CHC': ['N', 'NC'],
    'COL': ['N', 'NW'],
    'KCR': ['A', 'AC'],
    'MIN': ['A', 'AC'],
    'FLA': ['N', 'NE'],
    'WSN': ['N', 'NE'],
    'TEX': ['A', 'AW'],
    'PIT': ['N', 'NC'],
    'BAL': ['A', 'AE'],
    'ARI': ['N', 'NW'],
}

# One row per team, built directly in its final orientation
divisions = pd.DataFrame.from_dict(_TEAM_LG_DIV, orient='index', columns=['lg', 'div'])
divisions

Unnamed: 0,lg,div
SFG,N,NW
LAD,N,NW
TBD,A,AE
MIL,N,NC
HOU,A,AW
CHW,A,AC
BOS,A,AE
NYY,A,AE
TOR,A,AE
OAK,A,AW


In [7]:

def sim_rem_games(remain):
    """Simulate every unplayed game once and tally the resulting wins and losses.

    Parameters
    ----------
    remain : DataFrame
        One row per unplayed game, with columns 'team1', 'team2' and
        'rating_prob1' (the probability that team1 wins).

    Returns
    -------
    DataFrame
        Indexed by team, with integer columns 'W' and 'L' covering only the
        simulated games.

    Notes
    -----
    Draws from numpy's global random state, one uniform number per game.
    """
    # One uniform draw per game; team1 wins when the draw falls below its win probability
    randoms = np.random.rand(len(remain))
    team1_wins = randoms < remain['rating_prob1'].to_numpy()

    # Use the same mask for winner and loser. Two separate comparisons (< and >)
    # would hand team2 both the win and the loss whenever neither is true,
    # e.g. a missing probability. A missing probability still resolves to a
    # team2 win, but team1 now takes the loss.
    team1 = remain['team1'].to_numpy()
    team2 = remain['team2'].to_numpy()
    winners = pd.Series(np.where(team1_wins, team1, team2))
    losers = pd.Series(np.where(team1_wins, team2, team1))

    # Compute and return the standings; a team missing from one count is NaN until filled
    standings = pd.concat(
        [winners.value_counts().rename('W'), losers.value_counts().rename('L')],
        axis=1,
    )
    return standings.fillna(0).astype(int)

sim_rem_games(remain)

Unnamed: 0,W,L
NYY,54,22
LAD,47,30
HOU,46,31
MIL,46,29
ATL,44,30
CLE,43,34
NYM,43,32
SFG,42,35
BOS,42,33
CHW,40,36


In [8]:
# Project one full season: the record so far plus one simulated run of the remaining schedule
cur_standings = compute_standings(played)
rem_standings = sim_rem_games(remain)
# .add(..., fill_value=0) instead of `+`: a team that is missing from one of the two
# frames (for example, no games left to play) keeps its record instead of turning into NaN.
full_standings = cur_standings.add(rem_standings, fill_value=0)
full_standings

Unnamed: 0,W,L
ANA,69,93
ARI,72,90
ATL,87,75
BAL,77,85
BOS,78,84
CHC,59,103
CHW,86,76
CIN,65,97
CLE,88,74
COL,67,95


In [9]:
# find playoff teams
def add_playoff_seeds(standings, team_divisions=None):
    """Return a seeded copy of a W/L table: division winners first, then by record.

    Parameters
    ----------
    standings : DataFrame
        Indexed by team, with columns 'W' and 'L'. It is not modified.
    team_divisions : DataFrame, optional
        Indexed by team, with columns 'lg' and 'div'. Defaults to the
        notebook-level ``divisions`` table.

    Returns
    -------
    DataFrame
        A new frame with the added columns 'wpct', 'div', 'lg', 'rand',
        'div_win' and 'lg_rank', sorted by league and then by league rank.
        Within a league, the division winners take the top ranks and every
        other team follows in order of winning percentage.
    """
    if team_divisions is None:
        team_divisions = divisions

    # Work on a copy so that the caller's frame is not silently given extra columns
    seeded = standings.copy()
    seeded['wpct'] = seeded['W'] / (seeded['W'] + seeded['L'])

    # Merge in the div/lg data (aligned on the team index)
    seeded['div'] = team_divisions['div']
    seeded['lg'] = team_divisions['lg']

    # Rather than model every tie-breaker, treat them all as coin flips (not exactly
    # true, but close enough): each team gets one random number, and tied teams are
    # ordered by it. This is far simpler and faster than modeling the real rules;
    # modeling them may be worth it only in the last day or two of the season.
    seeded['rand'] = np.random.rand(len(seeded))

    # Best record first, ties broken by the random number
    by_record = seeded.sort_values(by=['wpct', 'rand'], ascending=False)

    # A full per-division rank is nice to have but comparatively expensive, so only
    # the top team of each division is flagged.
    seeded['div_win'] = False
    seeded.loc[by_record.groupby('div').head(1).index, 'div_win'] = True

    # Division winners sort ahead of everyone else, then record, then the tie-breaker
    seed_order = seeded.sort_values(by=['div_win', 'wpct', 'rand'], ascending=False)
    seeded['lg_rank'] = seed_order.groupby('lg').cumcount() + 1
    return seeded.sort_values(['lg', 'lg_rank'])

     

add_playoff_seeds(full_standings)

Unnamed: 0,W,L,wpct,div,lg,rand,div_win,lg_rank
NYY,109,53,0.67284,AE,A,0.059357,True,1
HOU,101,61,0.623457,AW,A,0.931768,True,2
MIN,88,74,0.54321,AC,A,0.898541,True,3
CLE,88,74,0.54321,AC,A,0.409675,False,4
CHW,86,76,0.530864,AC,A,0.689347,False,5
TBD,83,79,0.512346,AE,A,0.794834,False,6
TOR,80,82,0.493827,AE,A,0.511166,False,7
SEA,79,83,0.487654,AW,A,0.134036,False,8
BOS,78,84,0.481481,AE,A,0.534116,False,9
BAL,77,85,0.475309,AE,A,0.332646,False,10


In [10]:
%%prun -s cumulative # This runs the code profiler, which creates data I can use to find opportunities for me to speed up the code

[add_playoff_seeds(full_standings) for _ in range(1000)]
None # This is to suppress printing the output, which is 1000 lines of the same list of teams

 

         11604933 function calls (11473933 primitive calls) in 5.807 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    5.807    5.807 {built-in method builtins.exec}
        1    0.003    0.003    5.807    5.807 <string>:3(<module>)
        1    0.004    0.004    5.804    5.804 <string>:3(<listcomp>)
     1000    0.028    0.000    5.801    0.006 <ipython-input-9-324393e1c65c>:2(add_playoff_seeds)
     3000    0.005    0.000    2.503    0.001 _decorators.py:302(wrapper)
     3000    0.026    0.000    2.496    0.001 frame.py:6269(sort_values)
     3000    0.096    0.000    1.760    0.001 sorting.py:285(lexsort_indexer)
     7000    0.050    0.000    1.279    0.000 categorical.py:365(__init__)
    16000    0.044    0.000    1.272    0.000 frame.py:3463(__getitem__)
     1000    0.006    0.000    1.169    0.001 groupby.py:3040(cumcount)
51000/35000    0.029    0.000    0.921    0.000 groupby.py:90

In [11]:
def finish_one_season(incoming_standings, remain):
    """Simulate the rest of the season once and return the seeded final standings.

    Parameters
    ----------
    incoming_standings : DataFrame
        Current W/L table, indexed by team.
    remain : DataFrame
        The unplayed games, in the form expected by sim_rem_games.

    Returns
    -------
    DataFrame
        Whatever add_playoff_seeds returns for the combined W/L table.
    """
    rem_standings = sim_rem_games(remain)
    # .add(..., fill_value=0) instead of `+`: a team that appears in only one of the
    # two frames (e.g. it has no games left) keeps its record rather than becoming NaN.
    full_standings = incoming_standings.add(rem_standings, fill_value=0)
    return add_playoff_seeds(full_standings)

finish_one_season(cur_standings, remain)

Unnamed: 0,W,L,wpct,div,lg,rand,div_win,lg_rank
HOU,109,53,0.67284,AW,A,0.693724,True,1
NYY,106,56,0.654321,AE,A,0.001889,True,2
MIN,89,73,0.549383,AC,A,0.281381,True,3
BOS,96,66,0.592593,AE,A,0.793571,False,4
ANA,85,77,0.524691,AW,A,0.043498,False,5
SEA,83,79,0.512346,AW,A,0.757235,False,6
TBD,82,80,0.506173,AE,A,0.381153,False,7
TOR,80,82,0.493827,AE,A,0.980081,False,8
CLE,80,82,0.493827,AC,A,0.896961,False,9
CHW,75,87,0.462963,AC,A,0.613378,False,10


In [12]:

def sim_1_season(incoming_standings, remain, i):
    """Run one simulated season and label its rows with the iteration number.

    The result of finish_one_season gets an 'iter' column set to ``i``; the team
    labels are moved out of the index into a 'team' column, and the frame is
    re-indexed by (team, iter).
    """
    return (
        finish_one_season(incoming_standings, remain)
        .assign(iter=i)
        .reset_index()
        .rename(columns={'index': 'team'})
        .set_index(['team', 'iter'])
    )

def sim_n_seasons(incoming_standings, remain, n):
    """Run ``n`` simulated seasons (iterations 0..n-1) and stack them into one frame."""
    seasons = []
    for season_no in range(n):
        seasons.append(sim_1_season(incoming_standings, remain, season_no))
    return pd.concat(seasons)

sim_results = sim_n_seasons(cur_standings, remain, 10)
sim_results

Unnamed: 0_level_0,Unnamed: 1_level_0,W,L,wpct,div,lg,rand,div_win,lg_rank
team,iter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NYY,0,105,57,0.648148,AE,A,0.929615,True,1
HOU,0,102,60,0.629630,AW,A,0.676555,True,2
CLE,0,82,80,0.506173,AC,A,0.848679,True,3
TBD,0,94,68,0.580247,AE,A,0.439670,False,4
CHW,0,82,80,0.506173,AC,A,0.222803,False,5
...,...,...,...,...,...,...,...,...,...
PIT,9,73,89,0.450617,NC,N,0.539909,False,11
COL,9,72,90,0.444444,NW,N,0.514985,False,12
CHC,9,68,94,0.419753,NC,N,0.905395,False,13
CIN,9,63,99,0.388889,NC,N,0.792847,False,14


In [36]:
# Count the number of div/wc/playoff appearances by team from a set of results

# Championship weights by seed position: seeds 1 and 2 get 1/8 each, seeds 3-6 get 1/16 each
weights = {i: 1/16 for i in range(1,7)}
weights[1] = 1/8
weights[2] = 1/8

def summarize_sim_results(df_results):
    """Tally, per team, how often each playoff seed was reached across simulations.

    Parameters
    ----------
    df_results : DataFrame
        Stacked simulation output indexed by (team, iter), with at least the
        columns 'W' and 'lg_rank'.

    Returns
    -------
    DataFrame
        Indexed by team, with columns:
        'mean_wins'     average W over all simulated seasons,
        1 .. 6          number of seasons finishing at that league seed,
        'div_wins'      seasons at seeds 1-3,
        'playoffs'      seasons at seeds 1-6,
        'champ_shares'  seed counts weighted by ``weights`` (a weighted count,
                        not divided by the number of simulations).
    """
    seeds = sorted(weights)
    div_winner_seeds = [1, 2, 3]  # seeds 1-3 are the ones summed into 'div_wins'

    # Average wins come from the frame that was passed in, not from a notebook global
    mean_wins = df_results.groupby('team')['W'].mean().rename('mean_wins')

    # Seasons per (team, seed). Reindex on both axes so that seeds nobody reached and
    # teams that never made the playoffs show up as explicit zeros.
    in_playoffs = df_results[df_results['lg_rank'].isin(seeds)].reset_index()
    counts = (
        in_playoffs.groupby(['team', 'lg_rank']).size()
        .unstack(fill_value=0)
        .reindex(index=mean_wins.index, columns=seeds, fill_value=0)
        .astype(int)
    )
    counts.columns.name = None

    summary = pd.concat([mean_wins, counts], axis=1)
    summary['div_wins'] = counts[div_winner_seeds].sum(axis=1)
    summary['playoffs'] = counts.sum(axis=1)
    # Align the weights to the seed columns by label instead of relying on implicit coercion
    summary['champ_shares'] = counts.mul(pd.Series(weights), axis=1).sum(axis=1)
    return summary

summarize_sim_results(sim_results)

Unnamed: 0_level_0,mean_wins,1,2,3,4,5,6,div_wins,playoffs,champ_shares
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ANA,74.79373,0,0,0,121,464,1112,0,1697,106.0625
ARI,70.70098,0,0,0,0,5,45,0,50,3.125
ATL,94.58471,3956,34279,2279,31464,18247,7069,40514,97294,8470.5625
BAL,74.80012,0,0,0,226,536,1193,0,1955,122.1875
BOS,86.74174,10,49,2,32755,22777,16671,61,72264,4520.1875
CHC,68.95488,0,0,12,0,2,19,12,33,2.0625
CHW,82.84642,2,66,23107,1637,5329,9677,23175,39818,2492.875
CIN,66.73769,0,0,6,0,0,1,6,7,0.4375
CLE,81.10175,0,26,13018,1211,3963,7739,13044,25957,1623.9375
COL,68.2709,0,0,0,0,0,9,0,9,0.5625


In [18]:
#%%prun -s cumulative # This runs the code profiler, which creates data I can use to find opportunities for me to speed up the code

# Full-size run. Seeding numpy's global random state makes a Restart & Run All
# reproduce the same table; change SEED to get an independent set of seasons.
SEED = 538
N_SEASONS = 100_000
np.random.seed(SEED)
sim_results = sim_n_seasons(cur_standings, remain, N_SEASONS)
summarize_sim_results(sim_results)

Unnamed: 0_level_0,mean_wins,1,2,3,4,5,6,div_wins,playoffs
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ANA,74.79373,0,0,0,121,464,1112,0,1697
ARI,70.70098,0,0,0,0,5,45,0,50
ATL,94.58471,3956,34279,2279,31464,18247,7069,40514,97294
BAL,74.80012,0,0,0,226,536,1193,0,1955
BOS,86.74174,10,49,2,32755,22777,16671,61,72264
CHC,68.95488,0,0,12,0,2,19,12,33
CHW,82.84642,2,66,23107,1637,5329,9677,23175,39818
CIN,66.73769,0,0,6,0,0,1,6,7
CLE,81.10175,0,26,13018,1211,3963,7739,13044,25957
COL,68.2709,0,0,0,0,0,9,0,9


In [37]:
# Recompute the summary and print it as plain text via to_string()
# instead of relying on the notebook's rendered table.
summary = summarize_sim_results(sim_results)
print(summary.to_string())

      mean_wins      1      2      3      4      5      6  div_wins  playoffs  champ_shares
team                                                                                       
ANA    74.79373      0      0      0    121    464   1112         0      1697      106.0625
ARI    70.70098      0      0      0      0      5     45         0        50        3.1250
ATL    94.58471   3956  34279   2279  31464  18247   7069     40514     97294     8470.5625
BAL    74.80012      0      0      0    226    536   1193         0      1955      122.1875
BOS    86.74174     10     49      2  32755  22777  16671        61     72264     4520.1875
CHC    68.95488      0      0     12      0      2     19        12        33        2.0625
CHW    82.84642      2     66  23107   1637   5329   9677     23175     39818     2492.8750
CIN    66.73769      0      0      6      0      0      1         6         7        0.4375
CLE    81.10175      0     26  13018   1211   3963   7739     13044     25957   

In [20]:
# Average win total for each team at each playoff seed (team rows x seed columns);
# NaN means the team never finished at that seed in any simulated season.
(
    sim_results[sim_results['lg_rank'] <= 6]
    .groupby(['team', 'lg_rank'])['W']
    .mean()
    .unstack()
)

lg_rank,1,2,3,4,5,6
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ANA,,,,87.049587,85.551724,83.899281
ARI,,,,,86.6,83.644444
ATL,101.533873,97.603752,93.936376,94.301901,91.493232,88.836752
BAL,,,,87.362832,85.152985,83.788768
BOS,99.2,97.183673,94.0,90.784064,87.579356,85.491332
CHC,,,81.666667,,86.0,83.578947
CHW,102.0,93.742424,87.41468,87.880269,86.281854,84.76904
CIN,,,80.5,,,83.0
CLE,,92.846154,86.943232,87.689513,86.045168,84.534436
COL,,,,,,83.0


In [26]:
# Average win total per team, counting only the simulated seasons in which it won its division
division_titles = sim_results[sim_results['div_win']]
division_titles.groupby('team')['W'].mean()

team
ATL     97.781211
BOS     97.409836
CHC     81.666667
CHW     87.433959
CIN     80.500000
CLE     86.954998
DET     82.222222
FLA     91.705882
HOU    102.418110
KCR     80.333333
LAD    104.625526
MIL     90.486418
MIN     88.825148
NYM     98.322561
NYY    107.531898
PHI     95.158846
PIT     81.250000
SDP     98.848875
SEA     94.050761
SFG     95.709677
STL     89.337450
TBD     98.090909
TEX     89.250000
TOR     95.961538
Name: W, dtype: float64

In [22]:
# How often do teams win the division when they win at least 95 games?
# The aggregations are named by string ('size', 'sum') rather than passing the Python
# builtins len/sum, which newer pandas versions warn about when used in agg().
finishes = sim_results.query('W>=95').groupby('team').agg(
    num_seasons=('div_win', 'size'),  # simulated seasons with 95+ wins
    div_wins=('div_win', 'sum'),      # ...of which the team won its division
)
finishes['pct_win'] = finishes['div_wins']/finishes['num_seasons']
finishes


Unnamed: 0_level_0,num_seasons,div_wins,pct_win
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,51113,34644,0.677792
BOS,3361,51,0.015174
CHW,272,272,1.0
CLE,92,92,1.0
FLA,1,0,0.0
HOU,96680,96633,0.999514
LAD,99041,97621,0.985663
MIL,8898,8851,0.994718
MIN,3415,3415,1.0
NYM,63420,49669,0.783176


In [23]:
# Conditional view of the 'NW' division: keep only the simulated seasons in which
# SDP wins 95 or more games, then average each NW team's wins over just those seasons.
pads95 = sim_results.query('team=="SDP" and W>=95').reset_index()['iter']  # iteration numbers of those seasons
sim_results.query('iter in @pads95 and div=="NW"').groupby('team')['W'].mean()

team
ARI     69.939634
COL     67.459166
LAD    103.479976
SDP     96.863869
SFG     81.632294
Name: W, dtype: float64