In [1]:
# park factor stability

# think about triples.  how much noise in a park factor?  how does this change as we add more years

In [2]:
# Start with the baseline triples rate (per pa)
# Assume a real park effect (e.g., 1.1) # for now do a neutral PF of 1
# Compute expected triples rate at home and on road
# Run many iterations of:
    # Simulate a season's worth of at bats at home and on road
    # Compute observed PF
# Compute metrics on observations

In [3]:
import pandas as pd
from io import StringIO
import numpy as np
import pyretro.boxball_loader as bbl

In [4]:
# Start with the baseline triples rate (per pa)
csv_data = """Year,Tms,#Bat,BatAge,R/G,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,TB,GDP,HBP,SH,SF,IBB
2021,30,1272,28.5,4.50,3704,138327,123103,16681,29839,5929,494,4479,15882,1677,545,12109,32426,.242,.316,.408,.724,50193,2492,1629,576,859,508
2020,30,618,28.0,4.65,1796,66506,59030,8344,14439,2823,241,2304,7978,885,292,6092,15586,.245,.322,.418,.740,24656,1237,821,126,402,202
2019,30,1287,27.9,4.83,4858,186517,166651,23467,42039,8531,785,6776,22471,2280,832,15895,42823,.252,.323,.435,.758,72468,3463,1984,776,1150,753
2018,30,1271,28.1,4.45,4862,185139,165432,21630,41018,8264,847,5585,20606,2474,958,15686,41207,.248,.318,.409,.728,67731,3457,1922,823,1235,929
2017,30,1229,28.3,4.65,4860,185295,165567,22582,42215,8397,795,6105,21558,2527,934,15829,40104,.255,.324,.426,.750,70517,3804,1763,925,1168,970
2016,30,1247,28.4,4.48,4856,184580,165561,21744,42276,8254,873,5610,20745,2537,1001,15088,38982,.255,.322,.417,.739,69106,3719,1651,1025,1214,932"""

df = pd.read_csv(StringIO(csv_data))
sums = df.sum()
sums['1B'] = sums['H'] - sums['2B'] - sums['3B'] - sums['HR']
sums

Year       12111.000
Tms          180.000
#Bat        6924.000
BatAge       169.200
R/G           27.560
G          24936.000
PA        946364.000
AB        845344.000
R         114448.000
H         211826.000
2B         42198.000
3B          4035.000
HR         30859.000
RBI       109240.000
SB         12380.000
CS          4562.000
BB         80699.000
SO        211128.000
BA             1.497
OBP            1.925
SLG            2.513
OPS            4.439
TB        354671.000
GDP        18172.000
HBP         9770.000
SH          4251.000
SF          6028.000
IBB         4294.000
1B        134734.000
dtype: float64

In [5]:
rates = {}
stats = ['1B', '2B', '3B', 'HR', 'BB']
for stat in stats:
    rates[stat] = sums[stat]/sums['PA']
rates

{'1B': 0.14237016623624738,
 '2B': 0.04458960822685563,
 '3B': 0.004263687122502547,
 'HR': 0.03260796057331006,
 'BB': 0.08527268577418413}

In [6]:
# Simulate a season's worth of at bats at home and on road
PA_season = sums['PA']/sums['G'] * 81 * 2
PA_season_HA = PA_season/2
PA_season_HA

3074.0890279114533

In [7]:
sums['PA']/sums['G']

37.95171639396856

In [8]:
def generate_series_values(n, p, iterations):
    return pd.Series([np.random.binomial(n, p) for _ in range(iterations)])

In [9]:
def generate_pfs(n, p, iterations):
    stat_h = generate_series_values(n, p, iterations)
    stat_a = generate_series_values(n, p, iterations)
    pfs = stat_h/stat_a
    return pfs

generate_pfs(PA_season_HA, rates['3B'], 100)

0     0.900000
1     0.235294
2     0.736842
3     0.666667
4     0.750000
        ...   
95    1.294118
96    1.875000
97    1.166667
98    0.666667
99    0.789474
Length: 100, dtype: float64

In [10]:
generate_pfs(PA_season_HA, rates['3B'], 100*1000).describe()

count    1.000000e+05
mean              inf
std               NaN
min      5.882353e-02
25%      7.647059e-01
50%      1.000000e+00
75%      1.307692e+00
max               inf
dtype: float64

In [11]:
def run_sim(sample_size_in_seasons):
    pfs = {}
    for stat in stats:
        pfs[stat] = generate_pfs(PA_season_HA * sample_size_in_seasons, rates[stat], 100*1000).std()
    return pfs

run_sim(1)

{'1B': 0.06285714422046633,
 '2B': 0.1200253549781364,
 '3B': 0.4866705331014393,
 'HR': 0.1421361972236783,
 'BB': 0.08422662311943024}

In [12]:
output = {i: run_sim(i) for i in range (1, 20)}
#pd.concat(output, axis=1)

In [13]:
output

{1: {'1B': 0.06278510336701283,
  '2B': 0.11974378114988082,
  '3B': 0.5000431324362724,
  'HR': 0.14265361616393823,
  'BB': 0.0841834335360671},
 2: {'1B': 0.04454993254233398,
  '2B': 0.08439107184743327,
  '3B': 0.3043308016803164,
  'HR': 0.09923196107695466,
  'BB': 0.05929586616902091},
 3: {'1B': 0.03645900341318967,
  '2B': 0.0684686298258722,
  '3B': 0.2390229189453656,
  'HR': 0.08068834583321742,
  'BB': 0.04836717311569398},
 4: {'1B': 0.031352883301968705,
  '2B': 0.05919271712150259,
  '3B': 0.2049646532844548,
  'HR': 0.0695661183279779,
  'BB': 0.04194702020675885},
 5: {'1B': 0.027814874101859812,
  '2B': 0.052899654994668616,
  '3B': 0.18094616817225448,
  'HR': 0.06275196167612138,
  'BB': 0.037414831416711675},
 6: {'1B': 0.025484775353102996,
  '2B': 0.04844180238009275,
  '3B': 0.16403991001125456,
  'HR': 0.05683592453053782,
  'BB': 0.03437200820827584},
 7: {'1B': 0.023666980161177992,
  '2B': 0.044775084756066785,
  '3B': 0.1507166442925948,
  'HR': 0.0527550

In [14]:
std_devs = pd.DataFrame(output).T
std_devs

Unnamed: 0,1B,2B,3B,HR,BB
1,0.062785,0.119744,0.500043,0.142654,0.084183
2,0.04455,0.084391,0.304331,0.099232,0.059296
3,0.036459,0.068469,0.239023,0.080688,0.048367
4,0.031353,0.059193,0.204965,0.069566,0.041947
5,0.027815,0.0529,0.180946,0.062752,0.037415
6,0.025485,0.048442,0.16404,0.056836,0.034372
7,0.023667,0.044775,0.150717,0.052755,0.031632
8,0.022119,0.041912,0.140532,0.049101,0.029636
9,0.020915,0.039521,0.132376,0.046642,0.027895
10,0.019803,0.037441,0.126142,0.044123,0.026525


In [15]:
print(std_devs.apply(lambda x: round(x, 3)).to_markdown())

|    |    1B |    2B |    3B |    HR |    BB |
|---:|------:|------:|------:|------:|------:|
|  1 | 0.063 | 0.12  | 0.5   | 0.143 | 0.084 |
|  2 | 0.045 | 0.084 | 0.304 | 0.099 | 0.059 |
|  3 | 0.036 | 0.068 | 0.239 | 0.081 | 0.048 |
|  4 | 0.031 | 0.059 | 0.205 | 0.07  | 0.042 |
|  5 | 0.028 | 0.053 | 0.181 | 0.063 | 0.037 |
|  6 | 0.025 | 0.048 | 0.164 | 0.057 | 0.034 |
|  7 | 0.024 | 0.045 | 0.151 | 0.053 | 0.032 |
|  8 | 0.022 | 0.042 | 0.141 | 0.049 | 0.03  |
|  9 | 0.021 | 0.04  | 0.132 | 0.047 | 0.028 |
| 10 | 0.02  | 0.037 | 0.126 | 0.044 | 0.027 |
| 11 | 0.019 | 0.036 | 0.12  | 0.042 | 0.025 |
| 12 | 0.018 | 0.034 | 0.114 | 0.04  | 0.024 |
| 13 | 0.017 | 0.033 | 0.109 | 0.039 | 0.023 |
| 14 | 0.017 | 0.032 | 0.105 | 0.037 | 0.022 |
| 15 | 0.016 | 0.03  | 0.102 | 0.036 | 0.022 |
| 16 | 0.016 | 0.03  | 0.099 | 0.035 | 0.021 |
| 17 | 0.015 | 0.029 | 0.096 | 0.034 | 0.02  |
| 18 | 0.015 | 0.028 | 0.093 | 0.033 | 0.02  |
| 19 | 0.014 | 0.027 | 0.09  | 0.032 | 0.019 |


In [16]:
# How do these compare to observed component PFs?

glt = bbl.load_gamelog_teams(game_types=bbl.GameType.RS, seasons=bbl.Seasons(2010,2019))
glt['1b'] = glt['h'] - glt['d'] - glt['t'] - glt['hr']
glt.columns

Index(['game_id', 'date', 'double_header', 'yr', 'game_type', 'park_id',
       'team', 'team_league', 'team_game_number', 'runs_scored', 'line_score',
       'ab', 'h', 'd', 't', 'hr', 'rbi', 'sh', 'sf', 'hbp', 'bb', 'ibb', 'k',
       'sb', 'cs', 'gdp', 'ci', 'lob', 'pitchers', 'er', 'ter', 'wp', 'balks',
       'po', 'a', 'e', 'passed', 'db', 'tp', 'manager_id', 'manager_name',
       'starting_pitcher_id', 'starting_pitcher_name', 'batting_1_player_id',
       'batting_1_name', 'batting_1_position', 'batting_2_player_id',
       'batting_2_name', 'batting_2_position', 'batting_3_player_id',
       'batting_3_name', 'batting_3_position', 'batting_4_player_id',
       'batting_4_name', 'batting_4_position', 'batting_5_player_id',
       'batting_5_name', 'batting_5_position', 'batting_6_player_id',
       'batting_6_name', 'batting_6_position', 'batting_7_player_id',
       'batting_7_name', 'batting_7_position', 'batting_8_player_id',
       'batting_8_name', 'batting_8_position', '

In [17]:
stats_glt = ['1b', 'd', 't', 'hr', 'bb']


In [18]:
totals = glt.groupby(['HA', 'team', 'yr'])[stats_glt].sum().stack().unstack(level=['HA'])
pfs = totals['H']/totals['A']
pfs.unstack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,1b,d,t,hr,bb
team,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ANA,2010,0.997812,0.903448,0.461538,0.802326,0.991453
ANA,2011,1.000000,0.817610,0.700000,0.666667,0.982063
ANA,2012,0.876812,1.068182,1.444444,0.780952,0.753906
ANA,2013,1.076605,0.862069,0.950000,0.885057,0.958801
ANA,2014,0.979675,0.798817,0.550000,0.890244,0.975904
...,...,...,...,...,...,...
WAS,2015,0.969631,1.120000,0.300000,1.058140,1.081081
WAS,2016,0.929487,0.848276,0.933333,1.009901,1.038023
WAS,2017,1.254902,1.059603,0.631579,0.990741,1.029963
WAS,2018,1.036117,1.272000,0.785714,1.010526,1.055375
