In [1]:
# park factor stability

# Think about triples: how much noise is there in a park factor, and how does that change as we add more years of data?

In [2]:
# Start with the baseline triples rate (per PA)
# Assume a real park effect (e.g., 1.1); for now, use a neutral PF of 1
# Compute expected triples rate at home and on road
# Run many iterations of:
    # Simulate a season's worth of plate appearances at home and on the road
    # Compute observed PF
# Compute metrics on observations

In [3]:
import pandas as pd
from io import StringIO
import numpy as np

In [4]:
# Baseline data: season-level batting totals, one row per Year (2016-2021).
# NOTE(review): Tms is 30 on every row, so these look like league-wide totals - confirm the source.
csv_data = """Year,Tms,#Bat,BatAge,R/G,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,TB,GDP,HBP,SH,SF,IBB
2021,30,1272,28.5,4.50,3704,138327,123103,16681,29839,5929,494,4479,15882,1677,545,12109,32426,.242,.316,.408,.724,50193,2492,1629,576,859,508
2020,30,618,28.0,4.65,1796,66506,59030,8344,14439,2823,241,2304,7978,885,292,6092,15586,.245,.322,.418,.740,24656,1237,821,126,402,202
2019,30,1287,27.9,4.83,4858,186517,166651,23467,42039,8531,785,6776,22471,2280,832,15895,42823,.252,.323,.435,.758,72468,3463,1984,776,1150,753
2018,30,1271,28.1,4.45,4862,185139,165432,21630,41018,8264,847,5585,20606,2474,958,15686,41207,.248,.318,.409,.728,67731,3457,1922,823,1235,929
2017,30,1229,28.3,4.65,4860,185295,165567,22582,42215,8397,795,6105,21558,2527,934,15829,40104,.255,.324,.426,.750,70517,3804,1763,925,1168,970
2016,30,1247,28.4,4.48,4856,184580,165561,21744,42276,8254,873,5610,20745,2537,1001,15088,38982,.255,.322,.417,.739,69106,3719,1651,1025,1214,932"""

# Parse the embedded CSV text into a DataFrame.
df = pd.read_csv(StringIO(csv_data))
# Column-wise totals over all listed seasons. This also adds up Year and the rate
# columns (BA, OBP, ...), whose sums are not meaningful; the cells below only read
# the PA, G, H, 3B and HR entries.
sums = df.sum()


In [5]:
# Per-plate-appearance rate of each event, pooled over every season in the table.
stats = ['H', '3B', 'HR']
rates = {stat: sums[stat] / sums['PA'] for stat in stats}
rates

{'H': 0.2238314221589156,
 '3B': 0.004263687122502547,
 'HR': 0.03260796057331006}

In [6]:
# Number of plate appearances to simulate for each of the home and road samples:
# the pooled PA-per-G average scaled by 81 * 2.
# NOTE(review): presumably 81 home games x 2 teams batting per game, with G
# counting team-games - confirm against the data source.
pa_per_game = sums['PA'] / sums['G']
PA_season = pa_per_game * 81 * 2
PA_season

6148.178055822907

In [7]:
# Total PA divided by total G across all listed seasons.
sums.loc['PA'] / sums.loc['G']

37.95171639396856

In [8]:
def generate_series_values(n, p, iterations):
    """Draw ``iterations`` independent Binomial(n, p) counts.

    Parameters
    ----------
    n : int or float
        Number of trials per draw. Passed straight to ``np.random.binomial``;
        NumPy's documentation states that float values are truncated to
        integers.
    p : float
        Success probability of a single trial.
    iterations : int
        Number of draws to generate.

    Returns
    -------
    pd.Series
        The ``iterations`` simulated counts, with a default integer index.
    """
    # A single vectorised call replaces the original Python-level loop of
    # scalar draws. It still uses NumPy's global random state, so
    # np.random.seed() continues to control reproducibility.
    return pd.Series(np.random.binomial(n, p, size=iterations))

In [9]:
def generate_pfs(n, p, iterations):
    """Simulate observed park factors as the ratio of two binomial counts.

    Two independent sets of ``iterations`` counts are drawn with
    ``generate_series_values(n, p, iterations)``: the first is used as the
    home sample and the second as the road sample. Both use the same ``n``
    and ``p``.

    Parameters
    ----------
    n : int or float
        Number of trials in each of the home and road samples.
    p : float
        Per-trial event probability, shared by both samples.
    iterations : int
        Number of simulated park factors to produce.

    Returns
    -------
    pd.Series
        Home count divided by road count for each iteration. Iterations whose
        road count is zero have no defined ratio and are returned as NaN.
    """
    stat_h = generate_series_values(n, p, iterations)
    stat_a = generate_series_values(n, p, iterations)
    pfs = stat_h / stat_a
    # A zero road count makes the raw division yield inf (or NaN for 0/0),
    # which would turn .mean()/.std() of the result into inf/NaN. Masking
    # those entries to NaN lets pandas' summary statistics skip them.
    return pfs.where(stat_a > 0)

# Quick look: 100 simulated park factors for triples with PA_season trials per sample.
generate_pfs(n=PA_season, p=rates['3B'], iterations=100)

0     1.291667
1     1.000000
2     1.038462
3     1.172414
4     0.750000
        ...   
95    1.687500
96    1.437500
97    1.032258
98    0.677419
99    0.787879
Length: 100, dtype: float64

In [10]:
# Distribution summary over 100,000 simulated triples park factors.
generate_pfs(n=PA_season, p=rates['3B'], iterations=100_000).describe()

count    100000.000000
mean          1.041498
std           0.303268
min           0.285714
25%           0.827586
50%           1.000000
75%           1.208333
max           4.500000
dtype: float64

In [11]:
# Same simulation with twice as many trials per sample; report only the spread.
generate_pfs(n=PA_season * 2, p=rates['3B'], iterations=100_000).std()

0.20453225545531165

In [12]:
def run_sim(sample_size_in_seasons, iterations=100*1000):
    """Standard deviation of the simulated park factor for every tracked stat.

    Parameters
    ----------
    sample_size_in_seasons : int or float
        Multiplier applied to the module-level ``PA_season`` to get the number
        of trials in each of the home and road samples.
    iterations : int, optional
        Number of simulated park factors per stat. Defaults to 100,000, the
        value that was previously hard-coded.

    Returns
    -------
    dict
        Maps each name in the module-level ``stats`` list to the ``.std()`` of
        ``generate_pfs(PA_season * sample_size_in_seasons, rates[stat], iterations)``.
    """
    # The trial count is the same for every stat, so compute it once.
    n_trials = PA_season * sample_size_in_seasons
    return {
        stat: generate_pfs(n_trials, rates[stat], iterations).std()
        for stat in stats
    }

# Spread of the simulated park factor for each stat at a sample-size multiplier of 1.
run_sim(sample_size_in_seasons=1)

{'H': 0.033671196728000896,
 '3B': 0.30384037005548864,
 'HR': 0.09922875031064192}

In [13]:
# Per-stat std dev of the simulated park factor for sample-size multipliers
# 1 through 19, keyed by the multiplier.
output = {n_seasons: run_sim(n_seasons) for n_seasons in range(1, 20)}

In [14]:
# Display the nested results: {sample-size multiplier: {stat: std dev}}.
output

{1: {'H': 0.033661959045663596,
  '3B': 0.304329304138098,
  'HR': 0.09943737106323497},
 2: {'H': 0.02382913516102559,
  '3B': 0.20427932807572144,
  'HR': 0.06979612172086307},
 3: {'H': 0.019356327188023503,
  '3B': 0.16377789813951144,
  'HR': 0.057056785635989274},
 4: {'H': 0.01676069289892459,
  '3B': 0.14088045941723906,
  'HR': 0.04905840368834532},
 5: {'H': 0.01503912106804281,
  '3B': 0.12586701705366057,
  'HR': 0.04403451244531306},
 6: {'H': 0.013693882574248176,
  '3B': 0.11398869495093711,
  'HR': 0.0401778693322492},
 7: {'H': 0.012655731735301585,
  '3B': 0.10557164316349041,
  'HR': 0.03724494657798437},
 8: {'H': 0.011895701116349022,
  '3B': 0.09897247774109762,
  'HR': 0.034800522773102915},
 9: {'H': 0.011184506085412724,
  '3B': 0.09290381373348958,
  'HR': 0.03295883066599289},
 10: {'H': 0.010621747038378124,
  '3B': 0.08790973881034649,
  'HR': 0.031119065463830135},
 11: {'H': 0.010137588485547583,
  '3B': 0.08389923734938129,
  'HR': 0.029665599357493005},

In [15]:
# Tabulate the nested results: one row per sample-size multiplier, one column per stat.
std_devs = pd.DataFrame.from_dict(output, orient='index')
std_devs

Unnamed: 0,H,3B,HR
1,0.033662,0.304329,0.099437
2,0.023829,0.204279,0.069796
3,0.019356,0.163778,0.057057
4,0.016761,0.14088,0.049058
5,0.015039,0.125867,0.044035
6,0.013694,0.113989,0.040178
7,0.012656,0.105572,0.037245
8,0.011896,0.098972,0.034801
9,0.011185,0.092904,0.032959
10,0.010622,0.08791,0.031119


In [16]:
# Emit the table, rounded to three decimals, as Markdown text for pasting elsewhere.
print(std_devs.round(3).to_markdown())

|    |     H |    3B |    HR |
|---:|------:|------:|------:|
|  1 | 0.034 | 0.304 | 0.099 |
|  2 | 0.024 | 0.204 | 0.07  |
|  3 | 0.019 | 0.164 | 0.057 |
|  4 | 0.017 | 0.141 | 0.049 |
|  5 | 0.015 | 0.126 | 0.044 |
|  6 | 0.014 | 0.114 | 0.04  |
|  7 | 0.013 | 0.106 | 0.037 |
|  8 | 0.012 | 0.099 | 0.035 |
|  9 | 0.011 | 0.093 | 0.033 |
| 10 | 0.011 | 0.088 | 0.031 |
| 11 | 0.01  | 0.084 | 0.03  |
| 12 | 0.01  | 0.08  | 0.028 |
| 13 | 0.009 | 0.077 | 0.027 |
| 14 | 0.009 | 0.074 | 0.026 |
| 15 | 0.009 | 0.071 | 0.025 |
| 16 | 0.008 | 0.069 | 0.025 |
| 17 | 0.008 | 0.067 | 0.024 |
| 18 | 0.008 | 0.065 | 0.023 |
| 19 | 0.008 | 0.063 | 0.023 |
