# The Padres struck out 45 batters in three games, August 7-9, 2021.  Was this a team record?

In [1]:
import boxball_loader as bbl
import pandas as pd
import numpy as np

In [2]:
# Load the team-level game logs: every game type, years 1800 through 2999.
glt = bbl.load_gamelog_teams(
    years=range(1800, 3000),
    game_types=bbl.GameType.ALL,
)

In [3]:
# Just get the answer (but without the date or any other data).
# Rows are the ones whose opponent is SDN, sorted by date (double_header is the
# secondary key for same-day games) so a 3-row rolling window covers three
# consecutive games.
# The aggregation is given as the string 'sum' rather than the builtin `sum`:
# recent pandas versions deprecate silently swapping the builtin for the
# pandas method and recommend the string alias; the result is the same.
(
    glt[glt['opp'] == 'SDN']
    .sort_values(['date', 'double_header'])
    .rolling(3)
    .agg({'k': 'sum'})
    .max()
)

k    42.0
dtype: float64

In [4]:
# To get more info, we have to compute the rolling sum and merge it with the original df.
# This will get the top streak of n games for the column `stat` from the df.
# Rows whose opponent is SDN, ordered by date; double_header is the secondary
# sort key for games that share a date.
sdn = glt.loc[glt['opp'].eq('SDN')].sort_values(by=['date', 'double_header'])

def get_top_streak(df, n, stat, groupby=None):
    """Find the window(s) of ``n`` consecutive rows with the largest total of ``stat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows already in the order the streak should follow. Must contain a
        ``'date'`` column and the ``stat`` column.
    n : int
        Window length, in rows.
    stat : str
        Name of the column to total over each window.
    groupby : str or list of str, optional
        Column name(s) to partition ``df`` by. When given, a window never
        spans two groups and the group column(s) are included in the result.

    Returns
    -------
    pandas.DataFrame
        One row per window that ties for the highest total, labelled by the
        window's last row. Columns are ``'date'``, ``'sum'`` and ``'gms_ct'``
        (always ``n``), preceded by the group column(s) when ``groupby`` is
        used. The frame is empty when no complete window of ``n`` rows exists.
    """
    # Normalise `groupby` to a (possibly empty) list of column names.
    if groupby is None or isinstance(groupby, str):
        keys = [groupby] if groupby else []
    else:
        keys = list(groupby)

    if keys:
        # groupby().rolling() prepends one index level per key; drop those so
        # the sums line up with df's own index for the merge below.
        window_sums = (
            df.groupby(keys)[stat]
            .rolling(n)
            .sum()
            .droplevel(list(range(len(keys))))
        )
    else:
        window_sums = df.rolling(n)[stat].sum()

    merged = pd.merge(df, window_sums.rename('sum'), left_index=True, right_index=True)
    high_total = merged['sum'].max()
    out_cols = keys + [col for col in ('date', 'sum') if col not in keys]
    # .assign builds a new frame, so nothing is written onto a slice of `merged`.
    return merged.loc[merged['sum'] == high_total, out_cols].assign(gms_ct=n)

# Highest 3-game total of 'k'; each result row carries the date of the last
# game in its window.
m = get_top_streak(df=sdn, n=3, stat='k')
m


Unnamed: 0,date,sum,gms_ct
8799,2019-09-19,42.0,3


In [5]:
# Repeat the search for every window length from 2 through 29 games and stack
# the winners into one table keyed by window length.
(
    pd.concat(
        [get_top_streak(sdn, length, 'k') for length in range(2, 30)],
        axis=0,
    )
    .reset_index(drop=True)
    .set_index('gms_ct')
)

Unnamed: 0_level_0,date,sum
gms_ct,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2001-06-19,33.0
3,2019-09-19,42.0
4,2019-09-20,55.0
5,2019-09-20,65.0
6,2019-09-20,74.0
6,2019-09-21,74.0
6,2019-09-22,74.0
7,2019-09-22,84.0
8,2019-09-25,95.0
9,2019-09-26,106.0


In [6]:
# Next: pitcher-level numbers instead of team totals.
# These come from the Retrosheet dailies (pitching), loaded for every game type.
# NOTE(review): the original note said "just for starting pitchers", but no
# filter on p_gs is applied in the cells in view - confirm the intent.

dailies = bbl.load_dailies_pit(
    game_types=bbl.GameType.ALL,
)
dailies.columns

Index(['game_id', 'game_dt', 'game_ct', 'appearance_dt', 'team_id',
       'player_id', 'slot_ct', 'seq_ct', 'home_fl', 'opponent_id', 'park_id',
       'yr', 'game_type', 'team_game_number', 'p_g', 'p_gs', 'p_cg', 'p_sho',
       'p_gf', 'p_w', 'p_l', 'p_sv', 'p_out', 'p_tbf', 'p_ab', 'p_r', 'p_er',
       'p_h', 'p_tb', 'p_2b', 'p_3b', 'p_hr', 'p_hr4', 'p_bb', 'p_ibb', 'p_so',
       'p_gdp', 'p_hp', 'p_sh', 'p_sf', 'p_xi', 'p_wp', 'p_bk', 'p_ir',
       'p_irs', 'p_go', 'p_ao', 'p_pitch', 'p_strike'],
      dtype='object')

In [7]:
# Every SDN pitching line, ordered by game date and then game_ct.
# Note: this rebinds `sdn`, which earlier held rows from the team game log.
sdn = dailies.loc[dailies['team_id'].eq('SDN')].sort_values(by=['game_dt', 'game_ct'])
sdn

Unnamed: 0,game_id,game_dt,game_ct,appearance_dt,team_id,player_id,slot_ct,seq_ct,home_fl,opponent_id,...,p_sf,p_xi,p_wp,p_bk,p_ir,p_irs,p_go,p_ao,p_pitch,p_strike
4095591,SDN196904080,1969-04-08,0,1969-04-08,SDN,selmd101,9,1,True,HOU,...,0.0,0.0,0.0,0,0.0,0.0,6.0,9.0,,
4095612,SDN196904090,1969-04-09,0,1969-04-09,SDN,podrj101,9,1,True,HOU,...,0.0,0.0,0.0,0,0.0,0.0,9.0,11.0,,
4095614,SDN196904090,1969-04-09,0,1969-04-09,SDN,siskt101,9,3,True,HOU,...,0.0,0.0,0.0,0,0.0,0.0,3.0,3.0,,
4095635,SDN196904100,1969-04-10,0,1969-04-10,SDN,kelld102,9,1,True,HOU,...,0.0,0.0,0.0,0,0.0,0.0,6.0,11.0,,
4095636,SDN196904100,1969-04-10,0,1969-04-10,SDN,mccob105,9,3,True,HOU,...,0.0,0.0,0.0,0,2.0,0.0,0.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4209845,SDN202010080,2020-10-08,0,2020-10-08,SDN,pomed001,0,9,True,LAN,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,22.0,11.0
4209847,SDN202010080,2020-10-08,0,2020-10-08,SDN,richg002,0,10,True,LAN,...,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,10.0,7.0
4209848,SDN202010080,2020-10-08,0,2020-10-08,SDN,roset001,0,11,True,LAN,...,0.0,0.0,0.0,0,0.0,0.0,0.0,2.0,29.0,13.0
4209849,SDN202010080,2020-10-08,0,2020-10-08,SDN,stamc001,0,2,True,LAN,...,0.0,0.0,0.0,0,1.0,1.0,1.0,0.0,20.0,11.0


In [8]:
# This will sum across pitchers, so similar to what we did above.
# sort=False keeps the per-game groups in the order they first appear in `sdn`,
# which was sorted by game_dt and game_ct. With the default sort=True the groups
# are ordered by the game_id string instead; the ids shown start with a team
# code (presumably the home team - verify), so that order need not be by date
# and the 3-game window could mix non-consecutive games.
sdn.groupby('game_id', sort=False)['p_so'].sum().rolling(3).sum().max()

42.0

In [9]:
# Now get the top streaks by individual pitcher
def get_top_streak_groupby(df, n, stat, groupby):
    """Return the ten largest ``n``-row rolling totals of ``stat`` within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows already in the order the streak should follow. Must contain an
        ``'appearance_dt'`` column, the ``stat`` column and the ``groupby``
        column(s).
    n : int
        Window length, in rows per group.
    stat : str
        Name of the column to total over each window.
    groupby : str or list of str
        Column name(s) that define the groups; a window never spans two groups.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows, highest total first, with the group column(s),
        ``'appearance_dt'`` (taken from the last row of the window) and
        ``'sum'``.
    """
    keys = [groupby] if isinstance(groupby, str) else list(groupby)

    # groupby().rolling() prepends one index level per key; dropping those
    # leaves df's own index so the totals can be merged straight back onto df.
    # Incomplete windows (fewer than n rows so far in the group) are NaN and
    # are removed before the merge.
    rolling_sums = (
        df.groupby(keys)[stat]
        .rolling(n)
        .sum()
        .droplevel(list(range(len(keys))))
        .dropna()
        .rename('sum')
    )
    merged = pd.merge(df, rolling_sums, left_index=True, right_index=True)
    out_cols = keys + [col for col in ('appearance_dt', 'sum') if col not in keys]
    return merged.sort_values(by='sum', ascending=False).head(10)[out_cols]

# Ten best 3-appearance p_so totals, computed separately for each player_id.
get_top_streak_groupby(sdn, n=3, stat='p_so', groupby='player_id')

Unnamed: 0,player_id,appearance_dt,sum
4101715,kirbc101,1971-09-29,36.0
2151289,peavj001,2007-05-06,36.0
4176569,peavj001,2006-05-22,35.0
4178656,peavj001,2007-04-30,34.0
4180141,peavj001,2007-08-27,33.0
1479360,normf101,1972-09-15,33.0
4307819,lamed001,2020-09-20,32.0
4208869,lamed001,2020-07-25,32.0
180284,browk001,1998-10-08,32.0
4208752,lamed001,2019-09-25,31.0
