In [1]:
import pandas as pd
import boxball_loader as bbl
import baseball_stats_utils as bsu

In [2]:
# Load the per-game batting rows for the RS and PS game types combined
# (presumably regular season + postseason -- confirm against boxball_loader).
game_types = bbl.GameType.RS | bbl.GameType.PS
df = bbl.load_dailies_bat(game_types)
# Show which columns are available for the analysis below.
df.columns

Index(['game_id', 'game_dt', 'game_ct', 'appearance_dt', 'team_id',
       'player_id', 'slot_ct', 'seq_ct', 'home_fl', 'opponent_id', 'park_id',
       'yr', 'game_type', 'team_game_number', 'b_g', 'b_pa', 'b_ab', 'b_r',
       'b_h', 'b_tb', 'b_2b', 'b_3b', 'b_hr', 'b_hr4', 'b_rbi', 'b_gw', 'b_bb',
       'b_ibb', 'b_so', 'b_gdp', 'b_hp', 'b_sh', 'b_sf', 'b_sb', 'b_cs',
       'b_xi', 'b_g_dh', 'b_g_ph', 'b_g_pr'],
      dtype='object')

In [3]:
# Give every row a 0-based running game counter per player, counted in
# (player_id, game_dt) order. The counter is written back onto df by index
# alignment, so df itself keeps its original row order.
# NOTE(review): rows sharing a player_id and game_dt are only ordered by their
# existing position in df -- confirm that is the intended order for such games.
games_in_order = df.sort_values(by=['player_id', 'game_dt'])
df['player_gm_ct'] = games_in_order.groupby('player_id').cumcount()
# Sanity check: how many rows carry each counter value.
df['player_gm_ct'].value_counts()

0       18184
1       17417
2       16872
3       16429
4       16066
        ...  
3570        1
3563        1
3562        1
3561        1
3537        1
Name: player_gm_ct, Length: 3629, dtype: int64

In [4]:
# Build one row per homerless streak (games with b_hr == 0, yr >= 1947) with the
# summed b_pa and the first/latest game_dt of the streak.
#
# Streak labelling: within a player, consecutive homerless games share the same
# value of (player_gm_ct - running count of homerless games), because both
# counters advance by one per game while the streak lasts; any game dropped by
# the filter advances only player_gm_ct, which starts a new label.
no_hr = df.query('b_hr == 0 and yr>=1947')

# Rank the homerless games by the already-computed player_gm_ct rather than
# re-sorting on game_dt, so this ranking is guaranteed to follow exactly the
# same game order that player_gm_ct was built from (including same-date games).
no_hr_ct = no_hr.sort_values(['player_id', 'player_gm_ct']).groupby('player_id').cumcount()
streak_ids = (no_hr['player_gm_ct'] - no_hr_ct).rename('streak_id')

# Use string aggregation names instead of the builtins sum/min/max: pandas >= 2.1
# raises a FutureWarning for builtin callables in agg and will change how they
# are dispatched; the strings keep the current groupby reductions.
streaks = (
    no_hr.groupby(['player_id', streak_ids])
    .agg(pa=('b_pa', 'sum'), first=('game_dt', 'min'), latest=('game_dt', 'max'))
    .reset_index()
)
streaks['pa'] = streaks['pa'].astype(int)
# Attach display names looked up from the 'retro_id' player ids.
streaks['name'] = bsu.get_player_names_col(streaks['player_id'], 'retro_id')
# Spot-check a random handful of streaks (different rows on every run).
streaks.sample(10)

Unnamed: 0,player_id,streak_id,pa,first,latest,name
47745,cruzn002,113,100,2012-04-18,2012-05-12,Nelson Cruz
118555,laroa001,28,4,2005-09-16,2005-09-16,Adam LaRoche
115001,knoob101,0,8,1964-04-13,1964-04-15,Bobby Knoop
129284,marre001,29,21,2002-06-30,2002-07-05,Eli Marrero
192677,smith106,53,11,1962-08-19,1962-08-23,Hal Smith
148469,murre001,95,30,1980-08-12,1980-08-18,Eddie Murray
70,aaroh101,80,17,1957-06-16,1957-06-18,Hank Aaron
8833,bailb103,10,44,1963-09-04,1963-09-17,Bob Bailey
126493,mabrj001,18,126,1996-09-24,1997-04-27,John Mabry
73377,garrw101,8,17,1970-07-21,1970-07-25,Wayne Garrett


In [5]:
# The 25 streaks with the largest pa totals, biggest first.
streaks.sort_values(by='pa', ascending=False).iloc[:25]

Unnamed: 0,player_id,streak_id,pa,first,latest,name
172434,remyj001,7,2584,1978-08-22,1984-05-18,Jerry Remy
150846,newma001,1,2260,1986-07-07,1992-10-03,Al Newman
116764,kuipd001,1,2217,1977-08-30,1985-06-27,Duane Kuiper
14965,bellr001,1,2127,1987-05-06,1997-09-25,Rafael Belliard
192897,smito001,1,1994,1978-09-05,1982-04-12,Ozzie Smith
225670,woodw101,0,1941,1963-09-09,1970-07-09,Woody Woodward
72046,gantj001,44,1938,1987-06-15,1991-09-02,Jim Gantner
82980,grosg001,6,1911,1978-05-02,1987-05-26,Greg Gross
82975,grosg001,0,1887,1973-09-05,1977-07-05,Greg Gross
140034,metzr101,5,1849,1975-06-04,1980-08-10,Roger Metzger


In [6]:
# Plain-text version of the top-25 table (no index column), limited to the
# name, pa total and date range of each streak.
leaderboard_cols = ['name', 'pa', 'first', 'latest']
top_streaks = streaks.sort_values(by='pa', ascending=False).iloc[:25]
print(top_streaks.loc[:, leaderboard_cols].to_string(index=False))

           name   pa      first     latest
     Jerry Remy 2584 1978-08-22 1984-05-18
      Al Newman 2260 1986-07-07 1992-10-03
   Duane Kuiper 2217 1977-08-30 1985-06-27
Rafael Belliard 2127 1987-05-06 1997-09-25
    Ozzie Smith 1994 1978-09-05 1982-04-12
 Woody Woodward 1941 1963-09-09 1970-07-09
    Jim Gantner 1938 1987-06-15 1991-09-02
     Greg Gross 1911 1978-05-02 1987-05-26
     Greg Gross 1887 1973-09-05 1977-07-05
  Roger Metzger 1849 1975-06-04 1980-08-10
  Vince Coleman 1797 1985-05-22 1987-08-25
  Frank Taveras 1789 1971-09-25 1977-08-05
  Don Kessinger 1757 1966-05-15 1968-09-03
     Larry Bowa 1744 1970-04-07 1972-08-17
     Hal Lanier 1700 1966-09-06 1970-04-24
     Doug Flynn 1668 1981-05-08 1985-10-05
    Ozzie Smith 1628 1985-10-16 1988-05-30
    Steve Jeltz 1594 1984-09-24 1989-05-20
     Don Sutton 1583 1966-04-14 1988-08-09
       Ron Hunt 1570 1971-09-21 1974-09-28
     Ben Revere 1562 2010-09-07 2014-05-26
  Jamey Carroll 1538 2009-08-13 2012-09-02
   Duane Ku