## How often do position players pitch?

Can we identify a dividing line to separate position players from pitchers?

In [1]:
import pandas as pd
import plotly.express as px

import boxball_loader as bbl
import baseball_stats_utils as bsu

In [2]:
# Group appearances by player
# See how often players appear as pitchers vs non-pitchers

app = bbl.load_appearances().query('year_id >= 1980')

In [3]:
pit = app.groupby('player_id')[['g_all', 'g_defense', 'g_p']].sum()
pit

Unnamed: 0_level_0,g_all,g_defense,g_p
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aardsda01,331,331.0,331
aasedo01,369,369.0,369
abadan01,15,9.0,0
abadfe01,384,384.0,384
abbotgl01,87,87.0,87
...,...,...,...
zumayjo01,171,171.0,171
zuninmi01,705,692.0,0
zupcibo01,319,285.0,0
zuvelpa01,209,197.0,0


In [4]:
pit['g_np'] = pit['g_all']-pit['g_p']
pit

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aardsda01,331,331.0,331,0
aasedo01,369,369.0,369,0
abadan01,15,9.0,0,15
abadfe01,384,384.0,384,0
abbotgl01,87,87.0,87,0
...,...,...,...,...
zumayjo01,171,171.0,171,0
zuninmi01,705,692.0,0,705
zupcibo01,319,285.0,0,319
zuvelpa01,209,197.0,0,209


In [5]:
#both = pit.loc[pit['g_np']>0]
both = pit
both

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aardsda01,331,331.0,331,0
aasedo01,369,369.0,369,0
abadan01,15,9.0,0,15
abadfe01,384,384.0,384,0
abbotgl01,87,87.0,87,0
...,...,...,...,...
zumayjo01,171,171.0,171,0
zuninmi01,705,692.0,0,705
zupcibo01,319,285.0,0,319
zuvelpa01,209,197.0,0,209


In [6]:
both['p_p']= both['g_p']/both['g_all']
both

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np,p_p
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
aardsda01,331,331.0,331,0,1.0
aasedo01,369,369.0,369,0,1.0
abadan01,15,9.0,0,15,0.0
abadfe01,384,384.0,384,0,1.0
abbotgl01,87,87.0,87,0,1.0
...,...,...,...,...,...
zumayjo01,171,171.0,171,0,1.0
zuninmi01,705,692.0,0,705,0.0
zupcibo01,319,285.0,0,319,0.0
zuvelpa01,209,197.0,0,209,0.0


In [7]:
# What's the distribution of percentages
px.line(both['p_p'].sort_values())

OK, so almost all pitchers are at one end or the other.  The only player between 9% and 62% is Kieschnick.

In [8]:

both['p_p'].describe()

count    8839.000000
mean        0.528265
std         0.497246
min         0.000000
25%         0.000000
50%         0.982927
75%         1.000000
max         1.000000
Name: p_p, dtype: float64

In [9]:
# Let's look at the players closest to the middle
both.query('.05 < p_p < .9').sort_values('p_p')


Unnamed: 0_level_0,g_all,g_defense,g_p,g_np,p_p
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
espinsa01,27,24.0,2,25,0.074074
walshja01,66,57.0,5,61,0.075758
annade01,13,12.0,1,12,0.076923
ankieri01,653,587.0,51,602,0.078101
schroma01,11,7.0,1,10,0.090909
kiescbr01,260,126.0,74,186,0.284615
guerrja02,35,30.0,22,13,0.628571
mckaybr01,18,13.0,13,5,0.722222
owingmi01,183,138.0,138,45,0.754098
thompda02,5,4.0,4,1,0.8


In [10]:
both['player_name'] = bsu.get_player_names_df(both, 'player_id')
both

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np,p_p,player_name
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aardsda01,331,331.0,331,0,1.0,David Aardsma
aasedo01,369,369.0,369,0,1.0,Don Aase
abadan01,15,9.0,0,15,0.0,Andy Abad
abadfe01,384,384.0,384,0,1.0,Fernando Abad
abbotgl01,87,87.0,87,0,1.0,Glenn Abbott
...,...,...,...,...,...,...
zumayjo01,171,171.0,171,0,1.0,Joel Zumaya
zuninmi01,705,692.0,0,705,0.0,Mike Zunino
zupcibo01,319,285.0,0,319,0.0,Bob Zupcic
zuvelpa01,209,197.0,0,209,0.0,Paul Zuvella


In [11]:
# How often do position players pitch?
# boxball_loader divides position players from pitchers at 50% of games played

ppp = bbl.load_pitching(player_types=bbl.PlayerType.POSITION)
ppp.shape

(1466, 31)

In [12]:
totals = ppp.groupby('yr').sum()
totals

Unnamed: 0_level_0,stint,w,l,g,gs,cg,sho,sv,ip_outs,h,...,ibb,wp,hbp,bk,bfp,gf,r,sh,sf,gidp
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1871,10,15,21,58,35,33,0,4,1075,617,...,0.0,65,0.0,0,1946.0,17,557,0.0,0.0,0.0
1872,8,9,13,39,21,18,0,5,747,396,...,0.0,15,0.0,0,1281.0,16,321,0.0,0.0,0.0
1873,11,4,13,39,18,10,0,4,572,353,...,0.0,4,0.0,1,1047.0,18,283,0.0,0.0,0.0
1874,8,6,24,42,29,23,0,3,812,383,...,0.0,15,0.0,0,1332.0,10,288,0.0,0.0,0.0
1875,29,46,103,200,152,120,4,8,4151,1693,...,0.0,135,0.0,3,6397.0,38,1280,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016,24,0,1,26,0,0,0,0,74,31,...,1.0,2,2.0,0,114.0,24,15,0.0,0.0,1.0
2017,24,0,0,36,0,0,0,0,98,48,...,0.0,3,2.0,1,168.0,32,37,0.0,5.0,3.0
2018,52,4,4,75,10,0,0,0,344,138,...,0.0,8,7.0,2,530.0,55,102,0.0,3.0,9.0
2019,57,0,1,90,0,0,0,1,280,129,...,1.0,2,9.0,0,454.0,85,88,0.0,5.0,11.0


In [13]:
# What's the distribution over time?
px.bar(totals['g'])

In [14]:
# Let's look at those games started; those are suspicious

ppp.query('gs>0 and yr>=2010')

Unnamed: 0,player_id,yr,stint,team_id,lg_id,w,l,g,gs,cg,...,wp,hbp,bk,bfp,gf,r,sh,sf,gidp,franch_id
1255,laneja01,2014,1,SDN,NL,0,1,3,1,0,...,0,0.0,0,39.0,0,1,0.0,0.0,0.0,SDP
1333,ohtansh01,2018,1,LAA,AL,4,2,10,10,0,...,5,1.0,0,211.0,0,19,0.0,1.0,2.0,ANA
1442,ohtansh01,2020,1,LAA,AL,0,1,2,2,0,...,1,0.0,0,16.0,0,7,0.0,0.0,0.0,ANA


Ohtani doesn't meet the threshold of pitching in 50% of the games, so he is counted as a position player.  For most purposes (e.g. when we're looking at hitting stats), that makes sense.  Ohtani doesn't really fit into the binary split of pitcher vs. position player.  For our purposes today, we should probably exclude him.  Are there other two-way players?

In [15]:
yrs = range(1980,3000)
bbl.load_pitching(player_types=bbl.PlayerType.POSITION, years=yrs, coalesce_type=bbl.CoalesceMode.PLAYER_CAREER) \
    .sort_values('g', ascending=False)

Unnamed: 0_level_0,w,l,g,gs,cg,sho,sv,ip_outs,h,er,...,ibb,wp,hbp,bk,bfp,gf,r,sh,sf,gidp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
kiescbr01,2,2,74,0,0,0,0,288,110,49,...,7.0,3,6.0,0,425.0,24,51,4.0,0.0,6.0
ankieri01,13,10,51,41,0,0,1,726,198,105,...,2.0,20,12.0,2,1039.0,1,119,12.0,9.0,18.0
ohtansh01,4,3,12,12,0,0,0,160,41,26,...,0.0,6,1.0,0,227.0,0,26,0.0,1.0,2.0
gimench01,0,0,11,0,0,0,0,33,19,16,...,0.0,0,0.0,0,53.0,11,16,0.0,0.0,0.0
rominan01,0,0,7,0,0,0,0,17,10,8,...,0.0,2,2.0,0,34.0,6,8,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
hancoga01,0,0,1,0,0,0,0,4,0,0,...,0.0,0,0.0,0,4.0,1,0,0.0,0.0,0.0
happia01,0,0,1,0,0,0,0,3,1,0,...,0.0,0,0.0,0,4.0,1,0,0.0,0.0,0.0
harrijo05,0,0,1,0,0,0,0,1,0,0,...,0.0,0,0.0,0,1.0,1,0,0.0,0.0,0.0
harrile01,0,0,1,0,0,0,0,3,0,0,...,0.0,0,0.0,0,3.0,1,0,0.0,0.0,0.0


In [16]:
bsu.get_player_names_df(_, 'player_id')

player_id
kiescbr01    Brooks Kieschnick
ankieri01          Rick Ankiel
ohtansh01        Shohei Ohtani
gimench01        Chris Gimenez
rominan01        Andrew Romine
                   ...        
hancoga01        Garry Hancock
happia01              Ian Happ
harrijo05        Josh Harrison
harrile01         Lenny Harris
zobribe01          Ben Zobrist
Name: name, Length: 294, dtype: object

OK, looks like we need to exclude Keischnick and Ankiel as well.  They were used as pitchers, not as position players who pitched.  Kieschnick is more of a two-way player, whereas Ankiel converted from one to the other (but bbl just looks at career totals).

In [17]:
# Let's look at the quality of position players pitching

bbl.load_pitching(player_types=bbl.PlayerType.POSITION, years=yrs, coalesce_type=bbl.CoalesceMode.PLAYER_SEASON).groupby('yr').sum()

Unnamed: 0_level_0,w,l,g,gs,cg,sho,sv,ip_outs,h,er,...,ibb,wp,hbp,bk,bfp,gf,r,sh,sf,gidp
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980,0,0,5,0,0,0,0,13,8,5,...,0.0,0,0.0,0,24.0,4,5,0.0,0.0,0.0
1981,0,0,1,0,0,0,0,3,3,3,...,0.0,0,0.0,0,8.0,1,3,0.0,0.0,0.0
1983,0,0,4,0,0,0,0,24,14,8,...,0.0,1,2.0,0,45.0,3,8,0.0,0.0,0.0
1984,0,0,6,0,0,0,0,19,8,6,...,0.0,0,0.0,0,30.0,6,6,0.0,1.0,1.0
1985,0,0,2,0,0,0,0,6,1,0,...,0.0,0,0.0,0,7.0,2,0,0.0,0.0,0.0
1986,0,0,8,0,0,0,0,29,12,8,...,0.0,2,0.0,0,47.0,8,9,0.0,0.0,1.0
1987,0,0,11,0,0,0,0,37,14,9,...,0.0,3,1.0,1,57.0,11,9,0.0,0.0,2.0
1988,0,1,9,0,0,0,0,45,12,4,...,2.0,3,0.0,0,65.0,9,4,0.0,0.0,0.0
1989,0,1,12,0,0,0,0,36,17,14,...,1.0,1,2.0,2,65.0,11,15,0.0,1.0,3.0
1990,0,0,9,0,0,0,0,28,15,8,...,0.0,1,0.0,0,53.0,8,9,0.0,0.0,3.0


In [18]:
# Remove Ohtani, Kieschnick, Ankiel

two_way_players = ['kiescbr01', 'ankieri01', 'ohtansh01']

totals = bbl.load_pitching(player_types=bbl.PlayerType.POSITION, years=yrs, coalesce_type=bbl.CoalesceMode.PLAYER_SEASON) \
    .query('player_id not in @two_way_players') \
    .groupby('yr').sum().drop(['era'], axis='columns')

def add_rate_stats_pitching(df):
    totals = df
    totals['era'] = totals['er']*27/totals['ip_outs']
    for stat in ['so', 'bb', 'hr']:
        totals[f'{stat}%'] = totals[stat]/totals['bfp']
    return totals

add_rate_stats_pitching(totals)


Unnamed: 0_level_0,w,l,g,gs,cg,sho,sv,ip_outs,h,er,...,bfp,gf,r,sh,sf,gidp,era,so%,bb%,hr%
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980,0,0,5,0,0,0,0,13,8,5,...,24.0,4,5,0.0,0.0,0.0,10.384615,0.041667,0.125,0.041667
1981,0,0,1,0,0,0,0,3,3,3,...,8.0,1,3,0.0,0.0,0.0,27.0,0.0,0.25,0.0
1983,0,0,4,0,0,0,0,24,14,8,...,45.0,3,8,0.0,0.0,0.0,9.0,0.066667,0.133333,0.066667
1984,0,0,6,0,0,0,0,19,8,6,...,30.0,6,6,0.0,1.0,1.0,8.526316,0.066667,0.133333,0.066667
1985,0,0,2,0,0,0,0,6,1,0,...,7.0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1986,0,0,8,0,0,0,0,29,12,8,...,47.0,8,9,0.0,0.0,1.0,7.448276,0.12766,0.148936,0.042553
1987,0,0,11,0,0,0,0,37,14,9,...,57.0,11,9,0.0,0.0,2.0,6.567568,0.105263,0.122807,0.017544
1988,0,1,9,0,0,0,0,45,12,4,...,65.0,9,4,0.0,0.0,0.0,2.4,0.061538,0.123077,0.015385
1989,0,1,12,0,0,0,0,36,17,14,...,65.0,11,15,0.0,1.0,3.0,10.5,0.061538,0.2,0.030769
1990,0,0,9,0,0,0,0,28,15,8,...,53.0,8,9,0.0,0.0,3.0,7.714286,0.037736,0.226415,0.0


In [19]:
modern_totals = add_rate_stats_pitching(totals.query('yr>=2014').sum())
modern_totals

w             1.000000
l             6.000000
g           302.000000
gs            1.000000
cg            0.000000
sho           0.000000
sv            1.000000
ip_outs     899.000000
h           418.000000
er          278.000000
hr          103.000000
bb          140.000000
so           94.000000
ba_opp       60.618000
ibb           3.000000
wp           15.000000
hbp          29.000000
bk            3.000000
bfp        1464.000000
gf          273.000000
r           293.000000
sh            0.000000
sf           17.000000
gidp         28.000000
era           8.349277
so%           0.064208
bb%           0.095628
hr%           0.070355
dtype: float64

In [20]:
# Who has benefitted the most.  Have any players or teams been lucky enough to face a lot of position players?

ev = bbl.load_event_data(2014, 2021, requested_columns=['bat_id', 'pit_id'])
ev.sample(10)

Unnamed: 0,game_id,date,game_type,tb_ct,bat_id,ab_fl,bat_event_fl,pit_id,event_cd,h_fl,ob_fl,yr
415098,KCA201605030,2016-05-03,RS,1,taylm002,True,True,younc003,20,1,1,2016
1040706,MIL201906080,2019-06-08,RS,1,mousm001,True,True,lylej001,20,1,1,2019
363372,TEX201509170,2015-09-17,RS,0,deshd002,True,True,mccul002,2,0,0,2015
303576,NYA201507230,2015-07-23,RS,1,gregd001,True,True,roe-c001,20,1,1,2015
1144369,COL201909160,2019-09-16,RS,0,nimmb001,True,True,diazj006,3,0,0,2019
1065620,SDN201906300,2019-06-30,RS,0,oneit001,True,True,wiecb001,3,0,0,2019
826980,CIN201805200,2018-05-20,RS,0,mahlt001,True,True,darvy001,3,0,0,2018
475484,SFN201606280,2016-06-28,RS,0,beltb001,True,True,gravk001,2,0,0,2016
663227,MIA201706210,2017-06-21,RS,1,drews001,True,True,garcj006,20,1,1,2017
1217373,MIA202009160,2020-09-16,RS,0,chisj001,True,True,mazzc002,3,0,0,2020


In [21]:

non_p = bbl.get_non_pitchers()
ppl = bbl.load_people()
non_p_retro = ppl.query('player_id in @non_p  and player_id not in @two_way_players')['retro_id'].values
non_p_retro

array(['aaroh101', 'aarot101', 'abada001', ..., 'zupof101', 'zuvep001',
       'zwild101'], dtype=object)

In [22]:
ppp_pa = ev.query('pit_id in @non_p_retro')
ppp_pa

Unnamed: 0,game_id,date,game_type,tb_ct,bat_id,ab_fl,bat_event_fl,pit_id,event_cd,h_fl,ob_fl,yr
16558,CHA201404160,2014-04-16,RS,0,sizeg001,True,True,garcl004,2,0,0,2014
16559,CHA201404160,2014-04-16,RS,0,piera001,True,True,garcl004,2,0,0,2014
16560,CHA201404160,2014-04-16,RS,0,navad002,False,True,garcl004,14,0,1,2014
16561,CHA201404160,2014-04-16,RS,0,herrj002,False,True,garcl004,14,0,1,2014
16562,CHA201404160,2014-04-16,RS,2,bradj001,True,True,garcl004,21,1,1,2014
...,...,...,...,...,...,...,...,...,...,...,...,...
1226108,BOS202009240,2020-09-24,RS,0,holab001,True,True,lin-t001,2,0,0,2020
1226647,CHA202009250,2020-09-25,RS,4,contw001,True,True,sancc001,23,1,1,2020
1226648,CHA202009250,2020-09-25,RS,0,heywj001,True,True,sancc001,2,0,0,2020
1226649,CHA202009250,2020-09-25,RS,2,baezj001,True,True,sancc001,21,1,1,2020


In [23]:
bsu.summarize_events(ppp_pa, 'bat_id').sort_values(by='pa', ascending=False).head(20)

Unnamed: 0_level_0,pa,ob,ab,h,tb,k,bb,ibb,hr,ba,obp,slg,woba,k%,bb%,hr%
bat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
pindc001,10,4,8,2,6,0,1,0,1,0.25,0.4,0.75,0.4608,0.0,0.1,0.1
gatte001,9,4,9,4,7,0,0,0,1,0.444444,0.444444,0.777778,0.506444,0.0,0.0,0.111111
marij002,9,3,9,3,7,0,0,0,1,0.333333,0.333333,0.777778,0.451778,0.0,0.0,0.111111
pedej001,9,4,7,2,5,0,2,0,1,0.285714,0.444444,0.714286,0.470889,0.0,0.222222,0.111111
barna001,9,5,6,2,6,2,2,0,1,0.333333,0.555556,1.0,0.59,0.222222,0.222222,0.111111
bellc002,8,2,8,2,5,0,0,0,1,0.25,0.25,0.625,0.35425,0.0,0.0,0.125
fowld001,8,2,6,0,0,0,2,0,0,0.0,0.25,0.0,0.1755,0.0,0.25,0.0
hamib001,8,4,6,3,5,0,1,0,0,0.5,0.5,0.833333,0.5035,0.0,0.125,0.0
heywj001,8,3,6,1,1,0,2,0,0,0.166667,0.375,0.166667,0.28325,0.0,0.25,0.0
herne001,8,3,8,3,4,0,0,0,0,0.375,0.375,0.5,0.3695,0.0,0.0,0.0


In [26]:
df = bbl.get_event_code_descriptions()
df

Unnamed: 0_level_0,description
code,Unnamed: 1_level_1
0,unknown
1,
2,Generic out
3,Strikeout
4,Stolen base
5,Defensive indifference
6,Caught stealing
7,Pickoff error
8,Pickoff
9,Wild pitch
