## How often do position players pitch?

Can we identify a dividing line to separate position players from pitchers?

In [1]:
import pandas as pd
import plotly.express as px

import boxball_loader as bbl
import baseball_stats_utils as bsu

In [2]:
# Load the appearances table and keep only seasons from 2010 onward.
# Later cells aggregate these rows per player to compare games pitched
# against games played in any role.

app = bbl.load_appearances().loc[lambda rows: rows['year_id'] >= 2010]

In [3]:
# Inspect the available columns; the g_* columns are per-role game counts.
app.columns

Index(['year_id', 'team_id', 'lg_id', 'player_id', 'g_all', 'gs', 'g_batting',
       'g_defense', 'g_p', 'g_c', 'g_1b', 'g_2b', 'g_3b', 'g_ss', 'g_lf',
       'g_cf', 'g_rf', 'g_of', 'g_dh', 'g_ph', 'g_pr'],
      dtype='object')

In [4]:
# Per-player totals across all 2010+ rows: games overall, games in the
# field, and games as a pitcher.
game_count_cols = ['g_all', 'g_defense', 'g_p']
pit = app.groupby('player_id')[game_count_cols].sum()
pit

Unnamed: 0_level_0,g_all,g_defense,g_p
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aardsda01,130,130.0,130
abadfe01,384,384.0,384
abreual01,2,2.0,2
abreubo01,474,247.0,0
abreubr01,11,11.0,11
...,...,...,...
zobribe01,1354,1298.0,1
zuberty01,23,23.0,23
zumayjo01,31,31.0,31
zuninmi01,705,692.0,0


In [5]:
# g_np = games in which the player did not pitch (all games minus games pitched).
pit = pit.assign(g_np=lambda totals_by_player: totals_by_player['g_all'] - totals_by_player['g_p'])
pit

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aardsda01,130,130.0,130,0
abadfe01,384,384.0,384,0
abreual01,2,2.0,2,0
abreubo01,474,247.0,0,474
abreubr01,11,11.0,11,0
...,...,...,...,...
zobribe01,1354,1298.0,1,1353
zuberty01,23,23.0,23,0
zumayjo01,31,31.0,31,0
zuninmi01,705,692.0,0,705


In [6]:
# Keep every player (no filtering on non-pitching games).
# Take an independent copy so that columns added to `both` in later cells
# do not silently show up on `pit` as well.
both = pit.copy()
both

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aardsda01,130,130.0,130,0
abadfe01,384,384.0,384,0
abreual01,2,2.0,2,0
abreubo01,474,247.0,0,474
abreubr01,11,11.0,11,0
...,...,...,...,...
zobribe01,1354,1298.0,1,1353
zuberty01,23,23.0,23,0
zumayjo01,31,31.0,31,0
zuninmi01,705,692.0,0,705


In [7]:
# p_p = share of a player's games in which he pitched
# (1.0 means he pitched in every game, 0.0 means he never pitched).
pitching_share = both['g_p'].div(both['g_all'])
both['p_p'] = pitching_share
both

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np,p_p
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
aardsda01,130,130.0,130,0,1.000000
abadfe01,384,384.0,384,0,1.000000
abreual01,2,2.0,2,0,1.000000
abreubo01,474,247.0,0,474,0.000000
abreubr01,11,11.0,11,0,1.000000
...,...,...,...,...,...
zobribe01,1354,1298.0,1,1353,0.000739
zuberty01,23,23.0,23,0,1.000000
zumayjo01,31,31.0,31,0,1.000000
zuninmi01,705,692.0,0,705,0.000000


In [8]:
# What's the distribution of percentages?
# Sorting ascending puts never-pitchers on the left and full-time pitchers on
# the right, so any players in between show up as the short rising segment.
px.line(
    both['p_p'].sort_values(),
    title='Share of games pitched, per player (sorted ascending)',
    labels={'player_id': 'player (sorted by share)', 'value': 'share of games as pitcher'},
)

OK, so almost all players are at one end or the other.  There's nobody between about 9% and 63%.

In [9]:

# Summary statistics of the pitching share across all players.
both['p_p'].describe()

count    3775.000000
mean        0.556126
std         0.494382
min         0.000000
25%         0.000000
50%         0.995098
75%         1.000000
max         1.000000
Name: p_p, dtype: float64

In [10]:
# Show the players nearest the middle of the range: those whose pitching
# share is strictly between 5% and 90%, ordered from lowest to highest.
in_the_middle = (both['p_p'] > .05) & (both['p_p'] < .9)
both[in_the_middle].sort_values('p_p')


Unnamed: 0_level_0,g_all,g_defense,g_p,g_np,p_p
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
espinsa01,27,24.0,2,25,0.074074
walshja01,66,57.0,5,61,0.075758
annade01,13,12.0,1,12,0.076923
schroma01,11,7.0,1,10,0.090909
guerrja02,35,30.0,22,13,0.628571
loewead01,42,36.0,28,14,0.666667
mckaybr01,18,13.0,13,5,0.722222
smithgr02,10,8.0,8,2,0.8
lorenmi01,325,292.0,268,57,0.824615
cookaa01,69,59.0,59,10,0.855072


In [11]:
# Attach a human-readable name to each player_id.
# NOTE(review): presumably the helper returns names aligned to `both`'s
# player_id index — confirm against baseball_stats_utils.
both['player_name'] = bsu.get_player_names_df(both, 'player_id')
both

Unnamed: 0_level_0,g_all,g_defense,g_p,g_np,p_p,player_name
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aardsda01,130,130.0,130,0,1.000000,David Aardsma
abadfe01,384,384.0,384,0,1.000000,Fernando Abad
abreual01,2,2.0,2,0,1.000000,Albert Abreu
abreubo01,474,247.0,0,474,0.000000,Bobby Abreu
abreubr01,11,11.0,11,0,1.000000,Bryan Abreu
...,...,...,...,...,...,...
zobribe01,1354,1298.0,1,1353,0.000739,Ben Zobrist
zuberty01,23,23.0,23,0,1.000000,Tyler Zuber
zumayjo01,31,31.0,31,0,1.000000,Joel Zumaya
zuninmi01,705,692.0,0,705,0.000000,Mike Zunino


In [12]:
# How often do position players pitch?
# boxball_loader divides position players from pitchers at 50% of games played

# Load pitching records restricted to players classified as position players.
# No year filter is passed here (contrast with the 2010+ filter used on
# appearances above).
ppp = bbl.load_pitching(player_types=bbl.PlayerType.POSITION)
ppp.shape

(1466, 31)

In [13]:
# Season-by-season totals of the position-player pitching lines.
# numeric_only=True limits the sum to the numeric stat columns; without it,
# pandas 2.x also tries to "sum" the string ID columns (player_id, team_id,
# lg_id, franch_id) instead of dropping them.
# Note: `stint` is summed along with the stats; its total is not meaningful.
totals = ppp.groupby('yr').sum(numeric_only=True)
totals

Unnamed: 0_level_0,stint,w,l,g,gs,cg,sho,sv,ip_outs,h,...,ibb,wp,hbp,bk,bfp,gf,r,sh,sf,gidp
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1871,10,15,21,58,35,33,0,4,1075,617,...,0.0,65,0.0,0,1946.0,17,557,0.0,0.0,0.0
1872,8,9,13,39,21,18,0,5,747,396,...,0.0,15,0.0,0,1281.0,16,321,0.0,0.0,0.0
1873,11,4,13,39,18,10,0,4,572,353,...,0.0,4,0.0,1,1047.0,18,283,0.0,0.0,0.0
1874,8,6,24,42,29,23,0,3,812,383,...,0.0,15,0.0,0,1332.0,10,288,0.0,0.0,0.0
1875,29,46,103,200,152,120,4,8,4151,1693,...,0.0,135,0.0,3,6397.0,38,1280,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016,24,0,1,26,0,0,0,0,74,31,...,1.0,2,2.0,0,114.0,24,15,0.0,0.0,1.0
2017,24,0,0,36,0,0,0,0,98,48,...,0.0,3,2.0,1,168.0,32,37,0.0,5.0,3.0
2018,52,4,4,75,10,0,0,0,344,138,...,0.0,8,7.0,2,530.0,55,102,0.0,3.0,9.0
2019,57,0,1,90,0,0,0,1,280,129,...,1.0,2,9.0,0,454.0,85,88,0.0,5.0,11.0


In [14]:
# What's the distribution over time?
# One bar per season: total pitching appearances (`g`) by position players.
px.bar(
    totals['g'],
    title='Pitching appearances by position players, per season',
    labels={'yr': 'season', 'value': 'pitching appearances'},
)

In [15]:
# Games started by a "position player" are suspicious, so list every
# 2010+ pitching line that has at least one start.

has_start = ppp['gs'] > 0
since_2010 = ppp['yr'] >= 2010
ppp[has_start & since_2010]

Unnamed: 0,player_id,yr,stint,team_id,lg_id,w,l,g,gs,cg,...,wp,hbp,bk,bfp,gf,r,sh,sf,gidp,franch_id
1255,laneja01,2014,1,SDN,NL,0,1,3,1,0,...,0,0.0,0,39.0,0,1,0.0,0.0,0.0,SDP
1333,ohtansh01,2018,1,LAA,AL,4,2,10,10,0,...,5,1.0,0,211.0,0,19,0.0,1.0,2.0,ANA
1442,ohtansh01,2020,1,LAA,AL,0,1,2,2,0,...,1,0.0,0,16.0,0,7,0.0,0.0,0.0,ANA


Ohtani doesn't meet the threshold of pitching in 50% of the games, so he is counted as a position player.  For most purposes (e.g. when we're looking at hitting stats), that makes sense.  Ohtani doesn't really fit into the binary split of pitcher vs. position player.  For our purposes today, we should probably exclude him.  Are there other two-way players?

In [16]:
# Pitching lines for position players from 2010 onward, coalesced with
# CoalesceMode.PLAYER_CAREER (one row per player_id in the output), showing
# the 20 players with the most pitching appearances.
position_pitching_careers = bbl.load_pitching(
    player_types=bbl.PlayerType.POSITION,
    years=range(2010, 3000),
    coalesce_type=bbl.CoalesceMode.PLAYER_CAREER,
)
position_pitching_careers.sort_values('g', ascending=False).head(20)

Unnamed: 0_level_0,w,l,g,gs,cg,sho,sv,ip_outs,h,er,...,ibb,wp,hbp,bk,bfp,gf,r,sh,sf,gidp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ohtansh01,4,3,12,12,0,0,0,160,41,26,...,0.0,6,1.0,0,227.0,0,26,0.0,1.0,2.0
gimench01,0,0,11,0,0,0,0,33,19,16,...,0.0,0,0.0,0,53.0,11,16,0.0,0.0,0.0
kratzer01,0,0,7,0,0,0,0,21,10,4,...,0.0,2,1.0,0,32.0,7,6,0.0,1.0,1.0
buterdr01,0,0,7,0,0,0,0,18,8,5,...,0.0,1,0.0,0,30.0,7,5,0.0,0.0,0.0
rominan01,0,0,7,0,0,0,0,17,10,8,...,0.0,2,2.0,0,34.0,6,8,0.0,1.0,0.0
perezhe01,0,0,7,0,0,0,0,22,10,6,...,0.0,0,1.0,0,35.0,5,6,0.0,0.0,2.0
sucreje01,0,0,6,0,0,0,0,15,12,8,...,0.0,0,0.0,1,28.0,5,8,0.0,2.0,1.0
davidma02,0,0,6,0,0,0,0,19,5,2,...,0.0,0,0.0,0,26.0,6,2,0.0,0.0,1.0
bethach01,0,0,6,0,0,0,0,16,7,6,...,0.0,2,1.0,0,35.0,2,9,0.0,0.0,1.0
descada01,0,0,6,0,0,0,0,20,6,5,...,0.0,0,0.0,0,27.0,4,5,0.0,0.0,0.0
