# This notebook prepares the primary data sets and visits some tips and techniques along the way

* Full data pulled from retrosheet.org
* Chadwick tools used for converting retrosheet data http://chadwick.sourceforge.net/doc/cwtools.html

In [None]:
import pandas as pd
import os
import matplotlib

# % and %% are ipthyon 'magics'
# ! is ipython's shell execution shortcut

In [None]:
%history

In [None]:
!ls data_public/Jupyter/*.EV*

### Here we're preparing the file.  Run the cwevent executable with -n and capture the header

In [None]:
#! cwevent -n data_public/Jupyter/2017SEA.EVA |head -1 >data_public/Jupyter/atbats.txt

### Now we'll shell out and run a loop to invoke the converter on each event file.  We'll also concatentate the roster files in a separate command

In [None]:
#!for x in $(ls -1 data_public/Jupyter/*.EV*); do cwevent $x >>data_public/Jupyter/atbats.txt; done
#!cat data_public/Jupyter/*.ROS >data_public/Jupyter/rosters.txt#

#### Useful techniques for interacting with the shell

* Use assignment to capture the ouput of your ! command

In [None]:
files = !ls

In [None]:
type(files)

In [None]:
files.grep('\.i.*')

* Use {expression} to pass from ipython to the shell

In [None]:
extension = 'ipynbb'

In [None]:
!ls *.{extension}

* Hmm... Extra trailing 'b'. Lets take a slice of the extension string

In [None]:
!ls *.{extension[0:-1]}

In [None]:
df_atbats=pd.read_csv('./data_public/Jupyter/atbats.txt')
df_players=pd.read_csv('./data_public/Jupyter/players.txt')

In [None]:
df_atbats.shape

In [None]:
df_atbats.head()

In [143]:
df_atbats['GAME_ID'].str[0:3].value_counts()/162/9

DET    4.490398
BOS    4.486968
TEX    4.478738
MIN    4.475309
CHN    4.446502
BAL    4.438272
ATL    4.417010
ARI    4.416324
OAK    4.413580
CIN    4.412209
SFN    4.412209
MIL    4.407407
COL    4.403292
PIT    4.397119
PHI    4.394376
MIA    4.391632
WAS    4.368999
NYN    4.368313
CHA    4.345679
SLN    4.339506
SEA    4.338134
NYA    4.334705
HOU    4.327846
TBA    4.314129
TOR    4.285322
KCA    4.282579
ANA    4.272977
CLE    4.242112
LAN    4.231139
SDN    4.203018
Name: GAME_ID, dtype: float64

In [None]:
type(df_atbats['AWAY_TEAM_ID'].value_counts())

In [None]:
df_atbats['AWAY_TEAM_ID'].value_counts().index

In [None]:
filters = {}
for team in df_atbats['AWAY_TEAM_ID'].value_counts().index:
   filters[team] = (df_atbats['GAME_ID'].str.startswith(team)) | (df_atbats['AWAY_TEAM_ID'] == team)

In [107]:
flt_homers = df_atbats['EVENT_CD'] == 23
flt_redsox = (df_atbats['GAME_ID'].str.startswith('BOS')) | (df_atbats['AWAY_TEAM_ID'] == 'BOS')
flt_yankees = (df_atbats['GAME_ID'].str.startswith('NYA')) | (df_atbats['AWAY_TEAM_ID'] == 'NYA')

In [140]:
df_atbats[(filters['BOS']) & (~df_atbats['GAME_ID'].str.startswith('BOS'))]['EVENT_CD'].value_counts()[13]

1

In [137]:
df_atbats[(filters['BOS']) & (df_atbats['EVENT_CD'] == 13)].shape

(7, 36)

In [78]:
df_atbats.columns

Index(['GAME_ID', 'AWAY_TEAM_ID', 'INN_CT', 'BAT_HOME_ID', 'OUTS_CT',
       'BALLS_CT', 'STRIKES_CT', 'AWAY_SCORE_CT', 'HOME_SCORE_CT',
       'RESP_BAT_ID', 'RESP_BAT_HAND_CD', 'RESP_PIT_ID', 'RESP_PIT_HAND_CD',
       'BASE1_RUN_ID', 'BASE2_RUN_ID', 'BASE3_RUN_ID', 'EVENT_TX',
       'LEADOFF_FL', 'PH_FL', 'BAT_FLD_CD', 'BAT_LINEUP_ID', 'EVENT_CD',
       'BAT_EVENT_FL', 'AB_FL', 'H_CD', 'SH_FL', 'SF_FL', 'EVENT_OUTS_CT',
       'RBI_CT', 'WP_FL', 'PB_FL', 'ERR_CT', 'BAT_DEST_ID', 'RUN1_DEST_ID',
       'RUN2_DEST_ID', 'RUN3_DEST_ID'],
      dtype='object')

In [None]:
len(df_atbats.GAME_ID.value_counts())

In [None]:
df_atbats[flt_redsox]['GAME_ID'].shape

In [None]:
df_atbats[flt_redsox & flt_homers]['BAT_HOME_ID'].value_counts()

In [None]:
for filter in filters:
    print(filter)
    print(df_atbats[filters[filter] & flt_homers]['BAT_HOME_ID'].value_counts())

In [79]:
df_atbats['GAME_ID'].str.startswith('BOS').value_counts()

False    184654
True       6542
Name: GAME_ID, dtype: int64

In [110]:
grp_teams_homers = df_atbats.groupby([df_atbats['GAME_ID'].str[0:3], df_atbats['AWAY_TEAM_ID'], df_atbats['EVENT_CD']==23])

In [113]:
grp_teams_homers['EVENT_CD'].count()

GAME_ID  AWAY_TEAM_ID  EVENT_CD
ANA      ATL           False       209
                       True          8
         BAL           False       208
                       True          5
         BOS           False       211
                       True          6
         CHA           False       244
                       True          8
         CLE           False       220
                       True          7
         DET           False       290
                       True         10
         HOU           False       641
                       True         19
         KCA           False       293
                       True          9
         LAN           False       141
                       True          6
         MIN           False       295
                       True         13
         NYA           False       256
                       True          7
         OAK           False       700
                       True         22
         PHI           False    

In [None]:
grp_away = df_atbats.groupby(df_atbats['AWAY_TEAM_ID'])
grp_home = df_atbats.groupby(df_atbats['GAME_ID'].str[0:3])

In [96]:
grp_away[df_atbats[df_atbats['EVENT_CD'] == 23]].count()

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed

In [99]:
df_atbats[df_atbats['EVENT_CD'] == 23]['EVENT_CD']

47        23
57        23
74        23
112       23
131       23
159       23
167       23
200       23
219       23
252       23
256       23
289       23
305       23
327       23
330       23
342       23
371       23
397       23
421       23
480       23
525       23
594       23
633       23
666       23
673       23
730       23
732       23
743       23
776       23
883       23
          ..
190136    23
190146    23
190198    23
190226    23
190250    23
190258    23
190280    23
190288    23
190291    23
190293    23
190321    23
190416    23
190446    23
190479    23
190561    23
190625    23
190654    23
190659    23
190665    23
190733    23
190758    23
190848    23
190865    23
190869    23
190949    23
190998    23
191016    23
191017    23
191107    23
191167    23
Name: EVENT_CD, Length: 6105, dtype: int64

In [None]:
df_atbats[df_atbats['EVENT_CD'] == 23]['RESP_BAT_ID'].value_counts()

In [None]:
%matplotlib inline

In [None]:
grp_home['EVENT_CD'].head()

In [None]:
df_atbats[filters['NYA']

In [None]:
grp_hit_type=df_atbats.groupby('EVENT_CD')

In [None]:
grp_hit_type.describe()

In [None]:
for grpname,grprec in grp_hit_type:
    print(grpname)
    print(grprec)

In [None]:
df_atbats[df_atbats['EVENT_CD'] == 23].groupby([df_atbats['EVENT_CD'] == 23]['EVENT_CD'])

In [None]:
df_atbats[df_atbats['EVENT_CD' == 23]].groupby('BAT_HOME_ID').min()

In [None]:
for f in var:
    print(f)
    

In [None]:
type(var)

In [None]:
var.n

In [None]:
type(var.p)

In [None]:
var.grep('^r.*')

In [None]:
flt = '*.ip*'

In [None]:
%ls {flt}