# This notebook prepares the primary data sets and visits some tips and techniques along the way

In [1]:
import pandas as pd
import os
import matplotlib

# % and %% are ipthyon 'magics'
# ! is ipython's shell execution shortcut

In [None]:
%history

In [3]:
!ls data_public/Jupyter/*.EV*

data_public/Jupyter/2017ANA.EVA data_public/Jupyter/2017MIL.EVN
data_public/Jupyter/2017ARI.EVN data_public/Jupyter/2017MIN.EVA
data_public/Jupyter/2017ATL.EVN data_public/Jupyter/2017NYA.EVA
data_public/Jupyter/2017BAL.EVA data_public/Jupyter/2017NYN.EVN
data_public/Jupyter/2017BOS.EVA data_public/Jupyter/2017OAK.EVA
data_public/Jupyter/2017CHA.EVA data_public/Jupyter/2017PHI.EVN
data_public/Jupyter/2017CHN.EVN data_public/Jupyter/2017PIT.EVN
data_public/Jupyter/2017CIN.EVN data_public/Jupyter/2017SDN.EVN
data_public/Jupyter/2017CLE.EVA data_public/Jupyter/2017SEA.EVA
data_public/Jupyter/2017COL.EVN data_public/Jupyter/2017SFN.EVN
data_public/Jupyter/2017DET.EVA data_public/Jupyter/2017SLN.EVN
data_public/Jupyter/2017HOU.EVA data_public/Jupyter/2017TBA.EVA
data_public/Jupyter/2017KCA.EVA data_public/Jupyter/2017TEX.EVA
data_public/Jupyter/2017LAN.EVN data_public/Jupyter/2017TOR.EVA
data_public/Jupyter/2017MIA.EVN data_public/Jupyter/2017WAS.EVN


### Here we're preparing the file.  Run the cwevent executable with -n and capture the header

In [4]:
#! cwevent -n data_public/Jupyter/2017SEA.EVA |head -1 >data_public/Jupyter/atbats.txt

### Now we'll shell out and run a loop to invoke the converter on each event file.  We'll also concatentate the roster files in a separate command

In [5]:
#!for x in $(ls -1 data_public/Jupyter/*.EV*); do cwevent $x >>data_public/Jupyter/atbats.txt; done
#!cat data_public/Jupyter/*.ROS >data_public/Jupyter/rosters.txt#

#### Useful techniques for interacting with the shell

* Use assignment to capture the ouput of your ! command

In [6]:
files = !ls

In [7]:
type(files)

IPython.utils.text.SList

In [8]:
files.grep('\.i.*')

['Baseball.ipynb',
 'Introl to Jupyter.ipynb',
 'Jupyter-intro-data.ipynb',
 'Jupyter-intro.ipynb',
 'Mxw_cmp_Active_not_Sync.ipynb',
 'Untitled.ipynb',
 'crw_stage_for_deletion.ipynb',
 'example_json.ipynb']

* Use {expression} to pass from ipython to the shell

In [9]:
var = 'ipynbb'

In [10]:
!ls *.{var}

ls: *.ipynbb: No such file or directory


In [11]:
!ls *.{var[0:-1]}

Baseball.ipynb                Mxw_cmp_Active_not_Sync.ipynb
Introl to Jupyter.ipynb       Untitled.ipynb
Jupyter-intro-data.ipynb      crw_stage_for_deletion.ipynb
Jupyter-intro.ipynb           example_json.ipynb


In [12]:
df_atbats=pd.read_csv('./data_public/Jupyter/atbats.txt')
df_players=pd.read_csv('./data_public/Jupyter/players.txt')

In [13]:
df_atbats.shape

(191196, 36)

In [14]:
df_atbats.head()

Unnamed: 0,GAME_ID,AWAY_TEAM_ID,INN_CT,BAT_HOME_ID,OUTS_CT,BALLS_CT,STRIKES_CT,AWAY_SCORE_CT,HOME_SCORE_CT,RESP_BAT_ID,...,SF_FL,EVENT_OUTS_CT,RBI_CT,WP_FL,PB_FL,ERR_CT,BAT_DEST_ID,RUN1_DEST_ID,RUN2_DEST_ID,RUN3_DEST_ID
0,ANA201704070,SEA,1,0,0,3,2,0,0,seguj002,...,F,0,0,F,F,0,1,0,0,0
1,ANA201704070,SEA,1,0,0,1,2,0,0,hanim001,...,F,1,0,F,F,0,0,1,0,0
2,ANA201704070,SEA,1,0,1,1,1,0,0,canor001,...,F,1,0,F,F,0,0,1,0,0
3,ANA201704070,SEA,1,0,2,0,1,0,0,cruzn002,...,F,0,0,F,F,0,0,2,0,0
4,ANA201704070,SEA,1,0,2,2,2,0,0,cruzn002,...,F,1,0,F,F,0,0,0,2,0


In [15]:
df_atbats['AWAY_TEAM_ID'].value_counts()/162/9

TOR    4.512346
NYN    4.475995
HOU    4.472565
MIA    4.471879
NYA    4.466392
CIN    4.457476
ATL    4.439643
BOS    4.403292
KCA    4.385460
SEA    4.384088
SDN    4.379287
SLN    4.373800
SFN    4.362826
TBA    4.361454
CHA    4.358711
COL    4.357339
MIL    4.355967
CHN    4.355967
LAN    4.338820
MIN    4.338820
PHI    4.336763
ANA    4.329218
PIT    4.320302
DET    4.305898
OAK    4.305213
WAS    4.304527
ARI    4.303841
CLE    4.297668
BAL    4.296982
TEX    4.283265
Name: AWAY_TEAM_ID, dtype: float64

In [16]:
type(df_atbats['AWAY_TEAM_ID'].value_counts())

pandas.core.series.Series

In [17]:
df_atbats['AWAY_TEAM_ID'].value_counts().index

Index(['TOR', 'NYN', 'HOU', 'MIA', 'NYA', 'CIN', 'ATL', 'BOS', 'KCA', 'SEA',
       'SDN', 'SLN', 'SFN', 'TBA', 'CHA', 'COL', 'MIL', 'CHN', 'LAN', 'MIN',
       'PHI', 'ANA', 'PIT', 'DET', 'OAK', 'WAS', 'ARI', 'CLE', 'BAL', 'TEX'],
      dtype='object')

In [18]:
filters = {}
for team in df_atbats['AWAY_TEAM_ID'].value_counts().index:
   filters[team] = (df_atbats['GAME_ID'].str.startswith(team)) | (df_atbats['AWAY_TEAM_ID'] == team)

In [19]:
flt_homers = df_atbats['BAT_DEST_ID'] == 4
flt_redsox = (df_atbats['GAME_ID'].str.startswith('BOS')) | (df_atbats['AWAY_TEAM_ID'] == 'BOS')
flt_yankees = (df_atbats['GAME_ID'].str.startswith('NYA')) | (df_atbats['AWAY_TEAM_ID'] == 'NYA')

In [20]:
df_atbats[flt_homers].shape

(6058, 36)

In [21]:
df_atbats.shape

(191196, 36)

In [22]:
len(df_atbats.GAME_ID.value_counts())

2430

In [23]:
df_atbats[flt_redsox]['GAME_ID'].shape

(12962,)

In [24]:
df_atbats[flt_redsox & flt_homers]['BAT_HOME_ID'].value_counts()

0    184
1    176
Name: BAT_HOME_ID, dtype: int64

In [25]:
for filter in filters:
    print(filter)
    print(df_atbats[filters[filter] & flt_homers]['BAT_HOME_ID'].value_counts())

TOR
0    213
1    209
Name: BAT_HOME_ID, dtype: int64
NYN
1    222
0    219
Name: BAT_HOME_ID, dtype: int64
HOU
0    220
1    205
Name: BAT_HOME_ID, dtype: int64
MIA
1    209
0    177
Name: BAT_HOME_ID, dtype: int64
NYA
1    226
0    202
Name: BAT_HOME_ID, dtype: int64
CIN
1    238
0    228
Name: BAT_HOME_ID, dtype: int64
ATL
0    184
1    171
Name: BAT_HOME_ID, dtype: int64
BOS
0    184
1    176
Name: BAT_HOME_ID, dtype: int64
KCA
1    198
0    189
Name: BAT_HOME_ID, dtype: int64
SEA
1    217
0    214
Name: BAT_HOME_ID, dtype: int64
SDN
1    222
0    192
Name: BAT_HOME_ID, dtype: int64
SLN
1    189
0    188
Name: BAT_HOME_ID, dtype: int64
SFN
1    161
0    146
Name: BAT_HOME_ID, dtype: int64
TBA
0    213
1    206
Name: BAT_HOME_ID, dtype: int64
CHA
0    216
1    209
Name: BAT_HOME_ID, dtype: int64
COL
1    201
0    179
Name: BAT_HOME_ID, dtype: int64
MIL
0    203
1    202
Name: BAT_HOME_ID, dtype: int64
CHN
1    216
0    198
Name: BAT_HOME_ID, dtype: int64
LAN
1    210
0    194
Name: 

In [26]:
df_atbats['GAME_ID'].str.startswith('BOS').value_counts()

False    184654
True       6542
Name: GAME_ID, dtype: int64

In [27]:
df_atbats.columns

Index(['GAME_ID', 'AWAY_TEAM_ID', 'INN_CT', 'BAT_HOME_ID', 'OUTS_CT',
       'BALLS_CT', 'STRIKES_CT', 'AWAY_SCORE_CT', 'HOME_SCORE_CT',
       'RESP_BAT_ID', 'RESP_BAT_HAND_CD', 'RESP_PIT_ID', 'RESP_PIT_HAND_CD',
       'BASE1_RUN_ID', 'BASE2_RUN_ID', 'BASE3_RUN_ID', 'EVENT_TX',
       'LEADOFF_FL', 'PH_FL', 'BAT_FLD_CD', 'BAT_LINEUP_ID', 'EVENT_CD',
       'BAT_EVENT_FL', 'AB_FL', 'H_CD', 'SH_FL', 'SF_FL', 'EVENT_OUTS_CT',
       'RBI_CT', 'WP_FL', 'PB_FL', 'ERR_CT', 'BAT_DEST_ID', 'RUN1_DEST_ID',
       'RUN2_DEST_ID', 'RUN3_DEST_ID'],
      dtype='object')

In [28]:
grp_teams = df_atbats.groupby([df_atbats['GAME_ID'].str[0:3], df_atbats['AWAY_TEAM_ID']])

In [29]:
grp_teams['EVENT_CD'].count()

GAME_ID  AWAY_TEAM_ID
ANA      ATL             217
         BAL             213
         BOS             217
         CHA             252
         CLE             227
         DET             300
         HOU             660
         KCA             302
         LAN             147
         MIN             308
         NYA             263
         OAK             722
         PHI             229
         SEA             680
         TBA             232
         TEX             783
         TOR             339
         WAS             139
ARI      ATL             259
         CHA             212
         CHN             244
         CIN             227
         CLE             229
         COL             794
         DET             156
         HOU             151
         LAN             703
         MIA             260
         MIL             242
         NYN             240
                        ... 
TOR      CLE             222
         DET             228
         HOU         

In [30]:
grp_away = df_atbats.groupby(df_atbats['AWAY_TEAM_ID'])
grp_home = df_atbats.groupby(df_atbats['GAME_ID'].str[0:3])

In [31]:
df_atbats[df_atbats['EVENT_CD'] == 23].shape

(6105, 36)

In [32]:
%matplotlib inline

In [39]:
grp_home['EVENT_CD'].head()

0         14
1          3
2          2
3          4
4          3
6230       2
6231      14
6232      20
6233       3
6234       2
12669      2
12670     14
12671      4
12672      2
12673      2
19109     20
19110      2
19111      2
19112      2
19113      2
25580      2
25581      2
25582      2
25583      2
25584      3
32122      2
32123      2
32124     14
32125      2
32126     20
          ..
152998     2
152999     2
153000     2
153001    21
153002     2
159431    20
159432     3
159433    20
159434     2
159435     2
165758     2
165759     2
165760    14
165761     2
165762    20
172048     2
172049    14
172050     3
172051     3
172052    14
178578     2
178579    23
178580     2
178581    22
178582    20
184826     2
184827     2
184828     3
184829    21
184830    14
Name: EVENT_CD, Length: 150, dtype: int64

In [34]:
df_atbats[filters['NYA'

SyntaxError: unexpected EOF while parsing (<ipython-input-34-3399975dff36>, line 1)

In [None]:
for f in var:
    print(f)
    

In [None]:
type(var)

In [None]:
var.n

In [None]:
type(var.p)

In [None]:
var.grep('^r.*')

In [None]:
flt = '*.ip*'

In [None]:
%ls {flt}