# This notebook prepares the primary data sets and visits some tips and techniques along the way

* Full data pulled from retrosheet.org
* Chadwick tools used for converting retrosheet data http://chadwick.sourceforge.net/doc/cwtools.html

In [None]:
import pandas as pd
import os
import matplotlib

# % and %% are IPython 'magics'
# ! is ipython's shell execution shortcut

In [None]:
# Show the input history of this IPython session.
%history

In [None]:
# List every file under data_public/ whose name matches *.EV*.
!ls data_public/*.EV*

In [None]:
%%sh
# Preview one event file: its first three lines, then its path, then the
# first five lines after sorting on comma-separated field 2 onwards.
event_file=data_public/2017CHA.EVA
head -n 3 "$event_file"
printf '\n%s\n\n' 'data_public/2017CHA.EVA'
sort -t, -k2 "$event_file" | head -n 5

### Here we're preparing the file.  Run the cwevent executable with -n and capture the header

In [None]:
# Chadwick expects a 'team' file in the cwd.
# -f/-n replace an existing 'team' link instead of failing with "File exists",
# so this cell can be re-run after a kernel restart.
!ln -sfn ./data_public/TEAM2017 team

In [None]:
# Run cwevent with -n on one event file and keep only the first output line
# (the header). `>` overwrites atbats.txt, so this cell resets the file.
! cwevent -n data_public/2017SEA.EVA |head -1 >data_public/atbats.txt

### Now we'll shell out and run a loop to invoke the converter on each event file.  We'll also concatenate the roster files in a separate command

In [None]:
%%sh
# Convert every event file and append the rows to atbats.txt.
# The glob is iterated directly (no `ls` parsing) and the path is quoted, so
# file names containing whitespace or glob characters are passed intact.
# Output is appended: re-run the header cell first to avoid duplicated rows.
for x in data_public/*.EV*; do
    # When nothing matches, the pattern is left as a literal string; skip it.
    [ -e "$x" ] || continue
    cwevent "$x" >>data_public/atbats.txt
done
# Concatenate all roster files into a single file (overwritten on each run).
cat data_public/*.ROS >data_public/rosters.txt

### Standard Python file to dictionary
* Constants for event codes

In [None]:
# Build a lookup from event name to integer event code.
# Each non-blank line of the file is expected to hold two whitespace-separated
# fields: "<code> <name>".
# The dict is named `event_codes` throughout; the original initialised
# `event_code` but wrote to `event_codes`, which raised NameError.
event_codes = {}
with open("data_public/event_codes.txt") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing the unpack
        val, key = line.split()
        event_codes[key] = int(val)

#### Useful techniques for interacting with the shell

* Use assignment to capture the output of your ! command

In [6]:
# Capture the output lines of `ls` in the Python variable `files`.
files = !ls

In [7]:
# Inspect what kind of object the `!` capture produced.
type(files)

IPython.utils.text.SList

In [8]:
# Display all environment variables of the kernel process.
# NOTE(review): the saved output exposes the local username and filesystem
# paths; consider clearing this cell's output before sharing the notebook.
%env

{'TERM_PROGRAM': 'iTerm.app',
 'TERM': 'xterm-color',
 'SHELL': '/bin/bash',
 'HISTSIZE': '',
 'TMPDIR': '/var/folders/8g/2d4q5l3x087dj8dbzm5z3qkm0000gn/T/',
 'Apple_PubSub_Socket_Render': '/private/tmp/com.apple.launchd.oCSKaKs53q/Render',
 'TERM_PROGRAM_VERSION': '3.2.3',
 'OLDPWD': '/Users/paulrodgers',
 'TERM_SESSION_ID': 'w0t0p0:27AE8970-6145-4337-88A3-406A74A0AACC',
 'LC_ALL': 'C',
 'HISTFILESIZE': '',
 'USER': 'paulrodgers',
 'SSH_AUTH_SOCK': '/private/tmp/com.apple.launchd.HA5RQ3IO8s/Listeners',
 '__CF_USER_TEXT_ENCODING': '0x1F5:0x0:0x0',
 'VIRTUAL_ENV': '/Users/paulrodgers/psr_dev/Salesforce/sfdc',
 'PATH': '/Users/paulrodgers/psr_dev/Salesforce/sfdc/bin:/Users/paulrodgers/psr_dev/Salesforce/sfdc/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Library/TeX/texbin',
 'PWD': '/Users/paulrodgers/psr_dev/Salesforce',
 'LANG': 'en_US.UTF-8',
 'ITERM_PROFILE': 'Default',
 'XPC_FLAGS': '0x0',
 'ITERM_ORIG_PS1': '(sfdc) \\[\\e[33m\\]\\w\\[\\e[m\\] \\[\\]',
 'PS1': '(sfdc) \\[\\e[33m

In [None]:
# Keep only the captured entries that match the regex.
# Raw string: the pattern is unchanged, but `\.` no longer triggers Python's
# invalid-escape-sequence warning.
files.grep(r'\.i.*')

* Use {expression} to pass from ipython to the shell

In [None]:
# Extension string with an extra trailing 'b'; the cells below pass it to the shell.
extension = 'ipynbb'

In [None]:
# The braces are replaced by the value of the Python variable before the shell runs.
!ls *.{extension}

* Hmm... Extra trailing 'b'. Let's take a slice of the extension string

In [None]:
# The braces hold a Python expression: the slice drops the last character.
!ls *.{extension[0:-1]}

In [None]:
# Load the converted event rows and the player table into DataFrames.
# NOTE(review): the conversion cell writes rosters.txt, not players.txt;
# confirm players.txt is supplied separately.
df_atbats, df_players = (
    pd.read_csv(csv_path)
    for csv_path in ('./data_public/atbats.txt', './data_public/players.txt')
)

In [None]:
# (rows, columns) of the loaded event table.
df_atbats.shape

In [None]:
# Peek at the first rows to check that the header and columns parsed as expected.
df_atbats.head()

In [None]:
# Rows per 3-character GAME_ID prefix, scaled by 162 and then by 9
# (presumably games per season and innings per game; TODO confirm).
df_atbats['GAME_ID'].str[:3].value_counts().div(162).div(9)

In [None]:
# Check what kind of object value_counts() returns.
type(df_atbats['AWAY_TEAM_ID'].value_counts())

In [None]:
# The index of the counts holds the distinct AWAY_TEAM_ID values.
df_atbats['AWAY_TEAM_ID'].value_counts().index

# DATA ABOVE; Filters and Groups below

In [None]:
# Boolean row masks over df_atbats.
# flt_homers: rows whose EVENT_CD equals 23.
flt_homers = df_atbats['EVENT_CD'].eq(23)
# Team masks: the team code is the GAME_ID prefix or the AWAY_TEAM_ID.
flt_redsox = df_atbats['GAME_ID'].str.startswith('BOS') | df_atbats['AWAY_TEAM_ID'].eq('BOS')
flt_yankees = df_atbats['GAME_ID'].str.startswith('NYA') | df_atbats['AWAY_TEAM_ID'].eq('NYA')

In [None]:
# One boolean mask per team code found in AWAY_TEAM_ID: rows where that code
# is either the GAME_ID prefix or the away team.
filters = {
    team: df_atbats['GAME_ID'].str.startswith(team) | df_atbats['AWAY_TEAM_ID'].eq(team)
    for team in df_atbats['AWAY_TEAM_ID'].value_counts().index
}

In [None]:
# Among BOS rows whose GAME_ID does not start with 'BOS', the number of rows
# with EVENT_CD 13.
df_atbats.loc[
    filters['BOS'] & ~df_atbats['GAME_ID'].str.startswith('BOS'),
    'EVENT_CD',
].value_counts()[13]

In [None]:
# Shape of the BOS rows whose EVENT_CD is 13.
df_atbats[filters['BOS'] & df_atbats['EVENT_CD'].eq(13)].shape

In [None]:
# List the column names available for filtering and grouping.
df_atbats.columns

In [None]:
# Number of distinct GAME_ID values.
df_atbats['GAME_ID'].nunique()

In [None]:
# Shape of the GAME_ID column restricted to the flt_redsox rows.
df_atbats.loc[flt_redsox, 'GAME_ID'].shape

In [None]:
# Count BAT_HOME_ID values over rows matching both flt_redsox and flt_homers.
df_atbats.loc[flt_redsox & flt_homers, 'BAT_HOME_ID'].value_counts()

In [None]:
# For every team mask, print the team code followed by the BAT_HOME_ID counts
# of its flt_homers rows.
# The loop variable is `team` rather than `filter` so the built-in `filter`
# is not shadowed for the rest of the kernel session.
for team, team_mask in filters.items():
    print(team)
    print(df_atbats[team_mask & flt_homers]['BAT_HOME_ID'].value_counts())

In [None]:
# How many rows have a GAME_ID starting with 'BOS' (True) versus not (False).
df_atbats['GAME_ID'].str.startswith('BOS').value_counts()

In [None]:
# Group by three keys: GAME_ID prefix, away team, and whether EVENT_CD is 23.
grp_teams_homers = df_atbats.groupby([
    df_atbats['GAME_ID'].str[:3],
    df_atbats['AWAY_TEAM_ID'],
    df_atbats['EVENT_CD'].eq(23),
])

In [None]:
# Non-null EVENT_CD count for each combination of the three grouping keys.
grp_teams_homers['EVENT_CD'].count()

In [None]:
# Group the rows by away team, and separately by the 3-character GAME_ID prefix.
# NOTE(review): the name grp_home suggests the prefix is the home team code; confirm.
grp_away = df_atbats.groupby(df_atbats['AWAY_TEAM_ID'])
grp_home = df_atbats.groupby(df_atbats['GAME_ID'].str[0:3])

In [None]:
# Number of EVENT_CD == 23 rows per AWAY_TEAM_ID.
# A GroupBy cannot be indexed with a DataFrame (the original raised), so the
# rows are filtered first and grouped afterwards.
df_atbats[df_atbats['EVENT_CD'] == 23].groupby('AWAY_TEAM_ID').size()

In [None]:
# EVENT_CD column restricted to the rows where it equals 23.
df_atbats.loc[df_atbats['EVENT_CD'].eq(23), 'EVENT_CD']

In [None]:
# Count of EVENT_CD == 23 rows per RESP_BAT_ID, most frequent first.
df_atbats.loc[df_atbats['EVENT_CD'].eq(23), 'RESP_BAT_ID'].value_counts()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# First EVENT_CD rows (head's default count) within each grp_home group.
grp_home['EVENT_CD'].head()

In [None]:
# Rows selected by the NYA mask (the original was missing the closing bracket).
df_atbats[filters['NYA']]

In [None]:
# Group the rows by their EVENT_CD value.
grp_hit_type = df_atbats.groupby(by='EVENT_CD')

In [None]:
# Descriptive statistics for each EVENT_CD group.
grp_hit_type.describe()

In [None]:
# Iterating a GroupBy yields (key, sub-frame) pairs; print each in turn.
for event_cd, event_rows in grp_hit_type:
    print(event_cd)
    print(event_rows)

In [None]:
# Group the EVENT_CD == 23 rows by EVENT_CD.
# The original indexed a Python list with 'EVENT_CD', which raises TypeError;
# grouping by the column name is presumably what was intended (TODO confirm).
df_atbats[df_atbats['EVENT_CD'] == 23].groupby('EVENT_CD')

In [None]:
# Per-BAT_HOME_ID minimum of every column over the EVENT_CD == 23 rows.
# The comparison sits outside the column lookup; the original evaluated
# 'EVENT_CD' == 23 to False and then failed looking up df_atbats[False].
df_atbats[df_atbats['EVENT_CD'] == 23].groupby('BAT_HOME_ID').min()

In [None]:
# `var` is not defined anywhere in this notebook, so these cells fail on a
# fresh kernel. The following cells use .n, .p and .grep on it, which matches
# a captured shell result; presumably it came from `!ls` (TODO confirm).
var = !ls
for f in var:
    print(f)
    

In [None]:
# Inspect the type of `var`.
type(var)

In [None]:
# Assuming `var` is an IPython SList: .n is the captured lines as a single
# newline-separated string.
var.n

In [None]:
# Assuming `var` is an IPython SList: .p exposes the entries as path objects;
# check the container type.
type(var.p)

In [None]:
# Keep only the entries matching the regex (anchored at the start, first character 'r').
var.grep('^r.*')

In [None]:
# Glob pattern handed to the %ls magic in the next cell.
flt = '*.ip*'

In [None]:
# Brace expansion also works with line magics: the value of `flt` is substituted.
%ls {flt}