# This notebook section duplicates Jupyter-intro-data <br>Full data pulled from retrosheet.org
* Chadwick tools used for converting retrosheet data http://chadwick.sourceforge.net/doc/cwtools.html

In [None]:
import pandas as pd
import os
import matplotlib

# % and %% are IPython 'magics'
# ! is ipython's shell execution shortcut

In [None]:
# Show the input history of the current IPython session.
%history

In [None]:
# List the files under data_public/ whose names match *.EV*.
!ls data_public/*.EV*

### Here we're preparing the file.  Run the cwevent executable with -n and capture the header

In [None]:
# Keep only the first line of the `cwevent -n` output (the header) and write it
# to atbats.txt. `>` truncates any existing atbats.txt.
! cwevent -n data_public/2017SEA.EVA |head -1 >data_public/atbats.txt

### Now we'll shell out and run a loop to invoke the converter on each event file.  We'll also concatenate the roster files in a separate command

In [None]:
# Run the converter on a single event file; the output goes to data_public/x.x,
# which no other cell visible in this notebook reads.
!cwevent data_public/2017ANA.EVA >data_public/x.x

In [None]:
%%sh
for x in $(ls -1 data_public/*.EV*); do cwevent $x >>data_public/atbats.txt; done
cat data_public/*.ROS >data_public/rosters.txt

#### Useful techniques for interacting with the shell

* Use assignment to capture the output of your ! command

In [None]:
# Assigning from a `!` command stores the command's output lines in `files`.
files = !ls

In [None]:
# Inspect what kind of object the capture produced.
type(files)

In [None]:
# Keep only the captured lines matching the regex (a literal dot followed by "i").
# Raw string: "\." is not a valid Python string escape, so a non-raw literal
# produces an invalid-escape warning on current Python versions.
files.grep(r'\.i.*')

* Use {expression} to pass from ipython to the shell

In [None]:
# Deliberately carries an extra trailing 'b'; the following cells correct it.
extension = 'ipynbb'

In [None]:
# {extension} is replaced with the Python value before the command reaches the shell.
!ls *.{extension}

* Hmm... Extra trailing 'b'. Let's take a slice of the extension string

In [None]:
# Any Python expression can go inside {}: slicing off the last character
# turns 'ipynbb' into 'ipynb'.
!ls *.{extension[0:-1]}

In [None]:
# Load the converted event data and the players file into DataFrames.
# NOTE(review): the shell cell earlier writes rosters.txt, but players.txt is
# what gets read here — confirm where players.txt comes from. Later cells rely
# on it having 'ID' and 'Last' columns.
df_atbats=pd.read_csv('./data_public/atbats.txt')
df_players=pd.read_csv('./data_public/players.txt')

#### Here we're reading a two column file into a Python key:val (dictionary) data structure
* This allows us to use 'DOUBLE' and 'HR' vs 21 and 23 to identify event rows of interest

In [None]:
# Build a name -> numeric-code lookup from a two-column text file.
# Each data line is "<numeric code> <name>", separated by whitespace.
event_code = {}
with open("./data_public/event_codes.txt") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        # Lines with any other number of fields still raise ValueError here.
        (val, key) = fields
        event_code[key] = int(val)

# DATA ABOVE; Filters and Groups below

#### Create a few filters to restrict rows (filter is not a defined Pandas term)
* These types of restrictions return a pandas series
* The series is named for the index 
* Each series is a set of booleans indexed identically to the source

In [None]:
# Boolean masks over the rows of df_atbats (one True/False entry per row).
# Home runs: EVENT_CD equals the code stored under 'HR'.
flt_homers = df_atbats['EVENT_CD'].eq(event_code['HR'])
# Team masks: GAME_ID starts with the team code, or AWAY_TEAM_ID equals it.
flt_redsox = df_atbats['GAME_ID'].str.startswith('BOS') | df_atbats['AWAY_TEAM_ID'].eq('BOS')
flt_yankees = df_atbats['GAME_ID'].str.startswith('NYA') | df_atbats['AWAY_TEAM_ID'].eq('NYA')

In [None]:
# Peek at the first few entries of the boolean mask.
flt_redsox.head()

#### Using value_counts to inspect a column
* Here we use value_counts to generate list of entities from a series 
* Functionally identical to "<b> cat file |cut -d, -f4|sort|uniq -c |sort -rn</b>"
* Roughly equivalent to "<b>select column, count(column) from table group by column order by count(column) desc</b>"
* value_counts() takes a series or index and returns a series 
* Note that the index of the series is the value in the column

#### First we'll inspect the results of a filter 

In [None]:
# Count the True and False entries of the home-run mask.
flt_homers.value_counts()

#### Next we'll look at a true data series

In [None]:
# Number of rows for each AWAY_TEAM_ID value.
df_atbats['AWAY_TEAM_ID'].value_counts()

* Here's the index of the generated series

In [None]:
# The index of the counts series holds the distinct AWAY_TEAM_ID values.
df_atbats['AWAY_TEAM_ID'].value_counts().index

* An index is an iterable object
* We traverse it here to make dictionaries of filters

In [None]:
# Per-team boolean masks, keyed by team code.
#   flt_home_team[t]: GAME_ID starts with t and BAT_HOME_ID is 1
#   flt_away_team[t]: AWAY_TEAM_ID is t and BAT_HOME_ID is 0
flt_home_team, flt_away_team = {}, {}
for team in df_atbats['AWAY_TEAM_ID'].value_counts().index:
    flt_home_team[team] = (
        df_atbats['GAME_ID'].str.startswith(team) & df_atbats['BAT_HOME_ID'].eq(1)
    )
    flt_away_team[team] = (
        df_atbats['AWAY_TEAM_ID'].eq(team) & df_atbats['BAT_HOME_ID'].eq(0)
    )

* Our first practical question... How many doubles did the Red Sox have in 2017?

In [None]:
# Rows selected by either BOS mask whose EVENT_CD is the 'DOUBLE' code;
# .shape gives (row count, column count).
df_atbats.loc[
    (flt_home_team['BOS'] | flt_away_team['BOS'])
    & df_atbats['EVENT_CD'].eq(event_code['DOUBLE'])
].shape

* Who had them??

In [None]:
# Same row selection as the doubles count, keeping only the RESP_BAT_ID column.
df_atbats.loc[
    (flt_home_team['BOS'] | flt_away_team['BOS'])
    & df_atbats['EVENT_CD'].eq(event_code['DOUBLE']),
    'RESP_BAT_ID',
]

## Merging data sets
* Let's revisit the Red Sox Doubles

In [None]:
type(df_atbats[((flt_home_team['BOS']) | (flt_away_team['BOS'])) \
          & (df_atbats['EVENT_CD'] == event_code['DOUBLE'])]['RESP_BAT_ID'])

In [None]:
type(df_atbats[((flt_home_team['BOS']) | (flt_away_team['BOS'])) \
          & (df_atbats['EVENT_CD'] == event_code['DOUBLE'])])

* Pandas merge() is a dataframe method so we'll have to use our dataframe version of the result set

In [None]:
# Show the first three rows of the players table.
df_players.head(3)

In [None]:
# Join the BOS doubles rows to the players table (RESP_BAT_ID <-> ID) and
# count rows per 'Last' value.
# NOTE(review): counting by 'Last' merges players who share a surname, and
# repeated IDs in df_players would inflate the counts — confirm both.
df_atbats.loc[
    (flt_home_team['BOS'] | flt_away_team['BOS'])
    & df_atbats['EVENT_CD'].eq(event_code['DOUBLE'])
].merge(df_players, left_on='RESP_BAT_ID', right_on='ID')['Last'].value_counts()

In [None]:
# Column labels of the events frame.
df_atbats.columns

In [None]:
# Number of distinct GAME_ID values.
len(df_atbats.GAME_ID.value_counts())

In [None]:
# Shape of the GAME_ID column restricted to rows selected by flt_redsox.
df_atbats[flt_redsox]['GAME_ID'].shape

In [None]:
# Among rows selected by both flt_redsox and flt_homers, count rows per BAT_HOME_ID value.
df_atbats[flt_redsox & flt_homers]['BAT_HOME_ID'].value_counts()

In [None]:
# How many rows have a GAME_ID starting with 'BOS' (True) versus not (False).
df_atbats['GAME_ID'].str.startswith('BOS').value_counts()

In [None]:
# Row count for every (AWAY_TEAM_ID, EVENT_CD) pair; the result is indexed by both keys.
x = (
    df_atbats
    .groupby(['AWAY_TEAM_ID', 'EVENT_CD'])['EVENT_CD']
    .count()
)

In [None]:
# Type of the grouped count result.
type(x)

In [None]:
# Length of the second index level's values (one entry per row of x).
len(x.index.get_level_values(1))

In [None]:
# Turn the two index levels back into ordinary columns.
# The counts need their own column name: x is itself named 'EVENT_CD', which
# collides with the 'EVENT_CD' index level and makes a bare reset_index() raise.
x.reset_index(name='count')

In [None]:
# Group rows by (first three characters of GAME_ID, AWAY_TEAM_ID, is-home-run flag).
# The home-run code comes from event_code rather than a hard-coded 23, matching
# how flt_homers is built.
grp_teams_homers = df_atbats.groupby([
    df_atbats['GAME_ID'].str[0:3],
    df_atbats['AWAY_TEAM_ID'],
    df_atbats['EVENT_CD'] == event_code['HR'],
])

In [None]:
# Row count for each combination of the three grouping keys.
grp_teams_homers['EVENT_CD'].count()

In [None]:
# Two groupings of the full frame: by AWAY_TEAM_ID, and by the first three
# characters of GAME_ID.
grp_away = df_atbats.groupby(df_atbats['AWAY_TEAM_ID'])
grp_home = df_atbats.groupby(df_atbats['GAME_ID'].str[0:3])

In [None]:
# Home-run rows counted per AWAY_TEAM_ID.
# A GroupBy object can only be indexed by column labels, not by a DataFrame,
# so the rows are filtered first and grouped afterwards.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']].groupby('AWAY_TEAM_ID')['EVENT_CD'].count()

In [None]:
# EVENT_CD column of the home-run rows; the code is looked up in event_code
# instead of being hard-coded.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']]['EVENT_CD']

In [None]:
# Home-run rows counted per RESP_BAT_ID; the code is looked up in event_code
# instead of being hard-coded.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']]['RESP_BAT_ID'].value_counts()

In [None]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
# head() on a grouped column returns the first rows of every group.
grp_home['EVENT_CD'].head()

In [None]:
# Rows selected by the flt_yankees mask defined earlier in the notebook.
# (`filters` is not defined anywhere in this notebook and the bracket was
# unclosed.) head() keeps the displayed output short.
df_atbats[flt_yankees].head()

In [None]:
# One group per distinct EVENT_CD value.
grp_hit_type=df_atbats.groupby('EVENT_CD')

In [None]:
# Summary statistics for each group.
grp_hit_type.describe()

In [None]:
# Walk the groups: each iteration yields the group key and that group's rows.
for grpname, grprec in grp_hit_type:
    print(grpname, grprec, sep='\n')

In [None]:
# Group the home-run rows by EVENT_CD and show the size of each group.
# Grouping by the column label replaces an expression that indexed a Python
# list with a string, which raises TypeError.
# NOTE(review): the original intent is unclear — confirm that a per-code row
# count is what was wanted here.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']].groupby('EVENT_CD').size()

In [None]:
# Column-wise minimum of the home-run rows, per BAT_HOME_ID value.
# The comparison has to sit outside the column lookup: 'EVENT_CD' == 23
# inside the brackets is just False, which is not a column.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']].groupby('BAT_HOME_ID').min()

In [None]:
for f in var:
    print(f)
    

In [None]:
type(var)

In [None]:
var.n

In [None]:
type(var.p)

In [None]:
var.grep('^r.*')

In [None]:
# Glob pattern held in a Python variable.
flt = '*.ip*'

In [None]:
# {flt} is substituted into the %ls magic line before it runs.
%ls {flt}

#### Create a few filters to restrict rows (filter is not a defined Pandas term)
* These types of restrictions return a pandas series
* The series is named for the index 
* Each series is a set of booleans indexed identically to the source

In [None]:
# Boolean masks over the rows of df_atbats (one True/False entry per row).
# Home runs: EVENT_CD equals the code stored under 'HR'.
flt_homers = df_atbats['EVENT_CD'].eq(event_code['HR'])
# Team masks: GAME_ID starts with the team code, or AWAY_TEAM_ID equals it.
flt_redsox = df_atbats['GAME_ID'].str.startswith('BOS') | df_atbats['AWAY_TEAM_ID'].eq('BOS')
flt_yankees = df_atbats['GAME_ID'].str.startswith('NYA') | df_atbats['AWAY_TEAM_ID'].eq('NYA')

In [None]:
# Peek at the first few entries of the boolean mask.
flt_redsox.head()

#### Using value_counts to inspect a column
* Here we use value_counts to generate list of entities from a series 
* Functionally identical to "<b> cat file |cut -d, -f4|sort|uniq -c |sort -rn</b>"
* Roughly equivalent to "<b>select column, count(column) from table group by column order by count(column) desc</b>"
* value_counts() takes a series or index and returns a series 
* Note that the index of the series is the value in the column

#### First we'll inspect the results of a filter 

In [None]:
# Count the True and False entries of the home-run mask.
flt_homers.value_counts()

#### Next we'll look at a true data series

In [None]:
# Number of rows for each AWAY_TEAM_ID value.
df_atbats['AWAY_TEAM_ID'].value_counts()

* Here's the index of the generated series

In [None]:
# The index of the counts series holds the distinct AWAY_TEAM_ID values.
df_atbats['AWAY_TEAM_ID'].value_counts().index

* An index is an iterable object
* We traverse it here to make dictionaries of filters

In [None]:
# Per-team boolean masks, keyed by team code.
#   flt_home_team[t]: GAME_ID starts with t and BAT_HOME_ID is 1
#   flt_away_team[t]: AWAY_TEAM_ID is t and BAT_HOME_ID is 0
flt_home_team, flt_away_team = {}, {}
for team in df_atbats['AWAY_TEAM_ID'].value_counts().index:
    flt_home_team[team] = (
        df_atbats['GAME_ID'].str.startswith(team) & df_atbats['BAT_HOME_ID'].eq(1)
    )
    flt_away_team[team] = (
        df_atbats['AWAY_TEAM_ID'].eq(team) & df_atbats['BAT_HOME_ID'].eq(0)
    )

* Our first practical question... How many doubles did the Red Sox have in 2017?

In [None]:
# Rows selected by either BOS mask whose EVENT_CD is the 'DOUBLE' code;
# .shape gives (row count, column count).
df_atbats.loc[
    (flt_home_team['BOS'] | flt_away_team['BOS'])
    & df_atbats['EVENT_CD'].eq(event_code['DOUBLE'])
].shape

* Who had them??

In [None]:
# Same row selection as the doubles count, keeping only the RESP_BAT_ID column.
df_atbats.loc[
    (flt_home_team['BOS'] | flt_away_team['BOS'])
    & df_atbats['EVENT_CD'].eq(event_code['DOUBLE']),
    'RESP_BAT_ID',
]

## Merging data sets
* Let's revisit the Red Sox Doubles

In [None]:
type(df_atbats[((flt_home_team['BOS']) | (flt_away_team['BOS'])) \
          & (df_atbats['EVENT_CD'] == event_code['DOUBLE'])]['RESP_BAT_ID'])

In [None]:
type(df_atbats[((flt_home_team['BOS']) | (flt_away_team['BOS'])) \
          & (df_atbats['EVENT_CD'] == event_code['DOUBLE'])])

* Pandas merge() is a dataframe method so we'll have to use our dataframe version of the result set

In [None]:
# Show the first three rows of the players table.
df_players.head(3)

In [None]:
# Join the BOS doubles rows to the players table (RESP_BAT_ID <-> ID) and
# count rows per 'Last' value.
# NOTE(review): counting by 'Last' merges players who share a surname, and
# repeated IDs in df_players would inflate the counts — confirm both.
df_atbats.loc[
    (flt_home_team['BOS'] | flt_away_team['BOS'])
    & df_atbats['EVENT_CD'].eq(event_code['DOUBLE'])
].merge(df_players, left_on='RESP_BAT_ID', right_on='ID')['Last'].value_counts()

In [None]:
# Column labels of the events frame.
df_atbats.columns

In [None]:
# Number of distinct GAME_ID values.
len(df_atbats.GAME_ID.value_counts())

In [None]:
# Shape of the GAME_ID column restricted to rows selected by flt_redsox.
df_atbats[flt_redsox]['GAME_ID'].shape

In [None]:
# Among rows selected by both flt_redsox and flt_homers, count rows per BAT_HOME_ID value.
df_atbats[flt_redsox & flt_homers]['BAT_HOME_ID'].value_counts()

In [None]:
# How many rows have a GAME_ID starting with 'BOS' (True) versus not (False).
df_atbats['GAME_ID'].str.startswith('BOS').value_counts()

In [None]:
# Row count for every (AWAY_TEAM_ID, EVENT_CD) pair; the result is indexed by both keys.
x = (
    df_atbats
    .groupby(['AWAY_TEAM_ID', 'EVENT_CD'])['EVENT_CD']
    .count()
)

In [None]:
# Type of the grouped count result.
type(x)

In [None]:
# Length of the second index level's values (one entry per row of x).
len(x.index.get_level_values(1))

In [None]:
# Turn the two index levels back into ordinary columns.
# The counts need their own column name: x is itself named 'EVENT_CD', which
# collides with the 'EVENT_CD' index level and makes a bare reset_index() raise.
x.reset_index(name='count')

In [None]:
# Group rows by (first three characters of GAME_ID, AWAY_TEAM_ID, is-home-run flag).
# The home-run code comes from event_code rather than a hard-coded 23, matching
# how flt_homers is built.
grp_teams_homers = df_atbats.groupby([
    df_atbats['GAME_ID'].str[0:3],
    df_atbats['AWAY_TEAM_ID'],
    df_atbats['EVENT_CD'] == event_code['HR'],
])

In [None]:
# Row count for each combination of the three grouping keys.
grp_teams_homers['EVENT_CD'].count()

In [None]:
# Two groupings of the full frame: by AWAY_TEAM_ID, and by the first three
# characters of GAME_ID.
grp_away = df_atbats.groupby(df_atbats['AWAY_TEAM_ID'])
grp_home = df_atbats.groupby(df_atbats['GAME_ID'].str[0:3])

In [None]:
# Home-run rows counted per AWAY_TEAM_ID.
# A GroupBy object can only be indexed by column labels, not by a DataFrame,
# so the rows are filtered first and grouped afterwards.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']].groupby('AWAY_TEAM_ID')['EVENT_CD'].count()

In [None]:
# EVENT_CD column of the home-run rows; the code is looked up in event_code
# instead of being hard-coded.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']]['EVENT_CD']

In [None]:
# Home-run rows counted per RESP_BAT_ID; the code is looked up in event_code
# instead of being hard-coded.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']]['RESP_BAT_ID'].value_counts()

In [None]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
# head() on a grouped column returns the first rows of every group.
grp_home['EVENT_CD'].head()

In [None]:
# Rows selected by the flt_yankees mask defined earlier in the notebook.
# (`filters` is not defined anywhere in this notebook and the bracket was
# unclosed.) head() keeps the displayed output short.
df_atbats[flt_yankees].head()

In [None]:
# One group per distinct EVENT_CD value.
grp_hit_type=df_atbats.groupby('EVENT_CD')

In [None]:
# Summary statistics for each group.
grp_hit_type.describe()

In [None]:
# Walk the groups: each iteration yields the group key and that group's rows.
for grpname, grprec in grp_hit_type:
    print(grpname, grprec, sep='\n')

In [None]:
# Group the home-run rows by EVENT_CD and show the size of each group.
# Grouping by the column label replaces an expression that indexed a Python
# list with a string, which raises TypeError.
# NOTE(review): the original intent is unclear — confirm that a per-code row
# count is what was wanted here.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']].groupby('EVENT_CD').size()

In [None]:
# Column-wise minimum of the home-run rows, per BAT_HOME_ID value.
# The comparison has to sit outside the column lookup: 'EVENT_CD' == 23
# inside the brackets is just False, which is not a column.
df_atbats[df_atbats['EVENT_CD'] == event_code['HR']].groupby('BAT_HOME_ID').min()

In [None]:
for f in var:
    print(f)
    

In [None]:
type(var)

In [None]:
var.n

In [None]:
type(var.p)

In [None]:
var.grep('^r.*')

In [None]:
# Glob pattern held in a Python variable.
flt = '*.ip*'

In [None]:
# {flt} is substituted into the %ls magic line before it runs.
%ls {flt}