In [26]:
import pandas as pd
import os
import glob
from pathlib import Path
import recordlinkage

In [27]:
# Root folder of the raw data; the lineup file read here is the 2019-08-04 one.
input_filepath = '~/github/bts/data/raw'
lineups_csv = Path(input_filepath).joinpath("Lineups", "lineups2019-08-04.csv")
# One wide table straight from the CSV; the cells below reshape it.
df = pd.read_csv(lineups_csv)

In [28]:
# Load the processed rosters pickle and keep only the rows whose Year equals
# the string '2019' (the comparison is against a str, not an int).
clean_rosters = '~/github/bts/data/processed/rosters.pkl'
rosters = pd.read_pickle(clean_rosters)
roster2019 = rosters.loc[rosters['Year'] == '2019']

In [29]:
# Reshape the wide lineup table into one row per listed player.
# The team-name columns are dropped on a new frame rather than in place, so
# `df` is left untouched and this cell can be re-run without a KeyError on the
# already-removed columns.
raw_lineup_wide = df.drop(columns=['home_team_name', 'away_team_name'])

# Every column other than the two team codes becomes a (lineup_id, Name) row.
raw_lineup = raw_lineup_wide.melt(
    id_vars=['home_team_code', 'away_team_code'],
    var_name='lineup_id',
    value_name='Name')

# Get First and Last Names: the first and second whitespace-separated tokens
# of Name, with periods removed. Any further tokens are ignored.
name_tokens = raw_lineup['Name'].str.split()
raw_lineup['FirstName'] = name_tokens.str.get(0).str.replace('.', '', regex=False)
raw_lineup['LastName'] = name_tokens.str.get(1).str.replace('.', '', regex=False)
raw_lineup = raw_lineup.drop(columns=['Name'])

# set whether players are at home or away, and spot in lineup
# The melted labels are split on the first underscore only (n=1), so the part
# before it goes to `home` and everything after it stays in `lineup_id`, even
# if the remainder itself contains an underscore.
raw_lineup[['home', 'lineup_id']] = raw_lineup['lineup_id'].str.split(
    pat='_', n=1, expand=True)

# set team variable for each player: default to the away team code, then
# overwrite the rows flagged 'home' with the home team code
raw_lineup['TEAM'] = raw_lineup['away_team_code']
is_home = raw_lineup['home'] == 'home'
raw_lineup.loc[is_home, 'TEAM'] = raw_lineup.loc[is_home, 'home_team_code']

In [30]:
# set up record linking
# `thresh` is handed to compare.string() below as its `threshold` argument for
# the first-name comparison.
# NOTE(review): 0.4 is a magic number here -- confirm it was tuned deliberately.
thresh = 0.4
# Candidate pairs are built by blocking on TEAM and LastName.
# Per the recordlinkage docs, blocking pairs only records that agree on every
# blocking key -- TODO confirm for the installed version.
indexer = recordlinkage.Index()
indexer.block(['TEAM', 'LastName'])
# raw_lineup is passed first and roster2019 second, here and in compute() below.
pairs = indexer.index(raw_lineup, roster2019)

# Generate matches
# Three comparisons are registered, labelled Team, LastName and FirstName.
compare = recordlinkage.Compare()
compare.exact('TEAM', 'TEAM', label='Team')
compare.exact('LastName', 'LastName', label='LastName')
# String comparison of first names using the library's default method;
# presumably yields 1 when similarity reaches `thresh`, else 0 -- verify
# against the recordlinkage documentation.
compare.string('FirstName','FirstName',threshold=thresh,label='FirstName')
# `features` is computed for the candidate pairs built above.
features = compare.compute(pairs, raw_lineup, roster2019)

In [34]:
# keep best matches: a candidate pair survives only when the sum of its
# comparison features reaches 3
matched_pairs = features.index[features.sum(axis=1) >= 3]

# Read the two index levels by position instead of relying on reset_index()
# producing 'level_0'/'level_1', which only happens while both source indexes
# are unnamed. Level 0 holds raw_lineup labels, level 1 roster2019 labels.
lineup_with_id = pd.DataFrame({
    'level_0': matched_pairs.get_level_values(0),
    'level_1': matched_pairs.get_level_values(1),
})

# merge matches: roster columns first, then lineup columns. Lineup columns
# whose names collide with ones already present get the '_lineup' suffix, so
# the unsuffixed TEAM selected below is the roster's.
lineup_with_id = (
    lineup_with_id
    .join(roster2019, on='level_1', rsuffix='_roster')
    .join(raw_lineup, on='level_0', rsuffix='_lineup')
)

# clean up matched data: keep the six columns of interest, already in their
# final order (game keys, side, slot, then player id and team), with a fresh
# 0..n-1 index. This replaces the set_index()/reset_index() round trip, whose
# only effect was to reorder the columns.
lineup_with_id = lineup_with_id[
    ['home_team_code', 'away_team_code', 'home', 'lineup_id', 'PLAYER_ID', 'TEAM']
].reset_index(drop=True)

    home_team_code away_team_code  home lineup_id PLAYER_ID TEAM
0              BAL            TOR  away   pitcher  reids001  TOR
1              PHI            CWS  away   pitcher  loper003  CWS
2              PHI            CWS  away   player9  loper003  CWS
3              CLE            LAA  away   pitcher  barrj003  LAA
4               TB            MIA  away   pitcher  smitc006  MIA
..             ...            ...   ...       ...       ...  ...
295            HOU            SEA  home   player9  reddj001  HOU
296            MIN             KC  home   player9  cavej001  MIN
297            TEX            DET  home   player9  mathj001  TEX
298            OAK            STL  home   player9  garnd001  OAK
299            NYY            BOS  home   player9  taucm001  NYY

[300 rows x 6 columns]


In [41]:
# One row per game (home/away team codes); one PLAYER_ID column per
# (lineup_id, home) combination.
players_grid = lineup_with_id.pivot(
    index=['home_team_code', 'away_team_code'],
    columns=['lineup_id', 'home'],
    values=['PLAYER_ID'])

# Flatten the three-level header (values, lineup_id, home) into plain
# "<home>_<lineup_id>" strings, e.g. the 'home_pitcher' / 'away_pitcher'
# columns used just below.
players_grid.columns = [
    '_'.join((side, slot)).strip()
    for _, slot, side in players_grid.columns
]

# Carry both starting pitchers along as extra index levels so that stacking
# repeats them on every remaining row of the same game.
players_grid = players_grid.set_index(['home_pitcher', 'away_pitcher'], append=True)

# Back to long form: one row per remaining non-null column value, keyed by game.
batters_long = players_grid.stack().reset_index()
clean_lineups = batters_long.set_index(['home_team_code', 'away_team_code'])
clean_lineups.columns = ['home_pitcher', 'away_pitcher', 'spot', 'BAT_ID']

# Split "<home>_<slot>" apart again, then keep only the last character of the
# slot label as the batting-order spot.
clean_lineups[['home', 'spot']] = clean_lineups['spot'].str.split(pat='_', expand=True)
clean_lineups['spot'] = clean_lineups['spot'].str[-1:]

# Turn the side label into a flag: True for 'home', False for 'away'.
d = {'home': True, 'away': False}
clean_lineups['home'] = clean_lineups['home'].map(d)
clean_lineups

Unnamed: 0_level_0,Unnamed: 1_level_0,home_pitcher,away_pitcher,spot,BAT_ID,home
home_team_code,away_team_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ARI,WSH,clart003,corbp001,9,corbp001,False
ARI,WSH,clart003,corbp001,1,turnt001,False
ARI,WSH,clart003,corbp001,2,eatoa002,False
ARI,WSH,clart003,corbp001,3,renda001,False
ARI,WSH,clart003,corbp001,4,sotoj001,False
...,...,...,...,...,...,...
TEX,DET,payap001,zimmj003,3,andre001,True
TEX,DET,payap001,zimmj003,4,mazan001,True
TEX,DET,payap001,zimmj003,5,calhw001,True
TEX,DET,payap001,zimmj003,6,odorr001,True
