In [73]:
from src.data import lineup
import pandas as pd
from pathlib import Path
import numpy as np
import recordlinkage

In [80]:
interim = Path('../data/interim')
teams = pd.read_pickle(interim / 'teams.pkl')
rosters = pd.read_pickle(interim / 'rosters.pkl')
park_records = pd.read_pickle(Path(interim) / 'park_records.pkl')

In [81]:
lp = lineup.LineupProcessor('2021-04-01')

In [82]:
lp.read_data('../data/raw/')

In [83]:
raw_lineup_wide = lp.get_raw_data()

In [84]:
recode = {
    'STL': 'SLN',
    'LAD': 'LAN',
    'TB': 'TBA',
    'NYM': 'NYN',
    'CWS': 'CHA',
    'SF': 'SFN',
    'NYY': 'NYA',
    'KC': 'KCA',
    'SD': 'SDN',
    'LAA': 'ANA',
}
for old_code, new_code in recode.items():
    raw_lineup_wide.loc[
        raw_lineup_wide.home_team_code == old_code,
        'home_team_code'
    ] = new_code

    raw_lineup_wide.loc[
        raw_lineup_wide.away_team_code == old_code,
        'away_team_code'
    ] = new_code

In [85]:
raw_lineup_wide.drop(
        columns=['home_team_name', 'away_team_name'], inplace=True)

raw_lineup_wide['id'] = np.arange(len(raw_lineup_wide))

raw_lineup = raw_lineup_wide.melt(
    id_vars=['id', 'home_team_code', 'away_team_code'],
    var_name='lineup_id',
    value_name='Name')

del raw_lineup['id']

# Get First and Last Names
raw_lineup['FirstName'] = raw_lineup.Name.str.split().str.get(0)
raw_lineup['FirstName'] = \
    raw_lineup['FirstName'].str.replace('.', '', regex=False)
raw_lineup['LastName'] = raw_lineup.Name.str.split().str.get(1)
raw_lineup['LastName'] = \
    raw_lineup['LastName'].str.replace('.', '', regex=False)
raw_lineup.drop(columns=['Name'], inplace=True)


raw_lineup.loc[
    (raw_lineup['FirstName'] == 'M') & 
    (raw_lineup['LastName'] == 'Bumgarner'), 
    'FirstName'] = 'Madison'

raw_lineup.loc[
    (raw_lineup['FirstName'] == 'Michael') & 
    (raw_lineup['LastName'] == 'Taylor'), 
    'FirstName'] = 'Michael A'

raw_lineup.loc[
    (raw_lineup['FirstName'] == 'Hyun') & 
    (raw_lineup['LastName'] == 'Jin'), 
    ['FirstName', 'LastName']] = ['Hyun Jin', 'Ryu']

# set whether players are at home or away, and spot in lineup
raw_lineup[['home', 'lineup_id']] = \
    raw_lineup['lineup_id'].str.split(pat='_', expand=True)

# set team variable for each player
raw_lineup['TEAM'] = raw_lineup['away_team_code']
raw_lineup.loc[raw_lineup.home == 'home', 'TEAM'] = \
    raw_lineup['home_team_code']

raw_lineup['base'] = (
    raw_lineup['LastName'].str.replace(r"[\"\',]", '', regex=True).str[0:4].str.pad(width=4, side='right', fillchar='-').str.lower() +
    raw_lineup['FirstName'].str[0].str.lower() + '0'
)

raw_lineup.loc[
    (raw_lineup['FirstName'] == 'Giancarlo') & 
    (raw_lineup['LastName'] == 'Stanton'), 
    'base'] = 'stanm0'

raw_lineup['id'] = np.arange(0, raw_lineup.shape[0])

raw_lineup_x = raw_lineup[['FirstName', 'LastName', 'base', 'id', 'TEAM']]
raw_lineup_x.columns = ['l_FirstName', 'LastName', 'base', 'l_id', 'l_TEAM']

In [86]:
rosters['base'] = rosters['PLAYER_ID'].str[0:6]
rosters = rosters.loc[rosters.year >= 2015]
rosters = rosters.dropna()
rosters = rosters.sort_values(['PLAYER_ID', 'year'], ascending = [True, False])

roster_match = rosters.groupby('PLAYER_ID')[['base', 'FirstName', 'LastName', 'TEAM']].first().reset_index()
roster_match.columns = ['PLAYER_ID', 'base', 'FirstName', 'LastName', 'TEAM_2019']

In [87]:
test = pd.merge(raw_lineup, roster_match, on=['base', 'LastName', 'FirstName'], how='outer', indicator = True)

test['dup_count'] = test.groupby('id')['id'].transform('count')
test = test.loc[(test.dup_count == 1) | (test.TEAM == test.TEAM_2019)]
test['dup_count'] = test.groupby('id')['id'].transform('count')

del test['dup_count']

test[test._merge == 'left_only']

Unnamed: 0,home_team_code,away_team_code,lineup_id,FirstName,LastName,home,TEAM,base,id,PLAYER_ID,TEAM_2019,_merge
249,SEA,SFN,player5,Taylor,Trammell,home,SEA,tramt0,239.0,,,left_only
271,KCA,TEX,player7,Kyle,Isbel,home,KCA,isbek0,261.0,,,left_only
272,CIN,SLN,player7,Jonathan,India,home,CIN,indij0,262.0,,,left_only


In [88]:
lineup_with_id = test.loc[test._merge == 'both']

lineup_with_id = lineup_with_id[[
    'PLAYER_ID', 'TEAM', 'lineup_id', 'home',
    'home_team_code', 'away_team_code'
]]

players_grid = lineup_with_id.pivot(
    index=['home_team_code', 'away_team_code'],
    columns=['lineup_id', 'home'],
    values=['PLAYER_ID'])

players_grid.columns = players_grid.columns.droplevel()
players_grid.columns = players_grid.columns.swaplevel(0, 1)
players_grid.columns = [
    '_'.join(col).strip() for col in players_grid.columns.values
]
players_grid = players_grid.set_index(
    ['home_pitcher', 'away_pitcher'], append=True
)

In [89]:
clean_lineups = players_grid.stack().reset_index().set_index([
    'home_team_code', 'away_team_code'
])
clean_lineups.columns = [
    'home_pitcher', 'away_pitcher', 'spot', 'BAT_ID'
]
clean_lineups[['home', 'spot']] = \
    clean_lineups['spot'].str.split(pat='_', expand=True)
clean_lineups['spot'] = clean_lineups['spot'].str.slice(start=-1)

d = {'home': True, 'away': False}
clean_lineups['home'] = clean_lineups['home'].map(d)
clean_lineups

clean_lineups['PIT_ID'] = np.where(
    clean_lineups['home'] == True,
    clean_lineups['away_pitcher'],
    clean_lineups['home_pitcher']
)

clean_lineups['OWN_PIT_ID'] = np.where(
    clean_lineups['home'] == True,
    clean_lineups['home_pitcher'],
    clean_lineups['away_pitcher']
)

clean_lineups['PIT_TEAM_ID'] = np.where(
    clean_lineups['home'] == True,
    clean_lineups.index.get_level_values('away_team_code'),
    clean_lineups.index.get_level_values('home_team_code'),
)

clean_lineups['BAT_TEAM_ID'] = np.where(
    clean_lineups['home'] == True,
    clean_lineups.index.get_level_values('home_team_code'),
    clean_lineups.index.get_level_values('away_team_code'),
)

## Come back and fix this for double headers
clean_lineups['GAME_ID'] = \
    clean_lineups.index.get_level_values('home_team_code') + \
    self.__date.replace('-', '') + '0'

clean_lineups['year'] = int(self.__year)

NameError: name 'self' is not defined

In [90]:
clean_lineups

Unnamed: 0_level_0,Unnamed: 1_level_0,home_pitcher,away_pitcher,spot,BAT_ID,home,PIT_ID,OWN_PIT_ID,PIT_TEAM_ID,BAT_TEAM_ID
home_team_code,away_team_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ANA,CHA,bundd001,gioll001,9,madrn001,False,bundd001,gioll001,ANA,CHA
ANA,CHA,bundd001,gioll001,1,andet001,False,bundd001,gioll001,ANA,CHA
ANA,CHA,bundd001,gioll001,2,robel002,False,bundd001,gioll001,ANA,CHA
ANA,CHA,bundd001,gioll001,3,abrej003,False,bundd001,gioll001,ANA,CHA
ANA,CHA,bundd001,gioll001,4,moncy001,False,bundd001,gioll001,ANA,CHA
...,...,...,...,...,...,...,...,...,...,...
WAS,NYN,schem001,degrj001,4,bellj005,True,degrj001,schem001,NYN,WAS
WAS,NYN,schem001,degrj001,5,schwk001,True,degrj001,schem001,NYN,WAS
WAS,NYN,schem001,degrj001,6,casts001,True,degrj001,schem001,NYN,WAS
WAS,NYN,schem001,degrj001,7,gomey001,True,degrj001,schem001,NYN,WAS
