In [1]:
import pandas as pd
import os
import glob
from pathlib import Path

# The project root is taken to be the parent of the directory the notebook
# is executed from.
cwd = os.getcwd()
project_dir = Path(cwd).resolve().parents[0]

# Both data folders hang off the project root. os.path.join returns plain
# strings, and the trailing slash of each literal is kept in the result.
raw_data_dir, interim_data_dir = (
    os.path.join(project_dir, subfolder)
    for subfolder in ('data/raw/', 'data/interim/')
)

In [2]:
# Load the roster table from the interim data directory.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling,
# so this file must only ever be one this project wrote itself.
clean_rosters = os.path.join(interim_data_dir, 'rosters.pkl')
rosters = pd.read_pickle(clean_rosters)

# Load the raw lineups CSV.
lineups_csv = os.path.join(raw_data_dir, "Lineups/lineups2019-08-04.csv")
lineups = pd.read_csv(lineups_csv)

# GAME_ID is the id column handed to pd.wide_to_long in the next cell, and
# wide_to_long raises ValueError when its id values are not unique. The home
# team code alone repeats whenever a team hosts twice in one file (e.g. a
# doubleheader), so repeats get a "_<n>" suffix. The first occurrence of each
# code keeps the bare code, so files without repeats produce exactly the same
# GAME_ID values as before.
home_codes = lineups['home_team_code'].astype(str)
home_game_number = home_codes.groupby(home_codes).cumcount() + 1
lineups['GAME_ID'] = lineups['home_team_code'].where(
    home_game_number == 1,
    home_codes + '_' + home_game_number.astype(str),
)

In [3]:
# Pass 1: unpivot the suffixed home_player*/away_player* columns so each row
# is one (GAME_ID, spot) pair, where `spot` is the suffix of the original
# column. The two name columns are then relabelled BAT_NAME1 (home) and
# BAT_NAME0 (away) so they share a stub for the second pass.
by_spot = (
    pd.wide_to_long(lineups, ["home_player", "away_player"], i=['GAME_ID'], j="spot")
    .rename(columns={'home_player': 'BAT_NAME1', 'away_player': 'BAT_NAME0'})
    .reset_index()
)

# Pass 2: unpivot BAT_NAME1/BAT_NAME0 into a single BAT_NAME column; the
# suffix lands in `home`, which is therefore 1 for home-side rows and 0 for
# away-side rows.
a = (
    pd.wide_to_long(by_spot, ["BAT_NAME"], i=["GAME_ID", "spot"], j="home")
    .reset_index()
)

In [4]:
# Split each batter name on whitespace once and reuse the tokens.
# NOTE(review): only the first two tokens are used, so a name with more than
# two whitespace-separated tokens keeps just its second token as LastName.
# Confirm this matches how the roster table stores such names.
name_tokens = a['BAT_NAME'].str.split()

# Periods are stripped from both name parts (literal replace, not a regex).
a['FirstName'] = name_tokens.str.get(0).str.replace('.', '', regex=False)
a['LastName'] = name_tokens.str.get(1).str.replace('.', '', regex=False)

# A batter's own team is the home team on home rows (home == 1) and the away
# team on every other row.
a['TEAM'] = a['home_team_code'].where(a['home'] == 1, a['away_team_code'])

In [5]:
# Keep only the columns used by the linkage and pitcher steps below; GAME_ID
# and the raw BAT_NAME are dropped here.
batter_columns = [
    'spot', 'home', 'home_team_code', 'away_team_code',
    'away_pitcher', 'home_pitcher', 'FirstName', 'LastName', 'TEAM',
]
a = a[batter_columns]

In [6]:
# Restrict the roster table to rows whose Year equals the string '2019'.
is_2019 = rosters['Year'] == '2019'
roster2019 = rosters[is_2019]

In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# imports cell at the top so dependencies are visible in one place.
import recordlinkage

# Build candidate pairs between lineup rows (`a`) and 2019 roster rows,
# blocking on TEAM and LastName so only rows agreeing on both are paired.
indexer = recordlinkage.Index()
indexer.block(['TEAM', 'LastName'])
pairs = indexer.index(a, roster2019)
# Prints: lineup row count, roster row count, candidate pair count.
print(len(a), len(roster2019), len(pairs))

# Score every candidate pair on three features: exact TEAM, exact LastName,
# and a string-similarity comparison of FirstName with threshold=0.4.
# NOTE(review): because blocking already requires TEAM and LastName to agree,
# FirstName is presumably the only feature that can differ between pairs.
compare = recordlinkage.Compare()
compare.exact('TEAM', 'TEAM', label='Team')
compare.exact('LastName', 'LastName', label='LastName')
compare.string('FirstName','FirstName',threshold=0.4,label='FirstName')
features = compare.compute(pairs, a, roster2019)

270 1568 283


In [8]:
# Number of candidate pairs at each total score (sum of the three comparison
# features), highest score first.
pair_scores = features.sum(axis=1)
pair_scores.value_counts().sort_index(ascending=False)

3.0    270
2.0     13
dtype: int64

In [9]:
# Keep only candidate pairs whose three comparison features sum to 3 (every
# comparison agreed). reset_index() exposes the pair index as level_0 (row
# label in `a`) and level_1 (row label in `roster2019`).
batter_pairs = features[features.sum(axis=1) >= 3].reset_index()[['level_0', 'level_1']]

# The joins below silently drop lineup rows with no accepted pair and
# duplicate rows that have several, and a matching pair count alone cannot
# rule out both happening at once. Report either case; the resulting table
# itself is built exactly as before.
unlinked_rows = a.index.difference(batter_pairs['level_0'])
multi_linked_rows = batter_pairs.loc[batter_pairs['level_0'].duplicated(), 'level_0'].unique()
if len(unlinked_rows) or len(multi_linked_rows):
    print('WARNING: {} lineup rows have no roster match and {} lineup rows match '
          'more than one roster entry'.format(len(unlinked_rows), len(multi_linked_rows)))

# Pull in the roster columns (including PLAYER_ID) via level_1, then the
# lineup columns via level_0. Lineup columns that clash with roster columns
# receive the '_lineup' suffix, so the unsuffixed TEAM kept below is the
# roster's.
test = batter_pairs.join(roster2019, on='level_1', rsuffix='_roster')
test = test.join(a, on='level_0', rsuffix='_lineup')
test = test[['PLAYER_ID', 'TEAM', 'spot', 'home', 'home_team_code',
             'away_team_code', 'home_pitcher', 'away_pitcher']]
test = test.rename(columns={'PLAYER_ID':'BAT_ID'})

In [10]:
# Each batter faces the other side's pitcher: home batters (home == 1) get
# the away pitcher and away team code, all other rows get the home ones.
batter_is_home = test['home'] == 1
test['Pitcher'] = test['away_pitcher'].where(batter_is_home, test['home_pitcher'])
test['PitcherTEAM'] = test['away_team_code'].where(batter_is_home, test['home_team_code'])

# Split the pitcher name the same way the batter names were split: first
# token -> FirstName, second token -> LastName, with periods stripped
# (literal replace, not a regex).
pitcher_tokens = test['Pitcher'].str.split()
test['FirstName'] = pitcher_tokens.str.get(0).str.replace('.', '', regex=False)
test['LastName'] = pitcher_tokens.str.get(1).str.replace('.', '', regex=False)

# Free up the TEAM name for the pitcher's team so the next linkage step can
# block on TEAM/LastName; the batter's team moves to BAT_TEAM.
test = test.rename(columns={"TEAM": "BAT_TEAM", 'PitcherTEAM': 'TEAM'})

In [11]:
# Build candidate pairs between batter rows (`test`, whose TEAM/LastName now
# describe the opposing pitcher) and 2019 roster rows, blocking on TEAM and
# LastName so only rows agreeing on both are paired.
indexer = recordlinkage.Index()
indexer.block(['TEAM', 'LastName'])
pairs = indexer.index(test, roster2019)
# Prints: batter row count, roster row count, candidate pair count.
print(len(test), len(roster2019), len(pairs))

# Score every candidate pair on three features: exact TEAM, exact LastName,
# and a string-similarity comparison of FirstName with threshold=0.4.
# This overwrites the `features` table produced by the batter linkage.
compare = recordlinkage.Compare()
compare.exact('TEAM', 'TEAM', label='Team')
compare.exact('LastName', 'LastName', label='LastName')
compare.string('FirstName','FirstName',threshold=0.4,label='FirstName')
features = compare.compute(pairs, test, roster2019)

270 1568 279


In [12]:
# Number of pitcher candidate pairs at each total score (sum of the three
# comparison features), highest score first.
pair_scores = features.sum(axis=1)
pair_scores.value_counts().sort_index(ascending=False)

3.0    270
2.0      9
dtype: int64

In [15]:
# Keep only candidate pairs whose three comparison features sum to 3 (every
# comparison agreed). reset_index() exposes the pair index as level_0 (row
# label in `test`) and level_1 (row label in `roster2019`).
pitcher_pairs = features[features.sum(axis=1) >= 3].reset_index()[['level_0', 'level_1']]

# A batter row whose opposing pitcher has no accepted pair disappears in the
# joins below, and one with several accepted pairs is duplicated. Report
# either case; the resulting table itself is built exactly as before.
unlinked_rows = test.index.difference(pitcher_pairs['level_0'])
multi_linked_rows = pitcher_pairs.loc[pitcher_pairs['level_0'].duplicated(), 'level_0'].unique()
if len(unlinked_rows) or len(multi_linked_rows):
    print('WARNING: {} batter rows have no linked pitcher and {} batter rows link to '
          'more than one roster entry'.format(len(unlinked_rows), len(multi_linked_rows)))

# Roster columns come in via level_1 (PLAYER_ID becomes PITCHER_ID), then the
# batter table via level_0. Batter-table columns that clash with roster
# columns get the '_pitcher_lineup' suffix, which is why the batter table's
# TEAM arrives as TEAM_pitcher_lineup before being renamed to PITCHER_TEAM.
test2 = pitcher_pairs.join(roster2019, on='level_1', rsuffix='_pitcher_roster')
test2 = test2.rename(columns={'PLAYER_ID':'PITCHER_ID'})
test2 = test2.join(test, on='level_0', rsuffix='_pitcher_lineup')
test2 = test2.rename(columns={'TEAM_pitcher_lineup':'PITCHER_TEAM'})
test2

Unnamed: 0,level_0,level_1,PITCHER_ID,LastName,FirstName,Hand,Hand2,TEAM,Pos,Year,...,spot,home,home_team_code,away_team_code,home_pitcher,away_pitcher,Pitcher,PITCHER_TEAM,FirstName_pitcher_lineup,LastName_pitcher_lineup
0,0,4316,reids001,Reid-Foley,Sean,R,R,TOR,P,2019,...,1,1,BAL,TOR,Jimmy Yacabonis,Sean Reid-Foley,Sean Reid-Foley,TOR,Sean,Reid-Foley
1,30,4316,reids001,Reid-Foley,Sean,R,R,TOR,P,2019,...,2,1,BAL,TOR,Jimmy Yacabonis,Sean Reid-Foley,Sean Reid-Foley,TOR,Sean,Reid-Foley
2,60,4316,reids001,Reid-Foley,Sean,R,R,TOR,P,2019,...,3,1,BAL,TOR,Jimmy Yacabonis,Sean Reid-Foley,Sean Reid-Foley,TOR,Sean,Reid-Foley
3,90,4316,reids001,Reid-Foley,Sean,R,R,TOR,P,2019,...,4,1,BAL,TOR,Jimmy Yacabonis,Sean Reid-Foley,Sean Reid-Foley,TOR,Sean,Reid-Foley
4,120,4316,reids001,Reid-Foley,Sean,R,R,TOR,P,2019,...,5,1,BAL,TOR,Jimmy Yacabonis,Sean Reid-Foley,Sean Reid-Foley,TOR,Sean,Reid-Foley
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,149,8823,happj001,Happ,JA,L,L,NYY,P,2019,...,5,0,NYY,BOS,J.A. Happ,David Price,J.A. Happ,NYY,JA,Happ
266,179,8823,happj001,Happ,JA,L,L,NYY,P,2019,...,6,0,NYY,BOS,J.A. Happ,David Price,J.A. Happ,NYY,JA,Happ
267,209,8823,happj001,Happ,JA,L,L,NYY,P,2019,...,7,0,NYY,BOS,J.A. Happ,David Price,J.A. Happ,NYY,JA,Happ
268,239,8823,happj001,Happ,JA,L,L,NYY,P,2019,...,8,0,NYY,BOS,J.A. Happ,David Price,J.A. Happ,NYY,JA,Happ


In [17]:
# Final lineup layout: batter id/team, batting spot, home flag, opposing
# pitcher id/team, and the two team codes of the game.
lineup_columns = [
    'BAT_ID', 'BAT_TEAM', 'spot', 'home',
    'PITCHER_ID', 'PITCHER_TEAM',
    'home_team_code', 'away_team_code',
]
test2 = test2[lineup_columns]
lineup = test2

In [18]:
# Persist the finished lineup table as lineup.pkl in the interim data
# directory.
clean_lineup = os.path.join(interim_data_dir, 'lineup.pkl')
lineup.to_pickle(path=clean_lineup)