# In [1]:
import pandas as pd
import os
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
import numpy as np 

# Locate the interim data directory. The project directory is taken to be the
# parent of the (resolved) current working directory.
cwd = os.getcwd()
project_dir = Path(cwd).resolve().parents[0]
interim_data_dir = os.path.join(project_dir, 'data/interim/')

# Build the full path of each interim pickle, then load them in the same order.
clean_game_log, clean_events, clean_hits = (
    os.path.join(interim_data_dir, file_name)
    for file_name in ('game_log.pkl', 'events.pkl', 'hits.pkl')
)
game_log, events, hits = (
    pd.read_pickle(pickle_path)
    for pickle_path in (clean_game_log, clean_events, clean_hits)
)

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Keep only the game-log columns whose name matches one of these patterns.
# Each pattern is searched for anywhere in the column name.
# NOTE(review): 'id' and 'Date' are plain substring matches, so they would
# also keep any other column whose name happens to contain them -- confirm
# against the real column list.
column_patterns = (
    'id',
    'Date',
    'HomeBatting.*PlayerID',
    'VisitorBatting.*PlayerID',
    'VisitorStartingPitcherID',
    'HomeStartingPitcherID',
)
mask = game_log.columns.str.contains(column_patterns[0])
for pattern in column_patterns[1:]:
    mask = mask | game_log.columns.str.contains(pattern)

test = game_log.loc[:, mask]

# Rename 'id' to 'GAME_ID' and move the lineup number to the end of every
# batting column name (e.g. 'HomeBatting1PlayerID' -> 'HomeBattingPlayerID1').
new_names = {'id': 'GAME_ID'}
for side in ('Home', 'Visitor'):
    for slot in range(1, 10):
        old_name = '{}Batting{}PlayerID'.format(side, slot)
        new_names[old_name] = '{}BattingPlayerID{}'.format(side, slot)
test2 = test.rename(columns=new_names)

# Columns that identify a game row during the first reshape.
index = ['GAME_ID', 'Date', 'VisitorStartingPitcherID', 'HomeStartingPitcherID']

# First reshape: the numeric suffix of the batting columns becomes "spot",
# leaving one home-batter column and one visitor-batter column. Those two are
# then renamed to BAT_ID1 (home) and BAT_ID0 (visitor) for the second reshape.
by_spot = (
    pd.wide_to_long(
        test2,
        stubnames=["HomeBattingPlayerID", "VisitorBattingPlayerID"],
        i=index,
        j="spot",
    )
    .rename(columns={'HomeBattingPlayerID': 'BAT_ID1',
                     'VisitorBattingPlayerID': 'BAT_ID0'})
    .reset_index()
)

# Second reshape: the 1/0 suffix of the BAT_ID columns becomes "home",
# giving a single BAT_ID column.
a = pd.wide_to_long(
    by_spot,
    stubnames=["BAT_ID"],
    i=["GAME_ID", "spot"],
    j="home",
).reset_index()

# Pair each batter with the opposing starting pitcher: rows with home == 1
# get the visitor's starter, all other rows keep the home starter.
is_home_batter = a['home'] == 1
a['PitcherID'] = a['HomeStartingPitcherID']
a.loc[is_home_batter, 'PitcherID'] = a['VisitorStartingPitcherID']

# Join the long lineup table with `hits` on game and batter, then index the
# result by (GAME_ID, BAT_ID).
main = (
    a.merge(hits, on=['GAME_ID', 'BAT_ID'])
    .set_index(['GAME_ID', 'BAT_ID'])
)

# Numeric version of the 'Win' column (multiplied by 1).
main['Win_bin'] = main['Win'].mul(1)

# Batter feature: rolling mean of Win_bin over the batter's last 200 rows
# (at least 50 needed), then shifted one row within the batter so a row does
# not include its own outcome.
# NOTE(review): the windows follow the current row order of `main`;
# presumably that order is chronological -- confirm.
main['cur_avg_win'] = (
    main.groupby('BAT_ID')['Win_bin']
    .transform(lambda wins: wins.rolling(window=200, min_periods=50).mean())
)
main['avg_win'] = main.groupby('BAT_ID')['cur_avg_win'].shift(periods=1)

# Pitcher feature: same idea with a 100-row window, grouped by pitcher.
# TODO: Pitcher lag needs to lag by game, not by batter
main['cur_pit_avg_win'] = (
    main.groupby('PitcherID')['Win_bin']
    .transform(lambda wins: wins.rolling(window=100, min_periods=50).mean())
)
main['pit_avg_win'] = main.groupby('PitcherID')['cur_pit_avg_win'].shift(periods=1)

## TODO: Add park factor

# Discard every row that has a missing value in any column.
main = main.dropna()

from sklearn.preprocessing import PolynomialFeatures

# Features: degree-2 polynomial expansion of lineup spot and the home flag.
# Candidate additional features, not used yet: 'avg_win', 'pit_avg_win'.
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(main[['spot', 'home']])

# Target: the 'Win' column, kept as a DataFrame.
Y = main.loc[:, main.columns == 'Win']

# Hold out 25% of the rows; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

# L1-penalised logistic regression fitted on the training rows only.
logisticRegr = LogisticRegression(penalty='l1', solver='liblinear')
logisticRegr.fit(x_train, y_train.values.ravel())

# Score every row (training and held-out alike) and keep the probability in
# column 1 -- presumably the Win == True class; confirm via logisticRegr.classes_.
probs = logisticRegr.predict_proba(X)
main['EstProb'] = probs[:, 1]

# Where the selection is written.
clean_selection = os.path.join(interim_data_dir, 'selection.pkl')

# For each date, keep the two largest estimated probabilities.
est_prob_by_date = main.groupby('Date')['EstProb']
selection = est_prob_by_date.nlargest(n=2)
selection.to_pickle(clean_selection)

# Summary statistics of the selected probabilities.
selection.describe()

# Ideas for the next iteration:
# - Switch to lineup-spot FEs (fixed effects)
# - Interactions and park effects
# - Batter and pitcher hand effects, with interactions


