In [1]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
import copy
from joblib import Parallel, delayed

from functools import reduce
import pickle
from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta
from sklearn.metrics import mean_absolute_error

# import mlb
import statistics as st
import lightgbm as lgbm
from scipy.stats import norm


In [2]:
# sys.path.append('../../')
# import src.utils as utils

In [3]:
# Lag window used for the target-history features.
# OFFSET is the smallest lag (in days) and MAX_LAG is how many consecutive
# lags are generated, so LAGS covers OFFSET .. OFFSET + MAX_LAG - 1.
OFFSET = 45
MAX_LAG = 27
LAGS = [OFFSET + step for step in range(MAX_LAG)]

In [4]:
# Root folder holding the competition data.  The location can be overridden
# with the MLB_DATA_DIR environment variable so the notebook also runs on
# machines other than the original author's; the previous hard-coded path is
# kept as the default.
DATA_DIR = Path(
    os.environ.get(
        "MLB_DATA_DIR",
        "/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/",
    )
)
# Original competition data and the later "update" release.
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting'
UPDATE_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting-update'

# Locations of players.csv and of the per-table training CSVs.
BASE_DIR = MAIN_DATA_DIR
TRAIN_DIR = MAIN_DATA_DIR / 'train'

In [5]:
# Load the training tables from the competition data folders.
players = pd.read_csv(BASE_DIR / 'players.csv')

rosters = pd.read_csv(TRAIN_DIR / 'rosters_train.csv')
targets = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv')
# Collapse the box scores to one row per player and date by summing the stat
# columns.  numeric_only=True is spelled out because pandas >= 2.0 no longer
# drops non-numeric columns by default when summing groups.
scores = (
    pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv')
    .groupby(['playerId', 'date'])
    .sum(numeric_only=True)
    .reset_index()
)
standings = pd.read_csv(TRAIN_DIR / 'standings_train.csv')
awards = pd.read_csv(TRAIN_DIR / 'awards_train.csv')

In [6]:
# NOTE(review): `players` is already read from this same file in the
# data-loading cell directly above, so this second read looks redundant.
players = pd.read_csv(BASE_DIR / 'players.csv')

In [7]:
def flatten(df, col):
    """Pivot one value column into a wide, one-row-per-player table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with `playerId`, `EvalDate` and `col` columns.
    col : str
        Name of the value column to spread.

    Returns
    -------
    pandas.DataFrame
        `playerId` plus one column per distinct `EvalDate` value, each named
        ``f'{col}_{EvalDate}'``.
    """
    wide = df.pivot(index='playerId', columns='EvalDate', values=col)
    wide = wide.add_prefix(f'{col}_')
    # Drop the leftover "EvalDate" label from the column axis.
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()


def reducer(left, right):
    """Join two per-player tables on `playerId` (pandas' default inner join)."""
    return pd.merge(left, right, on='playerId')


def make_train_lag(df, lags):
    """Attach lagged copies of the four targets to every (player, date) row.

    For each `lag` in `lags`, the targets observed `lag` days earlier are
    joined onto the row as `target1_{lag}` .. `target4_{lag}`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `playerId`, `date` (parsed with the YYYYMMDD format) and
        `target1`..`target4`.  An `EvalDate` datetime column is added to the
        passed frame itself.
    lags : iterable of int
        Lags in days.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by player and date; every row containing any missing
        value is dropped.
    """
    join_keys = ['playerId', 'EvalDate']
    target_cols = ['target1', 'target2', 'target3', 'target4']

    df['EvalDate'] = pd.to_datetime(df['date'], format="%Y%m%d")
    for lag in tqdm(lags):
        # Moving the dates forward by `lag` days lines each target up with
        # the row that should see it as a lagged value.
        shifted = df[join_keys + target_cols].copy()
        shifted['EvalDate'] = shifted['EvalDate'] + timedelta(days=lag)
        df = df.merge(shifted, on=join_keys, suffixes=['', f'_{lag}'], how='left')
        gc.collect()

    ordered = df.sort_values(by=join_keys)
    return ordered.dropna()

def make_test_lag(sub, last):
    """Build the wide lag-feature table for a single prediction date.

    Parameters
    ----------
    sub : pandas.DataFrame
        Submission frame with `date` (exactly one distinct value) and
        `date_playerId`; a `playerId` column is added to it in place.
    last : pandas.DataFrame
        Target history with `EvalDate`, `playerId` and `target1`..`target4`.

    Returns
    -------
    (pandas.DataFrame, pandas.Timestamp)
        One row per player with a `target{k}_{lag}` column for every lag in
        `LAGS`, and the parsed evaluation date.
    """
    target_cols = ['target1', 'target2', 'target3', 'target4']

    sub['playerId'] = sub['date_playerId'].apply(lambda s: int(s.split('_')[1]))
    assert sub.date.nunique() == 1
    eval_dt = pd.to_datetime(sub['date'].unique()[0], format='%Y%m%d')

    # Calendar dates of the requested lags, plus a date -> lag lookup.
    lag_dates = [eval_dt + timedelta(days=-k) for k in LAGS]
    lag_of_date = {eval_dt + timedelta(days=-k): k for k in LAGS}

    in_window = last['EvalDate'].between(lag_dates[-1], lag_dates[0])
    history = last.loc[in_window, ['EvalDate', 'playerId'] + target_cols].copy()
    # Replace each date by its lag so the pivoted column names carry the lag.
    history['EvalDate'] = history['EvalDate'].map(lag_of_date)

    per_target = [flatten(history, col) for col in target_cols]
    return reduce(reducer, per_target), eval_dt

In [48]:
class Rt4kaidoTest:
    """Build one day's feature table and predict the four engagement targets.

    Parameters
    ----------
    train_features_dict : dict
        Artefacts produced at training time.  The keys read here are
        `feature_cols1`..`feature_cols4`, `players`, `player_target_stats`,
        `team_target_stats`, `player2num`, `position2num`, `teamid2num`,
        `status2num`, `birthCityn2num` and, when `usetimelinefeature` is
        true, `train_last_game` / `train_last_roster`.
    models_notgameday, models_gameday : sequence
        Indexed as `models[k][4]` for target k+1; the selected object must
        offer `.predict(frame)`.
        NOTE(review): index 4 presumably picks one specific fold/seed model;
        confirm against the training notebook.
    usetimelinefeature : bool, default False
        Also compute the days-since-last-game / days-since-last-roster
        features and keep the per-player "last seen" tables up to date.
    """

    _TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

    def __init__(self, train_features_dict, models_notgameday, models_gameday, usetimelinefeature=False):

        self.usetimelinefeature = usetimelinefeature
        # Shallow copy: test_oneline() rebinds the 'train_last_*' entries day
        # after day, and those updates must not leak into the caller's dict.
        self.train_features_dict = dict(train_features_dict)
        self.feature_cols1 = train_features_dict['feature_cols1']
        self.feature_cols2 = train_features_dict['feature_cols2']
        self.feature_cols3 = train_features_dict['feature_cols3']
        self.feature_cols4 = train_features_dict['feature_cols4']
        self.models_notgameday = models_notgameday
        self.models_gameday = models_gameday

        # Columns taken from each source table when assembling the features.
        self.test_players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight']
        self.test_rosters_cols = ['playerId', 'teamId', 'status']
        self.test_standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack']
        self.test_scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances']

    @staticmethod
    def _parse_payload(test_df, column):
        """Return the nested table stored in `test_df[column]`, or None when it is missing."""
        raw = test_df[column].iloc[0]
        if not isinstance(raw, str):
            # A missing payload shows up as NaN rather than as text.
            return None
        # SECURITY: eval() executes whatever the payload contains.  It is kept
        # because the payloads are JSON-like text (hence the null/true/false
        # names), but json.loads would be the safer parser for untrusted data.
        json_literals = {'null': np.nan, 'true': True, 'false': False}
        return pd.DataFrame(eval(raw, {}, json_literals))

    @staticmethod
    def _nan_frame(player_ids, columns):
        """Placeholder table: one row per player, every other column NaN."""
        frame = pd.DataFrame({'playerId': player_ids})
        for col in columns:
            if col == 'playerId':
                continue
            frame[col] = np.nan
        return frame

    def _add_days_since(self, test, current_date, state_key, last_col, flag_col, out_col):
        """Add a days-since-last-event feature and refresh the stored last-event dates.

        `self.train_features_dict[state_key]` maps `playerId` to the last date
        (`last_col`, YYYYMMDD) on which `flag_col` was 1.  Players flagged
        today get 0 days and have their stored date moved to `current_date`.
        """
        state = self.train_features_dict[state_key]
        test = pd.merge(test, state, on=['playerId'], how='left')

        today = pd.to_datetime(test['date'], format="%Y%m%d")
        last_seen = pd.to_datetime(test[last_col], format="%Y%m%d")
        test[out_col] = (today - last_seen).dt.days
        active = test[flag_col] == 1
        test.loc[active, out_col] = 0

        # Write the new dates with a single .loc assignment on a private copy;
        # updating a column obtained via `frame[col]` is not guaranteed to
        # reach the frame (pandas Copy-on-Write).
        refreshed = state[['playerId', last_col]].copy()
        seen_today = refreshed['playerId'].isin(test.loc[active, 'playerId'])
        refreshed.loc[seen_today, last_col] = current_date
        self.train_features_dict[state_key] = refreshed
        return test

    def test_oneline(self, test_df, sample_prediction_df):
        """Predict target1..target4 for every player of one evaluation day.

        Parameters
        ----------
        test_df : pandas.DataFrame
            Single-row frame indexed by the date (YYYYMMDD) whose `rosters`,
            `playerBoxScores` and `standings` cells hold JSON-like text or NaN.
        sample_prediction_df : pandas.DataFrame
            Has `date_playerId` (``'<date>_<playerId>'``) and `target1`..`target4`.

        Returns
        -------
        pandas.DataFrame
            `sample_prediction_df` with a fresh 0..n-1 index and the target
            columns filled with predictions clipped to [0, 100].
        """
        sample_prediction_df = sample_prediction_df.reset_index(drop=True)

        # create dataset
        sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                            .map(lambda x: int(x.split('_')[1]))
        player_ids = sample_prediction_df['playerId']

        # Dealing with missing values: every fallback is built from this
        # class's own column lists so it does not depend on notebook globals.
        test_rosters = self._parse_payload(test_df, 'rosters')
        if test_rosters is None:
            test_rosters = self._nan_frame(player_ids, self.test_rosters_cols)

        test_scores = self._parse_payload(test_df, 'playerBoxScores')
        if test_scores is None:
            # Keep the NaNs: summing them would give zeros and flag every
            # player as having played today.
            test_scores = self._nan_frame(player_ids, self.test_scores_cols)
        else:
            test_scores = test_scores.groupby('playerId').sum(numeric_only=True).reset_index()

        test_standings = self._parse_payload(test_df, 'standings')
        if test_standings is None:
            # Empty on purpose: pandas matches NaN join keys with each other,
            # so rows with a NaN teamId would multiply players without a team.
            test_standings = pd.DataFrame(
                {col: pd.Series(dtype=float) for col in self.test_standings_cols}
            )

        test = sample_prediction_df[['playerId']].copy()
        test = test.merge(self.train_features_dict['players'][self.test_players_cols], on='playerId', how='left')
        test = test.merge(test_rosters[self.test_rosters_cols], on='playerId', how='left')
        test = test.merge(test_scores[self.test_scores_cols], on='playerId', how='left')
        test = test.merge(self.train_features_dict['player_target_stats'], how='left', left_on=["playerId"],right_on=["playerId"])
        test = test.merge(test_standings[self.test_standings_cols], on='teamId', how='left')
        test = test.merge(self.train_features_dict['team_target_stats'], how='left', left_on=["teamId"],right_on=["playerId"], suffixes=('', 'team_'))
        test['wildCardRank'] = test['wildCardRank'].astype(float)

        # Categorical columns -> the integer codes fixed at training time.
        test['label_playerId'] = test['playerId'].map(self.train_features_dict['player2num'])
        test['label_primaryPositionName'] = test['primaryPositionName'].map(self.train_features_dict['position2num'])
        test['label_teamId'] = test['teamId'].map(self.train_features_dict['teamid2num'])
        test['label_status'] = test['status'].map(self.train_features_dict['status2num'])
        test['label_birthCity'] = test['birthCity'].map(self.train_features_dict['birthCityn2num'])

        current_date = test_df.index[0]
        date_ = pd.to_datetime(current_date, format="%Y%m%d")
        test['annual_day'] = (date_ - pd.to_datetime(date_.year, format="%Y")) /  timedelta(days=1)
        test['week_day'] = date_.weekday()
        test['month'] = date_.month
        test['season_info'] = 2

        # The game-day flag selects the model set below, so it is needed even
        # when the timeline features are switched off.
        test['gameday'] = test['battingOrder'].notna().astype(int)

        if self.usetimelinefeature:
            test['date'] = current_date
            test = self._add_days_since(
                test, current_date, 'train_last_game', 'lastdate', 'gameday', 'daysSinceLastGame')

            test['rosterday'] = test['status'].notna().astype(int)
            test = self._add_days_since(
                test, current_date, 'train_last_roster', 'lastroster', 'rosterday', 'daysSinceLastRoster')

        # Float targets up front so the predictions are stored without an
        # int -> float upcast in the middle of an assignment.
        sample_prediction_df = sample_prediction_df.astype(
            {col: float for col in self._TARGET_COLS})
        feature_sets = [self.feature_cols1, self.feature_cols2, self.feature_cols3, self.feature_cols4]

        for flag, model_sets in ((1, self.models_gameday), (0, self.models_notgameday)):
            test_X = test[test['gameday'] == flag]
            if len(test_X) == 0:
                continue
            # `test` and `sample_prediction_df` both carry a 0..n-1 index, so
            # these labels address the same players in either frame.
            rows = list(test_X.index)
            for k, target in enumerate(self._TARGET_COLS):
                pred = model_sets[k][4].predict(test_X[feature_sets[k]])
                # merge submission
                sample_prediction_df.loc[rows, target] = np.clip(pred, 0, 100)

        sample_prediction_df = sample_prediction_df.fillna(0.)

        return sample_prediction_df.drop(columns='playerId')

## Inference

In [49]:
# Requires the new train_updated.csv, players.csv and the unnested nextDayPlayerEngagement file to be readable.

class LocalTest:
    """Local replay of a day-by-day evaluation loop over a date range.

    Typical use: ``env = LocalTest(start, end).make_env()``, then iterate over
    ``env.iter_test()`` and call ``env.predict(...)`` once per yielded day.
    The mean of the per-day MAE scores is printed when the iteration ends.
    """

    def __init__(self, start_day, end_day):
        # Both bounds are inclusive and compared against the `date` columns.
        self.start_day, self.end_day = start_day, end_day

    def make_env(self):
        """Read the update data set and keep only the configured date range.

        Returns `self` so calls can be chained.
        """
        def in_range(dates):
            return (dates >= self.start_day) & (dates <= self.end_day)

        daily = pd.read_csv(UPDATE_DATA_DIR / "train_updated.csv")
        self.original_data = daily[in_range(daily['date'])].reset_index(drop=True)

        player_table = pd.read_csv(UPDATE_DATA_DIR / 'players.csv')
        is_test_player = player_table['playerForTestSetAndFuturePreds'] == True
        self.players_test = player_table[is_test_player]['playerId'].unique()

        truth = pd.read_csv(UPDATE_DATA_DIR / 'train/nextDayPlayerEngagement_train.csv')
        wanted = in_range(truth['date']) & truth['playerId'].isin(self.players_test)
        self.test_targets = truth[wanted].reset_index(drop=True)

        # One MAE value per day, appended by predict().
        self.scores = []
        return self

    def iter_test(self):
        """Reset the iteration state and return `self` as the iterator."""
        self.num = len(self.original_data['date'].unique())
        self.current = 0
        self.predict_flag = True

        return self

    def __iter__(self):
        return self

    def __next__(self):
        """Return `(test_df, sample_prediction_df)` for the next day.

        `test_df` is the day's single row indexed by date;
        `sample_prediction_df` has one zero-filled row per test player, keyed
        by ``'<next day>_<playerId>'``.
        """
        assert self.predict_flag, 'You must call `predict()` successfully before you can continue with `iter_test()`'

        if self.current == self.num:
            print(np.mean(self.scores))
            raise StopIteration()

        position = self.current
        test_df = self.original_data.iloc[position:position + 1].set_index('date')

        n_players = len(self.players_test)
        sample_prediction_df = pd.DataFrame()
        sample_prediction_df['date'] = [test_df.index[0]] * n_players

        # The submission keys carry the day after the feature date.
        parsed = pd.to_datetime(sample_prediction_df['date'], format="%Y%m%d")
        next_day = (parsed + timedelta(days=1)).astype(str).str.replace('-', '')
        key_prefix = next_day[0] + '_'
        sample_prediction_df['date_playerId'] = [key_prefix + str(p_) for p_ in self.players_test]

        for target in ('target1', 'target2', 'target3', 'target4'):
            sample_prediction_df[target] = 0

        sample_prediction_df = sample_prediction_df.set_index('date')

        self.current = position + 1
        self.predict_flag = False
        return test_df, sample_prediction_df

    def predict(self, sample_prediction_df):
        """Score one day of predictions (MAE over the four targets) and store it."""
        self.predict_flag = True

        submitted = sample_prediction_df.reset_index()
        key_parts = submitted['date_playerId'].str.split('_', expand=True)

        # Map the key's date back to the feature date (one day earlier).
        key_date = pd.to_datetime(key_parts[0], format="%Y%m%d")
        feature_date = (key_date + timedelta(days=-1)).astype(str).str.replace('-', '')
        submitted['date'] = feature_date.values.astype(int)
        submitted['playerId'] = key_parts[1].values.astype(int)

        target_oneday = pd.merge(submitted, self.test_targets, how='left', on=['date', 'playerId'], suffixes=('', '_true'))
        predicted = target_oneday.loc[:, 'target1':'target4']
        actual = target_oneday.loc[:, 'target1_true':'target4_true']

        self.scores.append(mean_absolute_error(predicted, actual))


In [50]:
# Training-time artefacts of experiment 74: the feature dictionary and the models.
# NOTE: pickle.load executes code stored in the file; only load trusted files.
with open("../74/output/train_features_dict74.pickle", "rb") as features_file:
    train_features_dict = pickle.load(features_file)

with open('../74/output/models74.pickle', "rb") as models_file:
    models = pickle.load(models_file)

# A separate game-day model set from experiment 78 is not loaded here; the
# same `models` object is passed for both the game-day and non-game-day slots.

# The tester updates its last-game / last-roster tables while it predicts,
# so always rebuild it (re-run this cell) before starting a new evaluation run.
rt4kaido_test = Rt4kaidoTest(train_features_dict, models, models, usetimelinefeature=True)

In [51]:
# Replay the days 20210501 .. 20210531 (both bounds inclusive) locally.
mlb = LocalTest(20210501, 20210531)
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

In [52]:
# Predict day by day.  The per-day scores are collected inside `env`, so no
# module-level `scores` list is created here: rebinding `scores` would replace
# the box-score DataFrame of the same name loaded in the data-loading cell.
for test_df, sample_prediction_df in iter_test:
    sample_prediction_df = rt4kaido_test.test_oneline(test_df, sample_prediction_df)
    env.predict(sample_prediction_df)

1.3145874903503743
