In [49]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)
import pickle
from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta

from sklearn.metrics import mean_absolute_error

import statistics as st
import lightgbm as lgbm
from scipy.stats import norm

In [50]:
sys.path.append('../../')
import src.utils as utils

## Param

In [51]:
EXP_NUM = 81
NFOLDS = 5
SEED = 42
TRAIN_DATE = 'date < 20210601'

In [52]:
# def set_seed(seed: int = 42):
#     random.seed(seed)
#     np.random.seed(seed)
#     os.environ["PYTHONHASHSEED"] = str(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)  # type: ignore
#     torch.backends.cudnn.deterministic = True  # type: ignore
#     torch.backends.cudnn.benchmark = False  # type: ignore
# set_seed(SEED)

## Dir

In [53]:
DATA_DIR = Path("/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/")
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting-update'
TRAIN_DIR = MAIN_DATA_DIR / 'train'
OUTPUT_DIR = Path('./output/')

In [54]:
players = pd.read_csv(MAIN_DATA_DIR / 'players.csv')
seasons = pd.read_csv(MAIN_DATA_DIR / 'seasons.csv')
# salaries = pd.read_csv(MAIN_DATA_DIR / 'mlbSalaries.csv')
teams = pd.read_csv(MAIN_DATA_DIR / 'teams.csv')

rosters = pd.read_csv(TRAIN_DIR / 'rosters_train.csv').query(TRAIN_DATE)
targets = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv').query(TRAIN_DATE)
scores = pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv').query(TRAIN_DATE)
scores = scores.groupby(['playerId', 'date']).sum().reset_index()
standings = pd.read_csv(TRAIN_DIR / 'standings_train.csv').query(TRAIN_DATE)
playerTwitterFollowers = pd.read_csv(TRAIN_DIR / 'playerTwitterFollowers_train.csv').query(TRAIN_DATE)

# events = pd.read_csv(TRAIN_DIR / 'events_train.csv')
# events = events.groupby(['gameDate']).sum().reset_index()


In [55]:
train_elements_dict = {"players":players, 
                       "rosters":rosters, 
                       "targets":targets, 
                       "scores":scores, 
                       "seasons":seasons, 
                       "teams":teams, 
                       "standings":standings}

In [56]:
def map_team_name(name):
    names = name.split('-')
    result = ''
    for n in names:
        if n == 'st':
            n = 'st.'
        result += f' {n.capitalize()}'

    return result[1:]

In [57]:
def calc_corr(df):
    # 相関係数行列を作成
    corr_mat = df.corr(method='pearson')

    # 行（列）サイズを取得
    n = corr_mat.shape[0]
    corr_ary = []

    for i in range(n):
        for j in range(i):
            if i == j:
                continue
            corr_ary.append(corr_mat.iloc[i,j])

    return corr_ary

In [58]:
def calc_probs(pid,df,temp):
    to_append=[pid,'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','']
    targets=['target1','target2','target3','target4']
    z=1
    for target in targets:
        target_prob = temp[target].tolist()
        mean = np.mean(target_prob)
        std = np.std(target_prob)
        median = st.median(target_prob)
        distribution = norm(mean, std)
        min_weight = min(target_prob)
        max_weight = max(target_prob)
        values = list(np.linspace(min_weight, max_weight))
        probabilities = [distribution.pdf(v) for v in values]
        max_value = max(probabilities)
        max_index = probabilities.index(max_value)
        to_append[z]=mean
        to_append[z+1]=median
        to_append[z+2]=std
        to_append[z+3]=min_weight
        to_append[z+4]=max_weight
        to_append[z+5]=temp[target].skew()
        to_append[z+6]=temp[target].kurt()

        z=z+7
    corr_ = calc_corr(temp[['target1', 'target2', 'target3', 'target4']])
    to_append[z:] = corr_  
    df_length = len(df)
    df.loc[df_length] = to_append
    return df

In [59]:
def count_consecutive_items_n_cols(df, col_name_list, output_col):
    cum_sum_list = [
        (df[col_name] != df[col_name].shift(1)).cumsum().tolist() for col_name in col_name_list
    ]
    df[output_col] = df.groupby(
        ["_".join(map(str, x)) for x in zip(*cum_sum_list)]
    ).cumcount() + 1
    return df

In [60]:
def extract_season(date_raw, season_start_end):
    idxes = 0
    for raw in season_start_end.iloc():
        idx_ = ((date_raw >= raw.iloc[0]) & (date_raw <= raw.iloc[1])) * 1
        idxes += idx_
    return idxes

In [61]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    oof_pred = np.zeros(len(y_valid), dtype=np.float32)
    model = lgbm.LGBMRegressor(**params)
    model.fit(x_train, y_train, 
        eval_set=[(x_train, y_train), (x_valid, y_valid)],  
        early_stopping_rounds=verbose, 
#         eval_metric=custom_mae_metric,
        verbose=verbose)
    oof_pred = model.predict(x_valid)
    oof_pred = np.clip(oof_pred, 0, 100)
    score = mean_absolute_error(oof_pred, y_valid)
    print('mae:', score)
    return oof_pred, model, score

In [62]:
def my_timeseries_fold(train):
    
    tr_idxs = []
    val_idxs = []
    
    tr_idx = (train['date'].astype(int) < 20190901)
    val_idx = (train['date'].astype(int) >= 20190901) & (train['date'].astype(int) < 20191001)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)
    
    tr_idx = (train['date'].astype(int) < 20200801)
    val_idx = (train['date'].astype(int) >= 20200801) & (train['date'].astype(int) < 20200901)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)

    tr_idx = (train['date'].astype(int) < 20200901)
    val_idx = (train['date'].astype(int) >= 20200901) & (train['date'].astype(int) < 20201001)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)

#     tr_idx = (train['date'].astype(int) < 20201001)
#     val_idx = (train['date'].astype(int) >= 20201001) & (train['date'].astype(int) < 20201028)
#     tr_idxs.append(tr_idx)
#     val_idxs.append(val_idx)

    
    tr_idx = (train['date'].astype(int) < 20210401)
    val_idx = (train['date'].astype(int) >= 20210401) & (train['date'].astype(int) < 20210501)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)

    tr_idx = (train['date'].astype(int) < 20210501)
    val_idx = (train['date'].astype(int) >= 20210501) & (train['date'].astype(int) < 20210601)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)
    
    return tr_idxs, val_idxs

In [63]:
class Rt4kaidoTrain:
    def __init__(self, usetimelinefeature=False):
        
        self.usetimelinefeature = usetimelinefeature
        self.targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
        self.players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight', 'playerForTestSetAndFuturePreds']
        self.rosters_cols = ['playerId', 'teamId', 'status', 'date']
        self.salaries_cols = ['teamId', 'salary', 'year']
        self.standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack', 'date']
        self.transactions_cols = ['playerId', 'transaction_flag', 'date']
        self.stat_cols = ["playerId", "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
                        "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
                        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
                        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt",
                        'tgt1_2_corr', 'tgt1_3_corr', 'tgt2_3_corr', 'tgt1_4_corr', 'tgt2_4_corr', 'tgt3_4_corr']

        self.scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances', 'date']

        self.feature_cols1 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols2 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols3 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols4 = ['week_day', 'annual_day', 'month', 'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_birthCity',
                        'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 
        
        
        # lightgbm
        self.params1 = {            
            'random_state': SEED,
            'objective':'mae',
            'n_estimators': 10000,
            'learning_rate': 0.1,
            'max_depth': 20,
            'min_child_weight': 6,
            'subsample': 0.6,
            'colsample_bytree': 0.7,
            'reg_lambda': 0.08135847923727514,
            'reg_alpha': 1.49433391288005,
            'feature_fraction': 0.5197130924180956,
            'bagging_fraction': 0.7648977376363217,
            'bagging_freq': 13,
            'num_leaves': 181,
            'min_child_samples': 17
        }


        self.params2 = {            
            'random_state': SEED,
            'objective':'mae',
            'n_estimators': 10000,
            'learning_rate': 0.1,
            'max_depth': 16,
            'min_child_weight': 3,
            'subsample': 0.7,
            'colsample_bytree': 0.5,
            'reg_lambda': 0.004528826382334437,
            'reg_alpha': 1.9645054145486578,
            'feature_fraction': 0.9015395368511052,
            'bagging_fraction': 0.9958830097531277,
            'bagging_freq': 12,
            'num_leaves': 761,
            'min_child_samples': 96
        }


        self.params3 = {            
            'random_state': SEED,
            'objective':'mae',
            'n_estimators': 10000,
            'learning_rate': 0.1,
            'max_depth': 5,
            'min_child_weight': 18,
            'subsample': 0.5,
            'colsample_bytree': 0.5,
            'reg_lambda': 0.02821111013127755,
            'reg_alpha': 0.0046946396301503586,
            'feature_fraction': 0.9780515566739537,
            'bagging_fraction': 0.7202933086435114,
            'bagging_freq': 11,
            'num_leaves': 137,
            'min_child_samples': 45
        }


        self.params4 = {            
            'random_state': SEED,
            'objective':'mae',
            'n_estimators': 10000,
            'learning_rate': 0.1,
            'max_depth': 9,
            'min_child_weight': 8,
            'subsample': 0.6,
            'colsample_bytree': 0.8,
            'reg_lambda': 24.80468829330036,
            'reg_alpha': 0.3674254895182708,
            'feature_fraction': 0.8268416192212926,
            'bagging_fraction': 0.36802486339139545,
            'bagging_freq': 19,
            'num_leaves': 909,
            'min_child_samples': 62
        }


    def make_feature(self, train_elements_dict):

        players = train_elements_dict['players']
        rosters = train_elements_dict['rosters']
        targets = train_elements_dict['targets']
        scores = train_elements_dict['scores']
        seasons = train_elements_dict['seasons']
        teams = train_elements_dict['teams']
        standings = train_elements_dict['standings']

        print('calc target stat ... ', end="")

        ## target stats
        targets_train = targets.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        targets_train = targets_train[(targets_train['date'] >= 20210501) & (targets_train['date'] < 20210601)]

        playerId_list = targets_train['playerId'].unique()
        player_target_probs = pd.DataFrame(columns = self.stat_cols)  
        for pid in tqdm(playerId_list):
            temp = targets_train[targets_train['playerId'] == pid]
            player_target_stats=calc_probs(pid,player_target_probs,temp)

        teamId_list = targets_train['teamId'].dropna().unique()
        team_target_probs = pd.DataFrame(columns = self.stat_cols)
        for pid in tqdm(teamId_list):
            temp = targets_train[targets_train['teamId'] == pid]
            team_target_stats=calc_probs(pid,team_target_probs,temp)

        team_stat_cols = self.stat_cols
        team_stat_cols = team_stat_cols[:1] + ["team_" + word for word in team_stat_cols[1:]]
        team_target_stats.columns = team_stat_cols

        self.feature_cols1 += self.stat_cols[1:-6]
        self.feature_cols2 += self.stat_cols[1:-6]
        self.feature_cols3 += self.stat_cols[1:-6]
        self.feature_cols4 += self.stat_cols[1:-6]

        self.feature_cols1 += team_stat_cols[1:]
        self.feature_cols2 += team_stat_cols[1:]
        self.feature_cols3 += team_stat_cols[1:]
        self.feature_cols4 += team_stat_cols[1:]

        print('done.')

        print('preprocess ... ', end="")
        ## salaries
        # salaries = salaries.groupby(['year', 'team']).sum()['salary'].reset_index()
        # salaries['team'] = salaries['team'].apply(map_team_name)
        # salaries = salaries.merge(teams, left_on='team', right_on='name', how='inner')
        # salaries = salaries.rename(columns={'id': 'teamId'})

        ## seasons
        seasons = seasons.fillna('0000-00-00')
        for c_ in seasons.columns[1:]:
            seasons[c_] = seasons[c_].str.replace('-', '').astype(int)

        ## players
        players['DOY'] = pd.to_datetime(players['DOB'], format="%Y-%m-%d").dt.year
        players['mlbDebutYear'] = pd.to_datetime(players['mlbDebutDate'], format="%Y-%m-%d").dt.year
        players['DebutAge'] = players['mlbDebutYear'] - players['DOY']

        print('done.')

        print('creat feature ... ', end="")
        # creat feature
        train = targets[self.targets_cols].merge(players[self.players_cols], on=['playerId'], how='left')
        train = train.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        train = train.merge(scores[self.scores_cols], on=['playerId', 'date'], how='left')
        train = train.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])
        train = train.merge(standings[self.standings_cols], on=['teamId', 'date'], how='left')
        train = train.merge(team_target_stats, how='left', left_on=["teamId"],right_on=["playerId"], suffixes=('', 'team_'))
        date_ = pd.to_datetime(train['date'], format="%Y%m%d")
        train['annual_day'] = (date_ - pd.to_datetime(date_.dt.year, format="%Y")) /  timedelta(days=1)
        train['week_day'] = date_.dt.weekday
        train['month'] = date_.dt.month
        train['year'] = date_.dt.year

        # label encoding
        player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
        position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
        birthCityn2num = {c: i for i, c in enumerate(train['birthCity'].unique())}
        teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
        status2num = {c: i for i, c in enumerate(train['status'].unique())}
        train['label_playerId'] = train['playerId'].map(player2num)
        train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
        train['label_birthCity'] = train['birthCity'].map(birthCityn2num)
        train['label_teamId'] = train['teamId'].map(teamid2num)
        train['label_status'] = train['status'].map(status2num)

        ## season_info
        on_preseason_idxes = extract_season(train['date'], seasons[['preSeasonStartDate', 'preSeasonEndDate']])
        on_season_idxes = extract_season(train['date'], seasons[['regularSeasonStartDate', 'regularSeasonEndDate']]) * 2
        on_postseason_idxes = extract_season(train['date'], seasons[['postSeasonStartDate', 'postSeasonEndDate']]) * 3

        special_days = seasons['lastDate1stHalf'].to_list() + seasons['allStarDate'].to_list() + seasons['firstDate2ndHalf'].to_list()
        special_idxes = 0
        for day in special_days:
            special_idxes += (train['date'] == day) * 4

        on_total_season_idxes = on_preseason_idxes
        on_total_season_idxes[on_season_idxes==2] = 2
        on_total_season_idxes[on_postseason_idxes==3] = 3
        on_total_season_idxes[special_idxes==4] = 4

        train['season_info'] = on_total_season_idxes

        ## only on season
        on_whole_idxes = extract_season(train['date'], seasons[['seasonStartDate', 'seasonEndDate']])
        train = train[on_whole_idxes == 1].reset_index(drop=True)

        # train = train.merge(playerTwitterFollowers, how='left', on=["playerId", 'date'])

        ## only test_player
        train = train[train['playerForTestSetAndFuturePreds']==True].reset_index(drop=True)

        print('done.')
        
        train_features_dict = {'players': players,
                               'seasons': seasons,
                                'player_target_stats': player_target_stats,
                                'team_target_stats': team_target_stats,
                                'player2num': player2num, 
                                'position2num': position2num, 
                                'birthCityn2num': birthCityn2num,
                                'teamid2num': teamid2num,
                                'status2num': status2num,
                                'feature_cols1': self.feature_cols1,
                                'feature_cols2': self.feature_cols2,
                                'feature_cols3': self.feature_cols3,
                                'feature_cols4': self.feature_cols4
                              }
        
        if self.usetimelinefeature:
            ## game_info
            train['gameday'] = ~train['battingOrder'].isna()*1
            train.sort_values(by=['playerId','date'],inplace=True,ascending=True)

            train=count_consecutive_items_n_cols(train,['playerId','gameday'],'daysSinceLastGame')
            train.loc[train['gameday']==1,'daysSinceLastGame'] = 0

            train_game = train[train['gameday']==1]
            train_last_game = train_game[~train_game.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
            train_last_game.columns = ['playerId', 'lastdate']
            train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
            train_last_game = pd.merge(train_player_unique, train_last_game, on=['playerId'], how='left' )
            train_last_game = train_last_game.fillna(20171231)
            
            train_features_dict['train_last_game'] = train_last_game
            self.feature_cols1 += ['daysSinceLastGame']
            self.feature_cols2 += ['daysSinceLastGame']
            self.feature_cols3 += ['daysSinceLastGame']
            self.feature_cols4 += ['daysSinceLastGame']
            
            
            ## rosters_info
            train['rosterday'] = ~train['status'].isna()*1
            train.sort_values(by=['playerId','date'],inplace=True,ascending=True)

            train=count_consecutive_items_n_cols(train,['playerId','rosterday'],'daysSinceLastRoster')
            train.loc[train['rosterday']==1,'daysSinceLastRoster'] = 0

            train_roster= train[train['rosterday']==1]
            train_last_roster = train_roster[~train_roster.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
            train_last_roster.columns = ['playerId', 'lastroster']
            train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
            train_last_roster = pd.merge(train_player_unique, train_last_roster, on=['playerId'], how='left' )
            train_last_roster = train_last_roster.fillna(20171231)
            
            train_features_dict['train_last_roster'] = train_last_roster
            self.feature_cols1 += ['daysSinceLastRoster']
            self.feature_cols2 += ['daysSinceLastRoster']
            self.feature_cols3 += ['daysSinceLastRoster']
            self.feature_cols4 += ['daysSinceLastRoster']


        return train, train_features_dict
    
    def train_and_evaluate(self, train, isgamedayonly=False):
        
        if isgamedayonly:
            train = train[train['gameday'] == 1].reset_index(drop=True)
            
        train_X = train
        train_y = train[['target1', 'target2', 'target3', 'target4']]

        oof = np.zeros(train_y.shape) - 1.0
        y_valids = np.zeros(train_y.shape) - 1.0
        
        tr_idxs, val_idxs = my_timeseries_fold(train)
        
        
        model1s = []
        model2s = []
        model3s = []
        model4s = []
        scores = []
        oof = np.zeros(train_y.shape) - 1.0
        y_valids = np.zeros(train_y.shape) - 1.0

        for idx in range(NFOLDS):
            
            tr_idx = tr_idxs[idx]
            val_idx = val_idxs[idx]

            x_train = train_X.loc[tr_idx].reset_index(drop=True)
            y_train = train_y.loc[tr_idx].reset_index(drop=True)
            x_valid = train_X.loc[val_idx].reset_index(drop=True)
            y_valid = train_y.loc[val_idx].reset_index(drop=True)


            oof1, model1, score1 = fit_lgbm(
                x_train[self.feature_cols1], y_train['target1'],
                x_valid[self.feature_cols1], y_valid['target1'],
                self.params1
            )
            oof2, model2, score2 = fit_lgbm(
                x_train[self.feature_cols2], y_train['target2'],
                x_valid[self.feature_cols2], y_valid['target2'],
                self.params2
            )
            oof3, model3, score3 = fit_lgbm(
                x_train[self.feature_cols3], y_train['target3'],
                x_valid[self.feature_cols3], y_valid['target3'],
                self.params3
            )
            oof4, model4, score4 = fit_lgbm(
                x_train[self.feature_cols4], y_train['target4'],
                x_valid[self.feature_cols4], y_valid['target4'],
                self.params4
            )

            score = (score1+score2+score3+score4) / 4
            scores.append(score)
            print(f'score: {score}')

            model1s.append(model1)
            model2s.append(model2)
            model3s.append(model3)
            model4s.append(model4)
            oof[val_idx, 0] = oof1
            oof[val_idx, 1] = oof2
            oof[val_idx, 2] = oof3
            oof[val_idx, 3] = oof4
            y_valids[val_idx, 0] = y_valid['target1'].values
            y_valids[val_idx, 1] = y_valid['target2'].values
            y_valids[val_idx, 2] = y_valid['target3'].values
            y_valids[val_idx, 3] = y_valid['target4'].values


        oof_indexes = []
        for i in range(NFOLDS):
            oof_indexes.extend(val_idxs[i][val_idxs[i]==True].index.to_list())
            
        print("\n--------------------------------------------")
        for i in range(NFOLDS):
            print(f'{i}fold mae: {scores[i]}')
        mae = mean_absolute_error(y_valids[oof_indexes, :], oof[oof_indexes, :])
        print("oof mae:", mae)
        print("--------------------------------------------")

        oof_df = train[self.targets_cols]
        oof_df.iloc[oof_indexes, 1:5] = oof[oof_indexes, :]

        models = np.array([model1s, model2s, model3s, model4s])
        
        print("---------------April evaluate-------------------")
        weights = [0.05, 0.1, 0.15, 0.2, 0.5]
        pred1s = 0
        pred2s = 0
        pred3s = 0
        pred4s = 0
        for i in range(NFOLDS):
            pred1 = models[0][i].predict(x_valid[self.feature_cols1])
            pred2 = models[1][i].predict(x_valid[self.feature_cols2])
            pred3 = models[2][i].predict(x_valid[self.feature_cols3])
            pred4 = models[3][i].predict(x_valid[self.feature_cols4])
            oof_valid_april = np.clip(np.array([pred1, pred2, pred3, pred4]).T, 0, 100)
            mae = mean_absolute_error(y_valid, oof_valid_april)
            print(f'{i}fold mae: {mae}')
            pred1s += pred1 * weights[i]
            pred2s += pred2 * weights[i]
            pred3s += pred3 * weights[i]
            pred4s += pred4 * weights[i]
            
        oof_valid_april = np.clip(np.array([pred1s, pred2s, pred3s, pred4s]).T, 0, 100)
        mae = mean_absolute_error(y_valid, oof_valid_april)
        print("oof mae:", mae)
        print("--------------------------------------------")

        return oof_df.iloc[oof_indexes], models
            

In [64]:
class Rt4kaidoTest:
    def __init__(self, train_features_dict, models_notgameday, models_gameday, usetimelinefeature=False):
        
        self.usetimelinefeature = usetimelinefeature
        self.train_features_dict = train_features_dict
        self.feature_cols1 = train_features_dict['feature_cols1']
        self.feature_cols2 = train_features_dict['feature_cols2']
        self.feature_cols3 = train_features_dict['feature_cols3']
        self.feature_cols4 = train_features_dict['feature_cols4']
        self.models_notgameday = models_notgameday
        self.models_gameday = models_gameday


        self.test_players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight']
        self.test_rosters_cols = ['playerId', 'teamId', 'status']
        self.test_standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack']
        self.test_scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances']
        

    def test_oneline(self, test_df, sample_prediction_df):
        
        null = np.nan
        true = True
        false = False
        
        sample_prediction_df = sample_prediction_df.reset_index(drop=True)

        # creat dataset
        sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                            .map(lambda x: int(x.split('_')[1]))
        # Dealing with missing values
        if test_df['rosters'].iloc[0] == test_df['rosters'].iloc[0]:
            test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))
        else:
            test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
            for col in rosters.columns:
                if col == 'playerId': continue
                test_rosters[col] = np.nan

        if test_df['playerBoxScores'].iloc[0] == test_df['playerBoxScores'].iloc[0]:
            test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))
        else:
            test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
            for col in scores.columns:
                if col == 'playerId': continue
                test_scores[col] = np.nan

        if test_df['standings'].iloc[0] == test_df['standings'].iloc[0]:
            test_standings = pd.DataFrame(eval(test_df['standings'].iloc[0]))
        else:
            test_standings = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
            for col in standings.columns:
                if col == 'playerId': continue
                test_standings[col] = np.nan

        test_scores = test_scores.groupby('playerId').sum().reset_index()
        test = sample_prediction_df[['playerId']].copy()
        test = test.merge(self.train_features_dict['players'][self.test_players_cols], on='playerId', how='left')
        test = test.merge(test_rosters[self.test_rosters_cols], on='playerId', how='left')
        test = test.merge(test_scores[self.test_scores_cols], on='playerId', how='left')
        test = test.merge(self.train_features_dict['player_target_stats'], how='left', left_on=["playerId"],right_on=["playerId"])
        test = test.merge(test_standings[self.test_standings_cols], on='teamId', how='left')
        test = test.merge(self.train_features_dict['team_target_stats'], how='left', left_on=["teamId"],right_on=["playerId"], suffixes=('', 'team_'))
        test['wildCardRank'] = test['wildCardRank'].astype(float)


        test['label_playerId'] = test['playerId'].map(self.train_features_dict['player2num'])
        test['label_primaryPositionName'] = test['primaryPositionName'].map(self.train_features_dict['position2num'])
        test['label_teamId'] = test['teamId'].map(self.train_features_dict['teamid2num'])
        test['label_status'] = test['status'].map(self.train_features_dict['status2num'])
        test['label_birthCity'] = test['birthCity'].map(self.train_features_dict['birthCityn2num'])

        date_ = pd.to_datetime(test_df.index[0], format="%Y%m%d")
        test['annual_day'] = (date_ - pd.to_datetime(date_.year, format="%Y")) /  timedelta(days=1)
        test['week_day'] = date_.weekday()
        test['month'] = date_.month
        test['season_info'] = 2
        
        if self.usetimelinefeature:
            test['date'] = test_df.index[0]
            
            test['gameday'] = ~test['battingOrder'].isna()*1
            test = pd.merge(test, self.train_features_dict['train_last_game'], on=['playerId'], how='left')
            test['daysSinceLastGame'] = (pd.to_datetime(test['date'], format="%Y%m%d") - pd.to_datetime(test['lastdate'], format="%Y%m%d")).dt.days
            test.loc[test['gameday']==1,'daysSinceLastGame']=0
            
            self.train_features_dict['train_last_game'] = pd.merge(self.train_features_dict['train_last_game'], test[test['gameday']==1][['playerId','date']], on=['playerId'], how='left')
            self.train_features_dict['train_last_game']['lastdate'].update(self.train_features_dict['train_last_game']['date'])
            self.train_features_dict['train_last_game'] = self.train_features_dict['train_last_game'][['playerId', 'lastdate']]
                        
            test['rosterday'] = ~test['status'].isna()*1
            test = pd.merge(test, self.train_features_dict['train_last_roster'], on=['playerId'], how='left')
            test['daysSinceLastRoster'] = (pd.to_datetime(test['date'], format="%Y%m%d") - pd.to_datetime(test['lastroster'], format="%Y%m%d")).dt.days
            test.loc[test['rosterday']==1,'daysSinceLastRoster']=0
            
            self.train_features_dict['train_last_roster'] = pd.merge(self.train_features_dict['train_last_roster'], test[test['rosterday']==1][['playerId','date']], on=['playerId'], how='left')
            self.train_features_dict['train_last_roster']['lastroster'].update(self.train_features_dict['train_last_roster']['date'])
            self.train_features_dict['train_last_roster'] = self.train_features_dict['train_last_roster'][['playerId', 'lastroster']]

        test_gameday = test[test['gameday']==1]

        if len(test_gameday) != 0:
            gameday_index = list(test_gameday.index)

            test_X = test.iloc[gameday_index]

            pred1 = self.models_gameday[0][4].predict(test_X[self.feature_cols1])
            pred2 = self.models_gameday[1][4].predict(test_X[self.feature_cols2])
            pred3 = self.models_gameday[2][4].predict(test_X[self.feature_cols3])
            pred4 = self.models_gameday[3][4].predict(test_X[self.feature_cols4])

            # merge submission
            sample_prediction_df['target1'].iloc[gameday_index] = np.clip(pred1, 0, 100)
            sample_prediction_df['target2'].iloc[gameday_index] = np.clip(pred2, 0, 100)
            sample_prediction_df['target3'].iloc[gameday_index] = np.clip(pred3, 0, 100)
            sample_prediction_df['target4'].iloc[gameday_index] = np.clip(pred4, 0, 100)

        # ------------------------------------------------------------

        test_notgameday = test[test['gameday']==0]
        if len(test_notgameday) != 0:
            notgameday_index = list(test_notgameday.index)

            test_X = test.iloc[notgameday_index]

            pred1 = self.models_notgameday[0][4].predict(test_X[self.feature_cols1])
            pred2 = self.models_notgameday[1][4].predict(test_X[self.feature_cols2])
            pred3 = self.models_notgameday[2][4].predict(test_X[self.feature_cols3])
            pred4 = self.models_notgameday[3][4].predict(test_X[self.feature_cols4])

            # merge submission
            sample_prediction_df['target1'].iloc[notgameday_index] = np.clip(pred1, 0, 100)
            sample_prediction_df['target2'].iloc[notgameday_index] = np.clip(pred2, 0, 100)
            sample_prediction_df['target3'].iloc[notgameday_index] = np.clip(pred3, 0, 100)
            sample_prediction_df['target4'].iloc[notgameday_index] = np.clip(pred4, 0, 100)

        sample_prediction_df = sample_prediction_df.fillna(0.)

        del sample_prediction_df['playerId']
        
        return sample_prediction_df

In [65]:
rt4kaido_train = Rt4kaidoTrain(usetimelinefeature=True)
train, train_features_dict = rt4kaido_train.make_feature(train_elements_dict)

calc target stat ... 

100%|██████████| 2061/2061 [00:45<00:00, 44.90it/s]
100%|██████████| 30/30 [00:00<00:00, 42.42it/s]


done.
preprocess ... done.
creat feature ... done.


In [66]:
oof_df, models = rt4kaido_train.train_and_evaluate(train, isgamedayonly=False)
oof_df.to_csv(OUTPUT_DIR / f'oof{EXP_NUM}.csv')
with open(OUTPUT_DIR / f"models{EXP_NUM}.pickle", mode="wb") as f:
    pickle.dump(models, f)
    
# 現状，モデルをわけない
# oof_df_gameday, models_gameday = rt4kaido_train.train_and_evaluate(train, isgamedayonly=True)
# oof_df_gameday.to_csv(OUTPUT_DIR / f'oof{EXP_NUM}_gameday.csv')
# with open(OUTPUT_DIR / f"models{EXP_NUM}_gameday.pickle", mode="wb") as f:
#     pickle.dump(models_gameday, f)

Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.857229	valid_1's l1: 0.974221
[200]	training's l1: 0.854968	valid_1's l1: 0.972608
[300]	training's l1: 0.854753	valid_1's l1: 0.97218
[400]	training's l1: 0.852237	valid_1's l1: 0.969574
[500]	training's l1: 0.850355	valid_1's l1: 0.967905
[600]	training's l1: 0.848406	valid_1's l1: 0.966581
[700]	training's l1: 0.84372	valid_1's l1: 0.965996
[800]	training's l1: 0.838065	valid_1's l1: 0.96428
[900]	training's l1: 0.833719	valid_1's l1: 0.962662
[1000]	training's l1: 0.831322	valid_1's l1: 0.961844
[1100]	training's l1: 0.825241	valid_1's l1: 0.961118
Early stopping, best iteration is:
[1092]	training's l1: 0.826265	valid_1's l1: 0.961085
mae: 0.9608805172637186
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 1.51665	valid_1's l1: 2.07882
[200]	training's l1: 1.48626	valid_1's l1: 2.0823
Early stopping, best iteration is:
[106]	training's l1: 1.5139	valid_1's l1: 2.0

In [67]:
del train, rt4kaido_train

In [68]:
with open(OUTPUT_DIR / f"train_features_dict{EXP_NUM}.pickle", mode="wb") as f:
    pickle.dump(train_features_dict, f)

In [69]:
with open(OUTPUT_DIR / f"train_features_dict{EXP_NUM}.pickle", mode="rb") as f:
    train_features_dict = pickle.load(f)
    
with open(OUTPUT_DIR / f"models{EXP_NUM}.pickle", mode="rb") as f:
    models = pickle.load(f)

In [70]:
rt4kaido_test = Rt4kaidoTest(train_features_dict, models, models, usetimelinefeature=True)

## テストで取ってこれる一行はこんな感じ

In [71]:
## ここはまあよしなに読み込む
example_sample_submission = pd.read_csv(MAIN_DATA_DIR / "example_sample_submission.csv")
example_test = pd.read_csv(MAIN_DATA_DIR / "example_test.csv")

## nanを想定していくつかいじる
example_test['rosters'][1] = np.nan
example_test['playerBoxScores'][2] = np.nan
example_test['games'][3] = np.nan

## 一日ごと読み込んで，すべての日にちでうまく行くか．
for i in range(len(example_test)):
    test_df = example_test.set_index('date').iloc[i:i+1]
    sample_prediction_df = example_sample_submission[example_sample_submission['date']==test_df.index[0]].set_index('date')
    
    ## ここで処理
    sample_prediction_df = rt4kaido_test.test_oneline(test_df, sample_prediction_df)

In [72]:
sample_prediction_df

Unnamed: 0,date_playerId,target1,target2,target3,target4
0,20210501_488726,2.758700,6.426109,0.112746,2.472969
1,20210501_605218,0.001802,1.074192,0.021731,1.200912
2,20210501_621563,0.149710,2.482890,0.014776,1.053543
3,20210501_670084,0.019747,0.887168,0.005467,0.355083
4,20210501_670970,0.007792,0.425681,0.019823,0.128021
...,...,...,...,...,...
1182,20210501_596049,0.000000,0.282169,0.000966,0.153138
1183,20210501_642851,0.000000,1.119193,0.000003,0.066209
1184,20210501_596071,0.000399,0.280468,0.001131,0.097123
1185,20210501_664901,0.005415,0.552044,0.013369,0.221362
