In [1]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)

from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta

from sklearn.metrics import mean_absolute_error

import statistics as st
import lightgbm as lgbm
from scipy.stats import norm

In [2]:
sys.path.append('../../')
import src.utils as utils

## Parameters

In [3]:
# Presumably the experiment number for this notebook run; it is not referenced
# in the cells visible here.
EXP_NUM = 64
# NOTE(review): not referenced in the visible cells, which use a single
# date-based train/validation split — confirm whether it is still needed.
NFOLDS = 5
# Seed passed to utils.set_seed and used as `random_state` in the LightGBM params.
SEED = 42

In [4]:
# training lightgbm
# Settings shared by the four per-target regressors (target1..target4).
_COMMON_PARAMS = {
    'random_state': SEED,
    'objective': 'mae',
    'n_estimators': 10000,
    'learning_rate': 0.1,
}

# NOTE: earlier revisions also set `subsample` and `colsample_bytree`.
# LightGBM treats those as aliases of `bagging_fraction` and
# `feature_fraction`; when both spellings are supplied the canonical name wins
# and the alias is ignored with a warning.  Only the canonical (effective)
# values are kept here, so the trained models are unchanged.

# target1
params1 = {
    **_COMMON_PARAMS,
    'max_depth': 4,
    'min_child_weight': 18,
    'reg_lambda': 0.11467861995039172,
    'reg_alpha': 0.06424319118759443,
    'feature_fraction': 0.9638843704312544,
    'bagging_fraction': 0.6767148669241133,
    'bagging_freq': 4,
}

# target2
params2 = {
    **_COMMON_PARAMS,
    'max_depth': 12,
    'min_child_weight': 4,
    'reg_lambda': 3.6789038580429465,
    'reg_alpha': 0.001295149012810786,
    'feature_fraction': 0.5155482034139958,
    'bagging_fraction': 0.711248241796355,
    'bagging_freq': 10,
}

# target3
params3 = {
    **_COMMON_PARAMS,
    'max_depth': 5,
    'min_child_weight': 8,
    'reg_lambda': 0.08386857038129647,
    'reg_alpha': 0.14416671920586854,
    'feature_fraction': 0.9779531931779787,
    'bagging_fraction': 0.457776092650538,
    'bagging_freq': 4,
}

# target4
params4 = {
    **_COMMON_PARAMS,
    'max_depth': 6,
    'min_child_weight': 13,
    'reg_lambda': 0.0376556735585165,
    'reg_alpha': 0.001510757076303828,
    'feature_fraction': 0.4335923168511571,
    'bagging_fraction': 0.9186369126073344,
    'bagging_freq': 10,
}

In [5]:
# Seed the run through the project helper. What exactly gets seeded is defined
# in src.utils, which is not shown in this notebook.
utils.set_seed(SEED)

## Directories

In [6]:
# Root of the competition data.  Set the MLB_DATA_DIR environment variable to
# run the notebook on another machine; the fallback is the original author's
# absolute path, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get(
    "MLB_DATA_DIR",
    "/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/",
))
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = MAIN_DATA_DIR / 'train'
# Output location, relative to the notebook's working directory.
OUTPUT_DIR = Path('./output/')

In [7]:
# Load the competition tables used by the training pipeline.
players = pd.read_csv(MAIN_DATA_DIR / 'players.csv')

rosters = pd.read_csv(TRAIN_DIR / 'rosters_train.csv')
targets = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv')
scores = pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv')
# Collapse box scores to one row per (playerId, date).  `numeric_only=True` is
# explicit because pandas >= 2.0 no longer drops non-numeric columns silently
# when summing groups; only the numeric stat columns are used downstream.
scores = scores.groupby(['playerId', 'date']).sum(numeric_only=True).reset_index()
seasons = pd.read_csv(MAIN_DATA_DIR / 'seasons.csv')
salaries = pd.read_csv(MAIN_DATA_DIR / 'mlbSalaries.csv')
teams = pd.read_csv(MAIN_DATA_DIR / 'teams.csv')

standings = pd.read_csv(TRAIN_DIR / 'standings_train.csv')
playerTwitterFollowers = pd.read_csv(TRAIN_DIR / 'playerTwitterFollowers_train.csv')

# events = pd.read_csv(TRAIN_DIR / 'events_train.csv')
# events = events.groupby(['gameDate']).sum().reset_index()


In [8]:
# Bundle the loaded tables under the keys that Rt4kaidoTrain.make_feature reads.
train_elements_dict = dict(
    players=players,
    rosters=rosters,
    targets=targets,
    scores=scores,
    seasons=seasons,
    teams=teams,
    standings=standings,
)

In [9]:
def map_team_name(name):
    """Turn a hyphen-separated team slug into a space-separated display name.

    Each hyphen-delimited token is capitalised with ``str.capitalize``; the
    exact token ``'st'`` is first expanded to ``'st.'`` (giving ``'St.'``).
    """
    words = []
    for token in name.split('-'):
        token = 'st.' if token == 'st' else token
        words.append(token.capitalize())
    return ' '.join(words)

In [10]:
def calc_corr(df):
    """Return the pairwise Pearson correlations of ``df``'s columns as a flat list.

    The values are the strictly-lower-triangle entries of the correlation
    matrix read row by row: (1,0), (2,0), (2,1), (3,0), (3,1), (3,2), ...
    """
    # Build the correlation matrix.
    pearson = df.corr(method='pearson')

    # The matrix is square, so the row count is also the column count.
    size = pearson.shape[0]

    return [pearson.iloc[row, col] for row in range(size) for col in range(row)]

In [11]:
def calc_probs(pid,df,temp):
    """Append one row of per-target summary statistics for ``pid`` to ``df``.

    Parameters
    ----------
    pid : identifier stored in the first column of the new row.
    df : DataFrame that accumulates the rows; it is modified in place and
        must have one column per produced value (1 id + 4 * 7 statistics +
        the pairwise target correlations).
    temp : DataFrame holding the ``target1``..``target4`` columns of the rows
        to summarise.

    Returns
    -------
    The same ``df`` object, with the new row stored under label ``len(df)``.

    For each target the row holds, in order: mean, median, population std
    (``np.std``), min, max, skew and kurtosis, followed by the correlations
    returned by ``calc_corr``.

    The previous version also fitted a normal distribution and evaluated its
    pdf on a 50-point grid for every target, but never used the result; that
    dead work has been removed.
    """
    target_names = ['target1', 'target2', 'target3', 'target4']

    row = [pid]
    for target in target_names:
        values = temp[target].tolist()
        row += [
            np.mean(values),
            st.median(values),
            np.std(values),
            min(values),
            max(values),
            temp[target].skew(),
            temp[target].kurt(),
        ]

    # Pairwise correlations between the four targets close the row.
    row += calc_corr(temp[target_names])

    df.loc[len(df)] = row
    return df

In [12]:
def count_consecutive_items_n_cols(df, col_name_list, output_col):
    """Number the rows inside each run of unchanged values, starting at 1.

    A new run starts whenever any column in ``col_name_list`` differs from
    the previous row.  The counter is written to ``df[output_col]`` in place
    and ``df`` is returned.
    """
    run_ids = []
    for col_name in col_name_list:
        # True wherever the value changes; the cumulative sum labels each run.
        changed = df[col_name] != df[col_name].shift(1)
        run_ids.append(changed.cumsum().tolist())

    # One composite key per row, combining the run labels of every column.
    group_keys = ["_".join(str(part) for part in combo) for combo in zip(*run_ids)]

    df[output_col] = df.groupby(group_keys).cumcount() + 1
    return df

In [13]:
def extract_season(date_raw, season_start_end):
    """Count, per date, how many [start, end] windows contain it.

    ``season_start_end`` supplies one window per row: the first column is the
    inclusive start and the second the inclusive end.  Returns the
    element-wise count, or the integer 0 when there are no rows.
    """
    hits = 0
    for row_pos in range(len(season_start_end)):
        window = season_start_end.iloc[row_pos]
        lower, upper = window.iloc[0], window.iloc[1]
        inside = (date_raw >= lower) & (date_raw <= upper)
        hits = hits + inside * 1
    return hits

In [14]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    """Fit one LightGBM regressor and score it on the validation split.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_valid, y_valid : validation features and target; also used as the
        early-stopping evaluation set.
    params : keyword arguments for ``lgbm.LGBMRegressor``.  ``None`` now means
        "library defaults" (it used to fail on ``**None``).
    verbose : used both as the evaluation logging period and as the
        early-stopping patience, as before.

    Returns
    -------
    (oof_pred, model, score): validation predictions clipped to [0, 100], the
    fitted model, and the validation mean absolute error.

    NOTE(review): the ``early_stopping_rounds`` / ``verbose`` keywords of
    ``fit`` were removed in LightGBM 4.0.  When upgrading, pass the
    ``lgbm.early_stopping`` and ``lgbm.log_evaluation`` callbacks instead.
    """
    model = lgbm.LGBMRegressor(**(params or {}))
    model.fit(x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=verbose,
        verbose=verbose)
    oof_pred = np.clip(model.predict(x_valid), 0, 100)
    # scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the
    # score itself is unchanged by using the documented order.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score

In [15]:
class Rt4kaidoTrain:
    """Builds the training feature table and fits one LightGBM model per target.

    ``make_feature`` joins the raw tables into one frame and returns it with a
    dictionary of fitted encoders / lookup tables.  ``train_and_evaluate``
    splits that frame by date, fits four regressors through ``fit_lgbm`` and
    returns the validation predictions plus the models.
    """

    # Rows dated before this value (yyyymmdd) are used for fitting, the rest
    # for validation.  make_feature computes the target statistics on the
    # rows dated on or after it.
    VALID_START_DATE = 20210401

    def __init__(self, usetimelinefeature=False):
        """Set up the column lists.

        Parameters
        ----------
        usetimelinefeature : when True, ``make_feature`` adds
            ``'daysSinceLastGame'`` to every feature list.
        """
        self.usetimelinefeature = usetimelinefeature

        target_names = ['target1', 'target2', 'target3', 'target4']

        # Box-score statistics, defined once so the column lists derived from
        # them below cannot drift apart.
        box_score_stats = [
            'battingOrder', 'gamesPlayedBatting', 'flyOuts',
            'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
            'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
            'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
            'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
            'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
            'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
            'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
            'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
            'groundOutsPitching', 'runsPitching', 'doublesPitching',
            'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
            'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
            'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
            'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
            'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
            'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
            'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
            'inheritedRunnersScored', 'catchersInterferencePitching',
            'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
            'assists', 'putOuts', 'errors', 'chances',
        ]

        self.targets_cols = ['playerId'] + target_names + ['date']
        self.players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight', 'playerForTestSetAndFuturePreds']
        self.rosters_cols = ['playerId', 'teamId', 'status', 'date']
        self.salaries_cols = ['teamId', 'salary', 'year']
        self.standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack', 'date']
        self.transactions_cols = ['playerId', 'transaction_flag', 'date']

        # Column layout of the rows produced by calc_probs: id, seven
        # statistics per target, then the six pairwise target correlations.
        stat_names = ['mean', 'median', 'std', 'min', 'max', 'skew', 'kurt']
        self.stat_cols = (
            ['playerId']
            + [f'{target}_{stat}' for target in target_names for stat in stat_names]
            + ['tgt1_2_corr', 'tgt1_3_corr', 'tgt2_3_corr', 'tgt1_4_corr', 'tgt2_4_corr', 'tgt3_4_corr']
        )

        self.scores_cols = ['playerId'] + box_score_stats + ['date']

        # Per-target feature lists.  Each is a separate list object because
        # make_feature extends them in place.
        label_cols = ['label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_status']
        tail_cols = box_score_stats + ['season_info', 'wildCardRank']

        self.feature_cols1 = ['week_day'] + label_cols + tail_cols
        self.feature_cols2 = label_cols + tail_cols
        self.feature_cols3 = ['week_day'] + label_cols + tail_cols
        self.feature_cols4 = [
            'week_day', 'annual_day', 'month', 'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_birthCity',
            'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight',
            'label_status',
        ] + tail_cols

        self.test_players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight']
        self.test_rosters_cols = ['playerId', 'teamId', 'status']
        self.test_standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack']
        self.test_scores_cols = ['playerId'] + box_score_stats

    @staticmethod
    def _append_missing(cols, new_cols):
        """Extend ``cols`` in place with the entries of ``new_cols`` it lacks."""
        for name in new_cols:
            if name not in cols:
                cols.append(name)

    def make_feature(self, train_elements_dict):
        """Join the raw tables into the training frame.

        Parameters
        ----------
        train_elements_dict : dict with the DataFrames ``'players'``,
            ``'rosters'``, ``'targets'``, ``'scores'``, ``'seasons'`` and
            ``'standings'``.  The ``players`` frame gains the columns
            ``DOY``, ``mlbDebutYear`` and ``DebutAge`` in place.

        Returns
        -------
        (train, train_features_dict): the feature frame restricted to
        in-season dates and to players flagged
        ``playerForTestSetAndFuturePreds``, and a dict with the target
        statistics tables, last-game dates, label encoders and the four
        feature-column lists.

        The feature lists on ``self`` are extended in place.  Columns already
        present are skipped, so calling this method again does not duplicate
        features (it used to).

        NOTE(review): the target statistics are computed from rows dated on or
        after ``VALID_START_DATE``, which is the same window that
        ``train_and_evaluate`` validates on, so validation scores may be
        optimistic — confirm this is intended.
        """
        players = train_elements_dict['players']
        rosters = train_elements_dict['rosters']
        targets = train_elements_dict['targets']
        scores = train_elements_dict['scores']
        seasons = train_elements_dict['seasons']
        standings = train_elements_dict['standings']

        print('calc target stat ... ', end="")

        ## target stats
        targets_train = targets.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        targets_train = targets_train[(targets_train['date'] >= self.VALID_START_DATE)]

        # One pass per group instead of re-filtering the whole frame for every
        # id.  sort=False keeps first-appearance order, and rows with a missing
        # teamId are left out of the team groups, as before.  Starting from an
        # empty frame also keeps both tables defined when there are no groups.
        player_target_stats = pd.DataFrame(columns=self.stat_cols)
        for pid, temp in tqdm(targets_train.groupby('playerId', sort=False)):
            player_target_stats = calc_probs(pid, player_target_stats, temp)

        team_target_stats = pd.DataFrame(columns=self.stat_cols)
        for team_id, temp in tqdm(targets_train.groupby('teamId', sort=False)):
            team_target_stats = calc_probs(team_id, team_target_stats, temp)

        # The team table keeps 'playerId' as its key column name (it holds the
        # teamId values); every other column gets a 'team_' prefix.
        team_stat_cols = self.stat_cols[:1] + ["team_" + word for word in self.stat_cols[1:]]
        team_target_stats.columns = team_stat_cols

        # Player statistics without the six correlation columns, then all team
        # statistics, then (optionally) the timeline feature.
        extra_features = self.stat_cols[1:-6] + team_stat_cols[1:]
        if self.usetimelinefeature:
            extra_features = extra_features + ['daysSinceLastGame']
        for cols in (self.feature_cols1, self.feature_cols2, self.feature_cols3, self.feature_cols4):
            self._append_missing(cols, extra_features)

        print('done.')

        print('preprocess ... ', end="")
        # Salary features are currently disabled (self.salaries_cols is unused).

        ## seasons: 'YYYY-MM-DD' strings -> yyyymmdd integers, missing -> 0
        seasons = seasons.fillna('0000-00-00')
        for c_ in seasons.columns[1:]:
            seasons[c_] = seasons[c_].str.replace('-', '').astype(int)

        ## players (adds columns to the caller's frame in place)
        players['DOY'] = pd.to_datetime(players['DOB'], format="%Y-%m-%d").dt.year
        players['mlbDebutYear'] = pd.to_datetime(players['mlbDebutDate'], format="%Y-%m-%d").dt.year
        players['DebutAge'] = players['mlbDebutYear'] - players['DOY']

        print('done.')

        print('creat feature ... ', end="")
        # creat feature
        train = targets[self.targets_cols].merge(players[self.players_cols], on=['playerId'], how='left')
        train = train.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        train = train.merge(scores[self.scores_cols], on=['playerId', 'date'], how='left')
        train = train.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])
        train = train.merge(standings[self.standings_cols], on=['teamId', 'date'], how='left')
        train = train.merge(team_target_stats, how='left', left_on=["teamId"],right_on=["playerId"], suffixes=('', 'team_'))
        date_ = pd.to_datetime(train['date'], format="%Y%m%d")
        # Days elapsed since January 1st of the same year.
        train['annual_day'] = (date_ - pd.to_datetime(date_.dt.year, format="%Y")) /  timedelta(days=1)
        train['week_day'] = date_.dt.weekday
        train['month'] = date_.dt.month
        train['year'] = date_.dt.year

        # label encoding (codes follow order of first appearance)
        player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
        position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
        birthCityn2num = {c: i for i, c in enumerate(train['birthCity'].unique())}
        teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
        status2num = {c: i for i, c in enumerate(train['status'].unique())}
        train['label_playerId'] = train['playerId'].map(player2num)
        train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
        train['label_birthCity'] = train['birthCity'].map(birthCityn2num)
        train['label_teamId'] = train['teamId'].map(teamid2num)
        train['label_status'] = train['status'].map(status2num)

        ## season_info: 1 = pre-season, 2 = regular season, 3 = post-season,
        ## 4 = special day (end of 1st half, All-Star date, start of 2nd half)
        on_preseason_idxes = extract_season(train['date'], seasons[['preSeasonStartDate', 'preSeasonEndDate']])
        on_season_idxes = extract_season(train['date'], seasons[['regularSeasonStartDate', 'regularSeasonEndDate']]) * 2
        on_postseason_idxes = extract_season(train['date'], seasons[['postSeasonStartDate', 'postSeasonEndDate']]) * 3

        special_days = seasons['lastDate1stHalf'].to_list() + seasons['allStarDate'].to_list() + seasons['firstDate2ndHalf'].to_list()
        special_idxes = 0
        for day in special_days:
            special_idxes += (train['date'] == day) * 4

        # Later assignments take precedence over earlier ones.
        on_total_season_idxes = on_preseason_idxes
        on_total_season_idxes[on_season_idxes==2] = 2
        on_total_season_idxes[on_postseason_idxes==3] = 3
        on_total_season_idxes[special_idxes==4] = 4

        train['season_info'] = on_total_season_idxes

        ## only on season
        on_whole_idxes = extract_season(train['date'], seasons[['seasonStartDate', 'seasonEndDate']])
        train = train[on_whole_idxes == 1].reset_index(drop=True)

        ## game_info: a row counts as a game day when a box score was joined
        train['gameday'] = train['battingOrder'].notna() * 1
        train = train.sort_values(by=['playerId', 'date'], ascending=True)

        # Position inside the current run of rows sharing playerId and gameday,
        # forced to 0 on game days.
        train = count_consecutive_items_n_cols(train, ['playerId', 'gameday'], 'daysSinceLastGame')
        train.loc[train['gameday']==1, 'daysSinceLastGame'] = 0

        # Date of each player's last game row; players without one get 20171231.
        train_game = train[train['gameday']==1]
        train_last_game = train_game.drop_duplicates(subset='playerId', keep='last')[['playerId', 'date']]
        train_last_game.columns = ['playerId', 'lastdate']
        train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
        train_last_game = pd.merge(train_player_unique, train_last_game, on=['playerId'], how='left')
        train_last_game = train_last_game.fillna(20171231)

        ## only test_player
        train = train[train['playerForTestSetAndFuturePreds']==True].reset_index(drop=True)

        print('done.')

        train_features_dict = {'player_target_stats': player_target_stats,
                                'team_target_stats': team_target_stats,
                                'train_last_game': train_last_game,
                                'player2num': player2num,
                                'position2num': position2num,
                                'birthCityn2num': birthCityn2num,
                                'teamid2num': teamid2num,
                                'status2num': status2num,
                                'feature_cols1': self.feature_cols1,
                                'feature_cols2': self.feature_cols2,
                                'feature_cols3': self.feature_cols3,
                                'feature_cols4': self.feature_cols4
                              }

        return train, train_features_dict

    def train_and_evaluate(self, train, isgamedayonly=False):
        """Fit one model per target on a date split and report validation MAE.

        Parameters
        ----------
        train : feature frame produced by ``make_feature``.
        isgamedayonly : when True, only rows with ``gameday == 1`` are used.

        Returns
        -------
        (oof_df, models): the validation rows of ``self.targets_cols`` with
        the four target columns replaced by the predictions, and a NumPy array
        holding the four fitted models in target order.

        Rows dated before ``VALID_START_DATE`` are used for fitting and the
        remaining rows for validation / early stopping.
        """
        if isgamedayonly:
            train = train[train['gameday'] == 1].reset_index(drop=True)

        target_names = ['target1', 'target2', 'target3', 'target4']
        train_y = train[target_names]

        tr_idx = (train['date'].astype(int) < self.VALID_START_DATE)
        val_idx = ~tr_idx
        # Positional row numbers of the validation rows.  Using positions
        # rather than index labels keeps the writes below correct even when
        # the frame's index is not 0..n-1.
        val_pos = np.flatnonzero(val_idx.to_numpy())

        x_train = train.loc[tr_idx].reset_index(drop=True)
        y_train = train_y.loc[tr_idx].reset_index(drop=True)
        x_valid = train.loc[val_idx].reset_index(drop=True)
        y_valid = train_y.loc[val_idx].reset_index(drop=True)

        feature_sets = [self.feature_cols1, self.feature_cols2, self.feature_cols3, self.feature_cols4]
        param_sets = [params1, params2, params3, params4]

        # -1 marks rows that received no prediction (the fitting rows).
        oof = np.zeros(train_y.shape) - 1.0
        fitted_models = []
        target_scores = []
        for col_pos, (target, cols, lgbm_params) in enumerate(zip(target_names, feature_sets, param_sets)):
            pred, model, target_score = fit_lgbm(
                x_train[cols], y_train[target],
                x_valid[cols], y_valid[target],
                lgbm_params
            )
            oof[val_pos, col_pos] = pred
            fitted_models.append(model)
            target_scores.append(target_score)

        score = sum(target_scores) / 4
        print(f'score: {score}')

        mae = mean_absolute_error(y_valid[target_names].values, oof[val_pos, :])
        print("mae:", mae)

        # Explicit copy: the predictions are written into it below.
        oof_df = train[self.targets_cols].copy()
        oof_df.iloc[val_pos, 1:5] = oof[val_pos, :]

        models = np.array(fitted_models)

        return oof_df.iloc[val_pos], models
            

In [16]:
class Rt4kaidoTest:
    """Scores one day of test data with the pre-trained per-target models.

    Two model sets are held: ``models_gameday`` is applied to players that
    appear in the day's box scores, ``models_notgameday`` to everybody else.
    Each set is indexed 0..3 for target1..target4 and only needs to expose a
    ``predict`` method.

    Player master data is read from the notebook-level ``players`` frame,
    which therefore has to exist before ``test_oneline`` is called.
    """

    # Prediction columns of the submission frame, in model order.
    _TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

    def __init__(self, train_features_dict, models_notgameday, models_gameday, usetimelinefeature=False):
        """Store the models and the lookup tables produced at training time.

        Args:
            train_features_dict: dict holding the feature column lists
                (``feature_cols1`` .. ``feature_cols4``), the target statistics,
                the label-encoding maps and, for the timeline feature,
                ``train_last_game``.
            models_notgameday: four models used for players without a box score.
            models_gameday: four models used for players with a box score.
            usetimelinefeature: when True, ``daysSinceLastGame`` is computed and
                ``train_features_dict['train_last_game']`` is updated on every call.
        """
        self.usetimelinefeature = usetimelinefeature
        self.train_features_dict = train_features_dict
        self.feature_cols1 = train_features_dict['feature_cols1']
        self.feature_cols2 = train_features_dict['feature_cols2']
        self.feature_cols3 = train_features_dict['feature_cols3']
        self.feature_cols4 = train_features_dict['feature_cols4']
        self.models_notgameday = models_notgameday
        self.models_gameday = models_gameday

        # Columns taken from each source table when assembling the feature row.
        self.test_players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight']
        self.test_rosters_cols = ['playerId', 'teamId', 'status']
        self.test_standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack']
        self.test_scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances']

    @staticmethod
    def _load_nested_frame(test_df, key):
        """Decode the JSON-like text in the first row of ``test_df[key]``; None if the cell is missing."""
        raw = test_df[key].iloc[0]
        if pd.isna(raw):
            return None
        # SECURITY NOTE: eval() executes whatever expression the cell contains, so
        # this is only acceptable for trusted input. The namespace is limited to
        # the three JSON literals the text is expected to use.
        json_literals = {'null': np.nan, 'true': True, 'false': False}
        return pd.DataFrame(eval(raw, json_literals))

    @staticmethod
    def _nan_frame_per_player(player_ids, cols):
        """Build one row per player with every non-key column in ``cols`` set to NaN."""
        frame = pd.DataFrame({'playerId': player_ids})
        return frame.assign(**{col: np.nan for col in cols if col != 'playerId'})

    def _predict_rows(self, models, test, row_positions, sample_prediction_df):
        """Predict all four targets for ``row_positions`` and write them into the submission frame."""
        if len(row_positions) == 0:
            return
        test_X = test.iloc[row_positions]
        feature_sets = (self.feature_cols1, self.feature_cols2, self.feature_cols3, self.feature_cols4)
        for target, model, cols in zip(self._TARGET_COLS, models, feature_sets):
            # Single-step .iloc assignment: the former `df[col].iloc[rows] = ...`
            # form is chained indexing, which may write to a temporary object.
            col_pos = sample_prediction_df.columns.get_loc(target)
            sample_prediction_df.iloc[row_positions, col_pos] = np.clip(model.predict(test_X[cols]), 0, 100)

    def test_oneline(self, test_df, sample_prediction_df):
        """Predict target1..target4 for every player listed for a single date.

        Args:
            test_df: one-row frame indexed by the date (``YYYYMMDD``) whose
                ``rosters``, ``playerBoxScores`` and ``standings`` cells hold
                JSON-like text, or NaN when the table is unavailable that day.
            sample_prediction_df: submission rows for that date; needs
                ``date_playerId`` (``<date>_<playerId>``) and the four target
                columns. The caller's frame is not modified.

        Returns:
            A copy of ``sample_prediction_df`` with a fresh 0..n-1 index and the
            target columns filled with predictions clipped to [0, 100].

        Side effects:
            With ``usetimelinefeature`` enabled,
            ``self.train_features_dict['train_last_game']`` is replaced so that
            players with a box score today get today's date as ``lastdate``.
        """
        sample_prediction_df = sample_prediction_df.reset_index(drop=True)
        # Float targets up front so writing float predictions never needs an upcast.
        sample_prediction_df = sample_prediction_df.astype({col: float for col in self._TARGET_COLS})

        # create dataset
        sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                            .map(lambda x: int(x.split('_')[1]))
        player_ids = sample_prediction_df['playerId']

        # Dealing with missing values: each daily table may be absent.
        test_rosters = self._load_nested_frame(test_df, 'rosters')
        if test_rosters is None:
            test_rosters = self._nan_frame_per_player(player_ids, self.test_rosters_cols)

        test_scores = self._load_nested_frame(test_df, 'playerBoxScores')
        if test_scores is None:
            # NOTE(review): the all-NaN columns sum to 0 below, so on a day without
            # box scores every player ends up with gameday == 1. Kept as in the
            # original; confirm this matches how gameday was derived for training.
            test_scores = self._nan_frame_per_player(player_ids, self.test_scores_cols)

        test_standings = self._load_nested_frame(test_df, 'standings')
        if test_standings is None:
            # Standings are joined on teamId. An empty frame yields NaN standings
            # for everyone; rows with a NaN teamId would instead match every
            # player that has no team (merge pairs NaN keys) and duplicate rows.
            test_standings = pd.DataFrame({col: pd.Series(dtype='float64') for col in self.test_standings_cols})

        # Aggregate only the columns that are used; a player can have several rows per day.
        test_scores = test_scores[self.test_scores_cols].groupby('playerId').sum().reset_index()

        test = sample_prediction_df[['playerId']].copy()
        test = test.merge(players[self.test_players_cols], on='playerId', how='left')
        test = test.merge(test_rosters[self.test_rosters_cols], on='playerId', how='left')
        test = test.merge(test_scores, on='playerId', how='left')
        test = test.merge(self.train_features_dict['player_target_stats'], how='left', left_on=["playerId"],right_on=["playerId"])
        test = test.merge(test_standings[self.test_standings_cols], on='teamId', how='left')
        # team_target_stats carries the team id in a column named 'playerId'.
        test = test.merge(self.train_features_dict['team_target_stats'], how='left', left_on=["teamId"],right_on=["playerId"], suffixes=('', 'team_'))
        test['wildCardRank'] = test['wildCardRank'].astype(float)

        # Label-encode the categorical columns with the maps built at training time.
        test['label_playerId'] = test['playerId'].map(self.train_features_dict['player2num'])
        test['label_primaryPositionName'] = test['primaryPositionName'].map(self.train_features_dict['position2num'])
        test['label_teamId'] = test['teamId'].map(self.train_features_dict['teamid2num'])
        test['label_status'] = test['status'].map(self.train_features_dict['status2num'])
        test['label_birthCity'] = test['birthCity'].map(self.train_features_dict['birthCityn2num'])

        # Calendar features derived from the date in the index.
        date_ = pd.to_datetime(test_df.index[0], format="%Y%m%d")
        year_start = pd.Timestamp(year=date_.year, month=1, day=1)
        test['annual_day'] = (date_ - year_start) / timedelta(days=1)
        test['week_day'] = date_.weekday()
        test['month'] = date_.month
        test['season_info'] = 2
        # 1 when the player has a box-score row for the day, else 0.
        test['gameday'] = test['battingOrder'].notna().astype(int)

        if self.usetimelinefeature:
            test['date'] = test_df.index[0]
            test = pd.merge(test, self.train_features_dict['train_last_game'], on=['playerId'], how='left')
            test['daysSinceLastGame'] = (pd.to_datetime(test['date'], format="%Y%m%d") - pd.to_datetime(test['lastdate'], format="%Y%m%d")).dt.days

            # Roll the per-player last-game date forward for the next call. Assigning
            # through .loc on a copy replaces the former chained `df[col].update(...)`
            # and avoids the NaN-induced float round trip of the date.
            played_ids = test.loc[test['gameday'] == 1, 'playerId']
            last_game = self.train_features_dict['train_last_game'][['playerId', 'lastdate']].copy()
            last_game.loc[last_game['playerId'].isin(played_ids), 'lastdate'] = test_df.index[0]
            self.train_features_dict['train_last_game'] = last_game

        # Route each row to the model set matching its gameday flag.
        is_gameday = (test['gameday'] == 1).to_numpy()
        self._predict_rows(self.models_gameday, test, np.flatnonzero(is_gameday), sample_prediction_df)
        self._predict_rows(self.models_notgameday, test, np.flatnonzero(~is_gameday), sample_prediction_df)

        sample_prediction_df = sample_prediction_df.fillna(0.)

        return sample_prediction_df.drop(columns='playerId')

In [17]:
# Build the training table together with the dict of lookup tables / column lists
# (train_features_dict) that is handed to Rt4kaidoTest further down.
rt4kaido_train = Rt4kaidoTrain(usetimelinefeature=True)
train, train_features_dict = rt4kaido_train.make_feature(train_elements_dict)

calc target stat ... 

100%|██████████| 2061/2061 [00:46<00:00, 44.72it/s]
100%|██████████| 30/30 [00:00<00:00, 42.13it/s]


done.
preprocess ... done.
creat feature ... done.


In [18]:
# First model set (isgamedayonly=False); passed to Rt4kaidoTest as models_notgameday below.
oof_df, models = rt4kaido_train.train_and_evaluate(train, isgamedayonly=False)
# Optional persistence of the out-of-fold predictions and models (disabled):
# oof_df.to_csv(OUTPUT_DIR / f'oof{EXP_NUM}.csv')
# with open(OUTPUT_DIR / f"models{EXP_NUM}.pickle", mode="wb") as f:
#     pickle.dump(models, f)
    
# Second model set (isgamedayonly=True); passed to Rt4kaidoTest as models_gameday below.
oof_df_gameday, models_gameday = rt4kaido_train.train_and_evaluate(train, isgamedayonly=True)
# Optional persistence of the gameday out-of-fold predictions and models (disabled):
# oof_df_gameday.to_csv(OUTPUT_DIR / f'oof{EXP_NUM}_gameday.csv')
# with open(OUTPUT_DIR / f"models{EXP_NUM}_gameday.pickle", mode="wb") as f:
#     pickle.dump(models_gameday, f)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 1.20886
[200]	valid_0's l1: 1.19488
[300]	valid_0's l1: 1.18845
[400]	valid_0's l1: 1.18077
[500]	valid_0's l1: 1.17976
[600]	valid_0's l1: 1.17615
[700]	valid_0's l1: 1.17317
[800]	valid_0's l1: 1.16794
[900]	valid_0's l1: 1.16595
[1000]	valid_0's l1: 1.16302
[1100]	valid_0's l1: 1.16071
[1200]	valid_0's l1: 1.15883
[1300]	valid_0's l1: 1.15789
[1400]	valid_0's l1: 1.15745
[1500]	valid_0's l1: 1.15586
[1600]	valid_0's l1: 1.15554
[1700]	valid_0's l1: 1.15331
[1800]	valid_0's l1: 1.15186
Early stopping, best iteration is:
[1762]	valid_0's l1: 1.15185
mae: 1.1516272600241766
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 2.26423
[200]	valid_0's l1: 2.25831
[300]	valid_0's l1: 2.25867
[400]	valid_0's l1: 2.2584
Early stopping, best iteration is:
[319]	valid_0's l1: 2.25335
mae: 2.2531645313764117
Training until validation scores don't improve for 100 rounds
[100]	valid_0's

In [19]:
# Drop the training frame and the trainer; the cells below only use
# train_features_dict and the two fitted model sets.
del train, rt4kaido_train

In [20]:
# Inference wrapper: `models` serves as models_notgameday, `models_gameday` as models_gameday.
rt4kaido_test = Rt4kaidoTest(train_features_dict, models, models_gameday, usetimelinefeature=True)

## テストで取ってこれる一行はこんな感じ

In [21]:
# Load the example files and cut out a single day in the shape test_oneline expects:
# a one-row test frame indexed by date plus the submission rows for that same date.
example_sample_submission = pd.read_csv(MAIN_DATA_DIR / "example_sample_submission.csv")
example_test = pd.read_csv(MAIN_DATA_DIR / "example_test.csv")
# Keep only the first date of the example test set.
test_df = example_test.set_index('date').iloc[:1]
# Submission rows whose date equals that first date.
sample_prediction_df = example_sample_submission[example_sample_submission['date']==test_df.index[0]].set_index('date')

In [22]:
# Smoke test: score the single example day. With usetimelinefeature=True this call
# also updates train_features_dict['train_last_game'] (see Rt4kaidoTest.test_oneline).
sample_prediction_df = rt4kaido_test.test_oneline(test_df, sample_prediction_df)

In [23]:
# Show the filled-in submission rows for the example day.
sample_prediction_df

Unnamed: 0,date_playerId,target1,target2,target3,target4
0,20210427_656669,2.751419e+00,5.257084,4.098298e-02,1.409510
1,20210427_543475,2.013820e-01,1.930510,1.073928e+00,0.642219
2,20210427_592866,1.153860e-01,1.522780,4.494049e-02,0.804020
3,20210427_452678,2.512535e-01,3.218205,2.476781e-02,1.772659
4,20210427_570257,7.320239e-04,0.585323,3.939177e-03,0.224129
...,...,...,...,...,...
1182,20210427_593590,9.699521e-35,0.006127,1.290484e-21,0.000000
1183,20210427_642180,1.497223e-02,1.054770,4.181164e-02,0.544645
1184,20210427_663399,3.589244e-03,0.185310,0.000000e+00,0.074401
1185,20210427_664199,6.254566e-03,0.697302,2.647472e-03,0.295071
