In [1]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)
import optuna

import pickle
from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta

from sklearn.metrics import mean_absolute_error

import statistics as st
import lightgbm as lgbm
from scipy.stats import norm

In [2]:
sys.path.append('../../')
import src.utils as utils

## Param

In [3]:
EXP_NUM = 72
NFOLDS = 5
SEED = 42

In [4]:
# def set_seed(seed: int = 42):
#     random.seed(seed)
#     np.random.seed(seed)
#     os.environ["PYTHONHASHSEED"] = str(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)  # type: ignore
#     torch.backends.cudnn.deterministic = True  # type: ignore
#     torch.backends.cudnn.benchmark = False  # type: ignore
# set_seed(SEED)

## Dir

In [5]:
DATA_DIR = Path("/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/")
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = MAIN_DATA_DIR / 'train'
OUTPUT_DIR = Path('./output/')

In [6]:
players = pd.read_csv(MAIN_DATA_DIR / 'players.csv')

rosters = pd.read_csv(TRAIN_DIR / 'rosters_train.csv')
targets = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv')
scores = pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv')
scores = scores.groupby(['playerId', 'date']).sum().reset_index()
seasons = pd.read_csv(MAIN_DATA_DIR / 'seasons.csv')
salaries = pd.read_csv(MAIN_DATA_DIR / 'mlbSalaries.csv')
teams = pd.read_csv(MAIN_DATA_DIR / 'teams.csv')

standings = pd.read_csv(TRAIN_DIR / 'standings_train.csv')
playerTwitterFollowers = pd.read_csv(TRAIN_DIR / 'playerTwitterFollowers_train.csv')

# events = pd.read_csv(TRAIN_DIR / 'events_train.csv')
# events = events.groupby(['gameDate']).sum().reset_index()


In [7]:
train_elements_dict = {"players":players, 
                       "rosters":rosters, 
                       "targets":targets, 
                       "scores":scores, 
                       "seasons":seasons, 
                       "teams":teams, 
                       "standings":standings}

In [8]:
def map_team_name(name):
    names = name.split('-')
    result = ''
    for n in names:
        if n == 'st':
            n = 'st.'
        result += f' {n.capitalize()}'

    return result[1:]

In [9]:
def calc_corr(df):
    # 相関係数行列を作成
    corr_mat = df.corr(method='pearson')

    # 行（列）サイズを取得
    n = corr_mat.shape[0]
    corr_ary = []

    for i in range(n):
        for j in range(i):
            if i == j:
                continue
            corr_ary.append(corr_mat.iloc[i,j])

    return corr_ary

In [10]:
def calc_probs(pid,df,temp):
    to_append=[pid,'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','']
    targets=['target1','target2','target3','target4']
    z=1
    for target in targets:
        target_prob = temp[target].tolist()
        mean = np.mean(target_prob)
        std = np.std(target_prob)
        median = st.median(target_prob)
        distribution = norm(mean, std)
        min_weight = min(target_prob)
        max_weight = max(target_prob)
        values = list(np.linspace(min_weight, max_weight))
        probabilities = [distribution.pdf(v) for v in values]
        max_value = max(probabilities)
        max_index = probabilities.index(max_value)
        to_append[z]=mean
        to_append[z+1]=median
        to_append[z+2]=std
        to_append[z+3]=min_weight
        to_append[z+4]=max_weight
        to_append[z+5]=temp[target].skew()
        to_append[z+6]=temp[target].kurt()

        z=z+7
    corr_ = calc_corr(temp[['target1', 'target2', 'target3', 'target4']])
    to_append[z:] = corr_  
    df_length = len(df)
    df.loc[df_length] = to_append
    return df

In [11]:
def count_consecutive_items_n_cols(df, col_name_list, output_col):
    cum_sum_list = [
        (df[col_name] != df[col_name].shift(1)).cumsum().tolist() for col_name in col_name_list
    ]
    df[output_col] = df.groupby(
        ["_".join(map(str, x)) for x in zip(*cum_sum_list)]
    ).cumcount() + 1
    return df

In [12]:
def extract_season(date_raw, season_start_end):
    idxes = 0
    for raw in season_start_end.iloc():
        idx_ = ((date_raw >= raw.iloc[0]) & (date_raw <= raw.iloc[1])) * 1
        idxes += idx_
    return idxes

In [13]:
# def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
#     oof_pred = np.zeros(len(y_valid), dtype=np.float32)
#     model = lgbm.LGBMRegressor(**params)
#     model.fit(x_train, y_train, 
#         eval_set=[(x_valid, y_valid)],  
#         early_stopping_rounds=verbose, 
#         verbose=verbose)
#     oof_pred = model.predict(x_valid)
#     oof_pred = np.clip(oof_pred, 0, 100)
#     score = mean_absolute_error(oof_pred, y_valid)
#     print('mae:', score)
#     return oof_pred, model, score

In [14]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, verbose=100):
    def opt(trial):
        params = {
                'random_state': SEED,
                'objective':'mae',
                'n_estimators': 10000,
                'learning_rate': 0.1,
                'max_depth': trial.suggest_int('max_depth', 1, 20),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
                'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
                'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e3),
                'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e3),
                'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
                'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 20),
                'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
        }

        model_opt = lgbm.LGBMRegressor(**params)

        model_opt.fit(x_train, y_train, 
            eval_set=[(x_train, y_train), (x_valid, y_valid)],  
            early_stopping_rounds=verbose, 
            verbose=verbose)
        oof_pred = model_opt.predict(x_valid)
        oof_pred = np.clip(oof_pred, 0, 100)
        score = mean_absolute_error(oof_pred, y_valid)
        return -score
    return opt

In [15]:
def my_timeseries_fold(train):
    
    tr_idxs = []
    val_idxs = []
    
    tr_idx = (train['date'].astype(int) < 20200801)
    val_idx = (train['date'].astype(int) >= 20200801) & (train['date'].astype(int) < 20200901)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)

    tr_idx = (train['date'].astype(int) < 20200901)
    val_idx = (train['date'].astype(int) >= 20200901) & (train['date'].astype(int) < 20201001)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)

    tr_idx = (train['date'].astype(int) < 20201001)
    val_idx = (train['date'].astype(int) >= 20201001) & (train['date'].astype(int) < 20201028)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)

    tr_idx = (train['date'].astype(int) < 20210228)
    val_idx = (train['date'].astype(int) >= 20210228) & (train['date'].astype(int) < 20210401)
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)

    tr_idx = (train['date'].astype(int) < 20210401)
    val_idx = ~tr_idx
    tr_idxs.append(tr_idx)
    val_idxs.append(val_idx)
    
    return tr_idxs, val_idxs

In [16]:
class Rt4kaidoTrain:
    def __init__(self, usetimelinefeature=False):
        
        self.usetimelinefeature = usetimelinefeature
        self.targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
        self.players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight', 'playerForTestSetAndFuturePreds']
        self.rosters_cols = ['playerId', 'teamId', 'status', 'date']
        self.salaries_cols = ['teamId', 'salary', 'year']
        self.standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack', 'date']
        self.transactions_cols = ['playerId', 'transaction_flag', 'date']
        self.stat_cols = ["playerId", "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
                        "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
                        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
                        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt",
                        'tgt1_2_corr', 'tgt1_3_corr', 'tgt2_3_corr', 'tgt1_4_corr', 'tgt2_4_corr', 'tgt3_4_corr']

        self.scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances', 'date']

        self.feature_cols1 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols2 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols3 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols4 = ['week_day', 'annual_day', 'month', 'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_birthCity',
                        'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 
        
        
        # lightgbm
        self.params1 = {'objective':'mae',
                       'reg_alpha': 0.14947461820098767, 
                       'reg_lambda': 0.10185644384043743, 
                       'n_estimators': 3633, 
                       'learning_rate': 0.08046301304430488, 
                       'num_leaves': 674, 
                       'feature_fraction': 0.9101240539122566, 
                       'bagging_fraction': 0.9884451442950513, 
                       'bagging_freq': 8, 
                       'min_child_samples': 51}


        self.params2 = {'objective':'mae',
                       'reg_alpha': 0.1,
                       'reg_lambda': 0.1, 
                       'n_estimators': 80,
                       'learning_rate': 0.1,
                       'random_state': 42,
                       "num_leaves": 22}



        self.params3 = {'objective':'mae',
                       'reg_alpha': 0.1,
                       'reg_lambda': 0.1, 
                       'n_estimators': 10000,
                       'learning_rate': 0.1,
                       'random_state': 42,
                       "num_leaves": 100}

        self.params4 = {'objective':'mae',
                       'reg_alpha': 0.016468100279441976, 
                       'reg_lambda': 0.09128335764019105, 
                       'n_estimators': 9868, 
                       'learning_rate': 0.10528150510326864, 
                       'num_leaves': 157, 
                       'feature_fraction': 0.5419185713426886, 
                       'bagging_fraction': 0.2637405128936662, 
                       'bagging_freq': 19, 
                       'min_child_samples': 71}

    def make_feature(self, train_elements_dict):

        players = train_elements_dict['players']
        rosters = train_elements_dict['rosters']
        targets = train_elements_dict['targets']
        scores = train_elements_dict['scores']
        seasons = train_elements_dict['seasons']
        teams = train_elements_dict['teams']
        standings = train_elements_dict['standings']

        print('calc target stat ... ', end="")

        ## target stats
        targets_train = targets.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        targets_train = targets_train[(targets_train['date'] >= 20210401)]

        playerId_list = targets_train['playerId'].unique()
        player_target_probs = pd.DataFrame(columns = self.stat_cols)  
        for pid in tqdm(playerId_list):
            temp = targets_train[targets_train['playerId'] == pid]
            player_target_stats=calc_probs(pid,player_target_probs,temp)

        teamId_list = targets_train['teamId'].dropna().unique()
        team_target_probs = pd.DataFrame(columns = self.stat_cols)
        for pid in tqdm(teamId_list):
            temp = targets_train[targets_train['teamId'] == pid]
            team_target_stats=calc_probs(pid,team_target_probs,temp)

        team_stat_cols = self.stat_cols
        team_stat_cols = team_stat_cols[:1] + ["team_" + word for word in team_stat_cols[1:]]
        team_target_stats.columns = team_stat_cols

        self.feature_cols1 += self.stat_cols[1:-6]
        self.feature_cols2 += self.stat_cols[1:-6]
        self.feature_cols3 += self.stat_cols[1:-6]
        self.feature_cols4 += self.stat_cols[1:-6]

        self.feature_cols1 += team_stat_cols[1:]
        self.feature_cols2 += team_stat_cols[1:]
        self.feature_cols3 += team_stat_cols[1:]
        self.feature_cols4 += team_stat_cols[1:]

        print('done.')

        print('preprocess ... ', end="")
        ## salaries
        # salaries = salaries.groupby(['year', 'team']).sum()['salary'].reset_index()
        # salaries['team'] = salaries['team'].apply(map_team_name)
        # salaries = salaries.merge(teams, left_on='team', right_on='name', how='inner')
        # salaries = salaries.rename(columns={'id': 'teamId'})

        ## seasons
        seasons = seasons.fillna('0000-00-00')
        for c_ in seasons.columns[1:]:
            seasons[c_] = seasons[c_].str.replace('-', '').astype(int)

        ## players
        players['DOY'] = pd.to_datetime(players['DOB'], format="%Y-%m-%d").dt.year
        players['mlbDebutYear'] = pd.to_datetime(players['mlbDebutDate'], format="%Y-%m-%d").dt.year
        players['DebutAge'] = players['mlbDebutYear'] - players['DOY']

        print('done.')

        print('creat feature ... ', end="")
        # creat feature
        train = targets[self.targets_cols].merge(players[self.players_cols], on=['playerId'], how='left')
        train = train.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        train = train.merge(scores[self.scores_cols], on=['playerId', 'date'], how='left')
        train = train.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])
        train = train.merge(standings[self.standings_cols], on=['teamId', 'date'], how='left')
        train = train.merge(team_target_stats, how='left', left_on=["teamId"],right_on=["playerId"], suffixes=('', 'team_'))
        date_ = pd.to_datetime(train['date'], format="%Y%m%d")
        train['annual_day'] = (date_ - pd.to_datetime(date_.dt.year, format="%Y")) /  timedelta(days=1)
        train['week_day'] = date_.dt.weekday
        train['month'] = date_.dt.month
        train['year'] = date_.dt.year

        # label encoding
        player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
        position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
        birthCityn2num = {c: i for i, c in enumerate(train['birthCity'].unique())}
        teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
        status2num = {c: i for i, c in enumerate(train['status'].unique())}
        train['label_playerId'] = train['playerId'].map(player2num)
        train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
        train['label_birthCity'] = train['birthCity'].map(birthCityn2num)
        train['label_teamId'] = train['teamId'].map(teamid2num)
        train['label_status'] = train['status'].map(status2num)

        ## season_info
        on_preseason_idxes = extract_season(train['date'], seasons[['preSeasonStartDate', 'preSeasonEndDate']])
        on_season_idxes = extract_season(train['date'], seasons[['regularSeasonStartDate', 'regularSeasonEndDate']]) * 2
        on_postseason_idxes = extract_season(train['date'], seasons[['postSeasonStartDate', 'postSeasonEndDate']]) * 3

        special_days = seasons['lastDate1stHalf'].to_list() + seasons['allStarDate'].to_list() + seasons['firstDate2ndHalf'].to_list()
        special_idxes = 0
        for day in special_days:
            special_idxes += (train['date'] == day) * 4

        on_total_season_idxes = on_preseason_idxes
        on_total_season_idxes[on_season_idxes==2] = 2
        on_total_season_idxes[on_postseason_idxes==3] = 3
        on_total_season_idxes[special_idxes==4] = 4

        train['season_info'] = on_total_season_idxes

        ## only on season
        on_whole_idxes = extract_season(train['date'], seasons[['seasonStartDate', 'seasonEndDate']])
        train = train[on_whole_idxes == 1].reset_index(drop=True)

        # train = train.merge(playerTwitterFollowers, how='left', on=["playerId", 'date'])

        ## only test_player
        train = train[train['playerForTestSetAndFuturePreds']==True].reset_index(drop=True)

        print('done.')
        
        train_features_dict = {'players': players,
                                'player_target_stats': player_target_stats,
                                'team_target_stats': team_target_stats,
                                'player2num': player2num, 
                                'position2num': position2num, 
                                'birthCityn2num': birthCityn2num,
                                'teamid2num': teamid2num,
                                'status2num': status2num,
                                'feature_cols1': self.feature_cols1,
                                'feature_cols2': self.feature_cols2,
                                'feature_cols3': self.feature_cols3,
                                'feature_cols4': self.feature_cols4
                              }
        
        if self.usetimelinefeature:
            ## game_info
            train['gameday'] = ~train['battingOrder'].isna()*1
            train.sort_values(by=['playerId','date'],inplace=True,ascending=True)

            train=count_consecutive_items_n_cols(train,['playerId','gameday'],'daysSinceLastGame')
            train.loc[train['gameday']==1,'daysSinceLastGame'] = 0

            train_game = train[train['gameday']==1]
            train_last_game = train_game[~train_game.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
            train_last_game.columns = ['playerId', 'lastdate']
            train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
            train_last_game = pd.merge(train_player_unique, train_last_game, on=['playerId'], how='left' )
            train_last_game = train_last_game.fillna(20171231)
            
            train_features_dict['train_last_game'] = train_last_game
            self.feature_cols1 += ['daysSinceLastGame']
            self.feature_cols2 += ['daysSinceLastGame']
            self.feature_cols3 += ['daysSinceLastGame']
            self.feature_cols4 += ['daysSinceLastGame']
            
            
            ## rosters_info
            train['rosterday'] = ~train['status'].isna()*1
            train.sort_values(by=['playerId','date'],inplace=True,ascending=True)

            train=count_consecutive_items_n_cols(train,['playerId','rosterday'],'daysSinceLastRoster')
            train.loc[train['rosterday']==1,'daysSinceLastRoster'] = 0

            train_roster= train[train['rosterday']==1]
            train_last_roster = train_roster[~train_roster.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
            train_last_roster.columns = ['playerId', 'lastroster']
            train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
            train_last_roster = pd.merge(train_player_unique, train_last_roster, on=['playerId'], how='left' )
            train_last_roster = train_last_roster.fillna(20171231)
            
            train_features_dict['train_last_roster'] = train_last_roster
            self.feature_cols1 += ['daysSinceLastRoster']
            self.feature_cols2 += ['daysSinceLastRoster']
            self.feature_cols3 += ['daysSinceLastRoster']
            self.feature_cols4 += ['daysSinceLastRoster']


        return train, train_features_dict

    
    def train_and_evaluate(self, train, isgamedayonly=False):
        
        if isgamedayonly:
            train = train[train['gameday'] == 1].reset_index(drop=True)
            
        train_X = train
        train_y = train[['target1', 'target2', 'target3', 'target4']]

        oof = np.zeros(train_y.shape) - 1.0
        y_valids = np.zeros(train_y.shape) - 1.0

        tr_idx = (train['date'].astype(int) < 20210401)
        val_idx = ~tr_idx

        x_train = train_X.loc[tr_idx].reset_index(drop=True)
        y_train = train_y.loc[tr_idx].reset_index(drop=True)
        x_valid = train_X.loc[val_idx].reset_index(drop=True)
        y_valid = train_y.loc[val_idx].reset_index(drop=True)

        study1 = optuna.create_study(direction='maximize')
        study1.optimize(fit_lgbm(x_train[self.feature_cols1], y_train['target1'], 
                                 x_valid[self.feature_cols1], y_valid['target1']), n_trials=100)
        
        study2 = optuna.create_study(direction='maximize')
        study2.optimize(fit_lgbm(x_train[self.feature_cols2], y_train['target2'], 
                                 x_valid[self.feature_cols2], y_valid['target2']), n_trials=100)
        
        study3 = optuna.create_study(direction='maximize')
        study3.optimize(fit_lgbm(x_train[self.feature_cols3], y_train['target3'], 
                                 x_valid[self.feature_cols3], y_valid['target3']), n_trials=100)
        
        study4 = optuna.create_study(direction='maximize')
        study4.optimize(fit_lgbm(x_train[self.feature_cols4], y_train['target4'], 
                                 x_valid[self.feature_cols4], y_valid['target4']), n_trials=100)

#         oof1, model1, score1 = fit_lgbm(
#             x_train[self.feature_cols1], y_train['target1'],
#             x_valid[self.feature_cols1], y_valid['target1'],
# #             self.params1
#         )
#         oof2, model2, score2 = fit_lgbm(
#             x_train[self.feature_cols2], y_train['target2'],
#             x_valid[self.feature_cols2], y_valid['target2'],
# #             self.params2
#         )
#         oof3, model3, score3 = fit_lgbm(
#             x_train[self.feature_cols3], y_train['target3'],
#             x_valid[self.feature_cols3], y_valid['target3'],
# #             self.params3
#         )
#         oof4, model4, score4 = fit_lgbm(
#             x_train[self.feature_cols4], y_train['target4'],
#             x_valid[self.feature_cols4], y_valid['target4'],
# #             self.params4
#         )

#         score = (score1+score2+score3+score4) / 4
#         print(f'score: {score}')

#         oof[val_idx, 0] = oof1
#         oof[val_idx, 1] = oof2
#         oof[val_idx, 2] = oof3
#         oof[val_idx, 3] = oof4
#         y_valids[val_idx, 0] = y_valid['target1'].values
#         y_valids[val_idx, 1] = y_valid['target2'].values
#         y_valids[val_idx, 2] = y_valid['target3'].values
#         y_valids[val_idx, 3] = y_valid['target4'].values

#         mae = mean_absolute_error(y_valids[val_idx, :], oof[val_idx, :])
#         print("mae:", mae)

#         val_idx_num = val_idx[val_idx==True].index.to_list()

#         oof_df = train[self.targets_cols]
#         oof_df.iloc[val_idx_num, 1:5] = oof[val_idx_num, :]

#         models = np.array([model1, model2, model3, model4])

        return study1, study2, study3, study4
            

In [17]:
rt4kaido_train = Rt4kaidoTrain(usetimelinefeature=True)
train, train_features_dict = rt4kaido_train.make_feature(train_elements_dict)

calc target stat ... 

100%|██████████| 2061/2061 [00:45<00:00, 44.99it/s]
100%|██████████| 30/30 [00:00<00:00, 42.76it/s]


done.
preprocess ... done.
creat feature ... done.


In [None]:
study1, study2, study3, study4 = rt4kaido_train.train_and_evaluate(train, isgamedayonly=False)

[32m[I 2021-07-16 03:09:40,692][0m A new study created in memory with name: no-name-a1024e13-2096-4686-b713-f11d18c9d26d[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.825425	valid_1's l1: 1.15231
[200]	training's l1: 0.814252	valid_1's l1: 1.15372


[32m[I 2021-07-16 03:09:55,461][0m Trial 0 finished with value: -1.1505835553561288 and parameters: {'max_depth': 19, 'min_child_weight': 10, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_lambda': 1.8874891351227971, 'reg_alpha': 0.001548272148361639, 'feature_fraction': 0.7951806324801636, 'bagging_fraction': 0.2897145656576592, 'bagging_freq': 18, 'num_leaves': 877, 'min_child_samples': 59}. Best is trial 0 with value: -1.1505835553561288.[0m


Early stopping, best iteration is:
[126]	training's l1: 0.821731	valid_1's l1: 1.15118
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.887615	valid_1's l1: 1.2165
[200]	training's l1: 0.878817	valid_1's l1: 1.2003
[300]	training's l1: 0.874556	valid_1's l1: 1.19286
[400]	training's l1: 0.872271	valid_1's l1: 1.18837
[500]	training's l1: 0.867641	valid_1's l1: 1.17771
[600]	training's l1: 0.86616	valid_1's l1: 1.17434
[700]	training's l1: 0.863119	valid_1's l1: 1.16698
[800]	training's l1: 0.860379	valid_1's l1: 1.16223
[900]	training's l1: 0.859408	valid_1's l1: 1.16141
[1000]	training's l1: 0.854568	valid_1's l1: 1.15785
[1100]	training's l1: 0.85408	valid_1's l1: 1.15701
[1200]	training's l1: 0.853	valid_1's l1: 1.15467
Early stopping, best iteration is:
[1177]	training's l1: 0.853097	valid_1's l1: 1.15459


[32m[I 2021-07-16 03:10:12,375][0m Trial 1 finished with value: -1.1543661363292903 and parameters: {'max_depth': 4, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 0.7, 'reg_lambda': 0.15903845377103035, 'reg_alpha': 0.025094283490017345, 'feature_fraction': 0.8721169342296915, 'bagging_fraction': 0.36949839159289977, 'bagging_freq': 19, 'num_leaves': 497, 'min_child_samples': 72}. Best is trial 0 with value: -1.1505835553561288.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.927376	valid_1's l1: 1.27886
[200]	training's l1: 0.921094	valid_1's l1: 1.26575
[300]	training's l1: 0.916696	valid_1's l1: 1.25627
[400]	training's l1: 0.914103	valid_1's l1: 1.24998
[500]	training's l1: 0.913053	valid_1's l1: 1.24819
[600]	training's l1: 0.911723	valid_1's l1: 1.24508
[700]	training's l1: 0.911097	valid_1's l1: 1.24432
[800]	training's l1: 0.910384	valid_1's l1: 1.24248
[900]	training's l1: 0.909807	valid_1's l1: 1.24127
[1000]	training's l1: 0.909283	valid_1's l1: 1.23945
[1100]	training's l1: 0.908974	valid_1's l1: 1.23867
[1200]	training's l1: 0.908649	valid_1's l1: 1.23808
[1300]	training's l1: 0.908516	valid_1's l1: 1.23779
[1400]	training's l1: 0.90848	valid_1's l1: 1.23772
[1500]	training's l1: 0.908444	valid_1's l1: 1.2376
[1600]	training's l1: 0.908031	valid_1's l1: 1.23685
[1700]	training's l1: 0.908	valid_1's l1: 1.23676
[1800]	training's l1: 0.907811	valid_1's l1: 1.2362

[32m[I 2021-07-16 03:11:05,127][0m Trial 2 finished with value: -1.2355452150079809 and parameters: {'max_depth': 2, 'min_child_weight': 6, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_lambda': 0.05863782923801954, 'reg_alpha': 1.9824643642165893, 'feature_fraction': 0.4459233731561678, 'bagging_fraction': 0.893100540964213, 'bagging_freq': 18, 'num_leaves': 883, 'min_child_samples': 32}. Best is trial 0 with value: -1.1505835553561288.[0m


Early stopping, best iteration is:
[1846]	training's l1: 0.907733	valid_1's l1: 1.2358
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.869987	valid_1's l1: 1.19314
[200]	training's l1: 0.864509	valid_1's l1: 1.18238


[32m[I 2021-07-16 03:11:13,123][0m Trial 3 finished with value: -1.1822804385537233 and parameters: {'max_depth': 5, 'min_child_weight': 11, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_lambda': 47.00233928559371, 'reg_alpha': 0.001269073446808143, 'feature_fraction': 0.3958903642599827, 'bagging_fraction': 0.7667902930971371, 'bagging_freq': 13, 'num_leaves': 571, 'min_child_samples': 69}. Best is trial 0 with value: -1.1505835553561288.[0m


Early stopping, best iteration is:
[154]	training's l1: 0.864516	valid_1's l1: 1.18237
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.83891	valid_1's l1: 1.15465
[200]	training's l1: 0.833096	valid_1's l1: 1.15126


[32m[I 2021-07-16 03:11:22,692][0m Trial 4 finished with value: -1.1498193937605796 and parameters: {'max_depth': 13, 'min_child_weight': 18, 'subsample': 0.6, 'colsample_bytree': 0.5, 'reg_lambda': 0.0029576786367722036, 'reg_alpha': 0.057089458273734545, 'feature_fraction': 0.7109476893223634, 'bagging_fraction': 0.21714410924114447, 'bagging_freq': 20, 'num_leaves': 486, 'min_child_samples': 75}. Best is trial 4 with value: -1.1498193937605796.[0m


Early stopping, best iteration is:
[188]	training's l1: 0.83375	valid_1's l1: 1.15023
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.867671	valid_1's l1: 1.17775
[200]	training's l1: 0.861514	valid_1's l1: 1.16938
[300]	training's l1: 0.860534	valid_1's l1: 1.16827
[400]	training's l1: 0.857902	valid_1's l1: 1.16493
[500]	training's l1: 0.856718	valid_1's l1: 1.16341
[600]	training's l1: 0.855808	valid_1's l1: 1.1615
[700]	training's l1: 0.854104	valid_1's l1: 1.15945
[800]	training's l1: 0.85271	valid_1's l1: 1.15645
[900]	training's l1: 0.850502	valid_1's l1: 1.15544
[1000]	training's l1: 0.849143	valid_1's l1: 1.15398
Early stopping, best iteration is:
[982]	training's l1: 0.849397	valid_1's l1: 1.15378


[32m[I 2021-07-16 03:11:38,694][0m Trial 5 finished with value: -1.1532465630600637 and parameters: {'max_depth': 7, 'min_child_weight': 12, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 991.2043715090208, 'reg_alpha': 0.08909882486824161, 'feature_fraction': 0.2559808457209526, 'bagging_fraction': 0.2593694803528081, 'bagging_freq': 9, 'num_leaves': 352, 'min_child_samples': 96}. Best is trial 4 with value: -1.1498193937605796.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.84334	valid_1's l1: 1.15405
[200]	training's l1: 0.836923	valid_1's l1: 1.14542
[300]	training's l1: 0.835156	valid_1's l1: 1.1449
[400]	training's l1: 0.832288	valid_1's l1: 1.14443
Early stopping, best iteration is:
[349]	training's l1: 0.834096	valid_1's l1: 1.14416


[32m[I 2021-07-16 03:11:53,316][0m Trial 6 finished with value: -1.1439265845815219 and parameters: {'max_depth': 11, 'min_child_weight': 6, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_lambda': 0.4600778708947517, 'reg_alpha': 0.0011210935963340244, 'feature_fraction': 0.509186233880975, 'bagging_fraction': 0.3630745179550666, 'bagging_freq': 14, 'num_leaves': 269, 'min_child_samples': 78}. Best is trial 6 with value: -1.1439265845815219.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.873579	valid_1's l1: 1.19113
[200]	training's l1: 0.866068	valid_1's l1: 1.1785
[300]	training's l1: 0.86574	valid_1's l1: 1.17725


[32m[I 2021-07-16 03:11:59,058][0m Trial 7 finished with value: -1.1770813934561954 and parameters: {'max_depth': 6, 'min_child_weight': 9, 'subsample': 0.5, 'colsample_bytree': 0.9, 'reg_lambda': 414.84955990215076, 'reg_alpha': 58.82023819272894, 'feature_fraction': 0.6863044878412572, 'bagging_fraction': 0.2557862047362869, 'bagging_freq': 9, 'num_leaves': 983, 'min_child_samples': 13}. Best is trial 6 with value: -1.1439265845815219.[0m


Early stopping, best iteration is:
[256]	training's l1: 0.865807	valid_1's l1: 1.17721
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.868583	valid_1's l1: 1.19087
[200]	training's l1: 0.863188	valid_1's l1: 1.18268


[32m[I 2021-07-16 03:12:06,504][0m Trial 8 finished with value: -1.1825763497258281 and parameters: {'max_depth': 5, 'min_child_weight': 19, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_lambda': 3.462801997019065, 'reg_alpha': 1.137807342628682, 'feature_fraction': 0.4169767572519324, 'bagging_fraction': 0.6252827504812117, 'bagging_freq': 14, 'num_leaves': 739, 'min_child_samples': 67}. Best is trial 6 with value: -1.1439265845815219.[0m


Early stopping, best iteration is:
[173]	training's l1: 0.863192	valid_1's l1: 1.18267
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.925692	valid_1's l1: 1.27546
[200]	training's l1: 0.919718	valid_1's l1: 1.26113
[300]	training's l1: 0.915541	valid_1's l1: 1.25246
[400]	training's l1: 0.91442	valid_1's l1: 1.24942
[500]	training's l1: 0.912879	valid_1's l1: 1.2467
[600]	training's l1: 0.91163	valid_1's l1: 1.24429
[700]	training's l1: 0.911166	valid_1's l1: 1.24292
[800]	training's l1: 0.910661	valid_1's l1: 1.24169
[900]	training's l1: 0.909998	valid_1's l1: 1.24021
[1000]	training's l1: 0.909814	valid_1's l1: 1.23959
[1100]	training's l1: 0.909625	valid_1's l1: 1.23918
[1200]	training's l1: 0.909083	valid_1's l1: 1.23807
[1300]	training's l1: 0.909025	valid_1's l1: 1.23808
[1400]	training's l1: 0.908835	valid_1's l1: 1.23734
[1500]	training's l1: 0.908236	valid_1's l1: 1.23652
[1600]	training's l1: 0.908209	valid_1's l1: 1.23651
[1700]	training

[32m[I 2021-07-16 03:13:07,802][0m Trial 9 finished with value: -1.2337770723004067 and parameters: {'max_depth': 2, 'min_child_weight': 19, 'subsample': 0.5, 'colsample_bytree': 0.6, 'reg_lambda': 1.9521029262036558, 'reg_alpha': 0.025944703700245285, 'feature_fraction': 0.21551060185797813, 'bagging_fraction': 0.9408788237086556, 'bagging_freq': 8, 'num_leaves': 646, 'min_child_samples': 85}. Best is trial 6 with value: -1.1439265845815219.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.88185	valid_1's l1: 1.19938
[200]	training's l1: 0.878187	valid_1's l1: 1.19045
[300]	training's l1: 0.877573	valid_1's l1: 1.18885
[400]	training's l1: 0.876132	valid_1's l1: 1.18605
[500]	training's l1: 0.875414	valid_1's l1: 1.18469
[600]	training's l1: 0.874218	valid_1's l1: 1.1831


[32m[I 2021-07-16 03:13:33,858][0m Trial 10 finished with value: -1.182716869556522 and parameters: {'max_depth': 14, 'min_child_weight': 2, 'subsample': 0.9, 'colsample_bytree': 0.6, 'reg_lambda': 0.0027157331772427293, 'reg_alpha': 736.9538533331473, 'feature_fraction': 0.5630728549251931, 'bagging_fraction': 0.4815733143117289, 'bagging_freq': 1, 'num_leaves': 63, 'min_child_samples': 39}. Best is trial 6 with value: -1.1439265845815219.[0m


Early stopping, best iteration is:
[539]	training's l1: 0.874598	valid_1's l1: 1.18298
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.843749	valid_1's l1: 1.15396
[200]	training's l1: 0.83905	valid_1's l1: 1.14822
[300]	training's l1: 0.837813	valid_1's l1: 1.14794
[400]	training's l1: 0.834606	valid_1's l1: 1.14516
[500]	training's l1: 0.827676	valid_1's l1: 1.14635
Early stopping, best iteration is:
[418]	training's l1: 0.832685	valid_1's l1: 1.14465


[32m[I 2021-07-16 03:13:50,138][0m Trial 11 finished with value: -1.1443673657476834 and parameters: {'max_depth': 12, 'min_child_weight': 15, 'subsample': 0.6, 'colsample_bytree': 0.5, 'reg_lambda': 0.0012534222255077814, 'reg_alpha': 0.0010455597029411258, 'feature_fraction': 0.6425412848510487, 'bagging_fraction': 0.43816635641833057, 'bagging_freq': 15, 'num_leaves': 260, 'min_child_samples': 99}. Best is trial 6 with value: -1.1439265845815219.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.852367	valid_1's l1: 1.15998
[200]	training's l1: 0.85097	valid_1's l1: 1.15882
[300]	training's l1: 0.848351	valid_1's l1: 1.15591
[400]	training's l1: 0.84447	valid_1's l1: 1.15077
[500]	training's l1: 0.841335	valid_1's l1: 1.14847
[600]	training's l1: 0.838435	valid_1's l1: 1.14767
[700]	training's l1: 0.835138	valid_1's l1: 1.14455
[800]	training's l1: 0.832225	valid_1's l1: 1.14399
Early stopping, best iteration is:
[742]	training's l1: 0.834046	valid_1's l1: 1.14376


[32m[I 2021-07-16 03:14:16,615][0m Trial 12 finished with value: -1.1433744915859865 and parameters: {'max_depth': 10, 'min_child_weight': 14, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_lambda': 0.042423179471462326, 'reg_alpha': 0.0013863211162166934, 'feature_fraction': 0.5522576490231054, 'bagging_fraction': 0.4789093257810468, 'bagging_freq': 14, 'num_leaves': 155, 'min_child_samples': 100}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.884295	valid_1's l1: 1.21122
[200]	training's l1: 0.884225	valid_1's l1: 1.21106
[300]	training's l1: 0.883784	valid_1's l1: 1.21047
[400]	training's l1: 0.881854	valid_1's l1: 1.20594
[500]	training's l1: 0.8787	valid_1's l1: 1.20074
[600]	training's l1: 0.87424	valid_1's l1: 1.19217
[700]	training's l1: 0.869014	valid_1's l1: 1.18196
[800]	training's l1: 0.86763	valid_1's l1: 1.17869
[900]	training's l1: 0.865044	valid_1's l1: 1.17494
[1000]	training's l1: 0.859098	valid_1's l1: 1.16447
[1100]	training's l1: 0.855923	valid_1's l1: 1.16078
[1200]	training's l1: 0.853748	valid_1's l1: 1.15759
[1300]	training's l1: 0.851335	valid_1's l1: 1.15336
[1400]	training's l1: 0.849904	valid_1's l1: 1.15257
[1500]	training's l1: 0.84854	valid_1's l1: 1.15154
[1600]	training's l1: 0.847306	valid_1's l1: 1.15046
[1700]	training's l1: 0.846312	valid_1's l1: 1.15038
[1800]	training's l1: 0.845156	valid_1's l1: 1.1492

[32m[I 2021-07-16 03:15:15,068][0m Trial 13 finished with value: -1.1450763197211513 and parameters: {'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.9, 'colsample_bytree': 0.6, 'reg_lambda': 0.040250894479830795, 'reg_alpha': 0.004728827234527957, 'feature_fraction': 0.532346729340247, 'bagging_fraction': 0.536663408744462, 'bagging_freq': 5, 'num_leaves': 33, 'min_child_samples': 88}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.85347	valid_1's l1: 1.164
[200]	training's l1: 0.849545	valid_1's l1: 1.158
[300]	training's l1: 0.847468	valid_1's l1: 1.15549
[400]	training's l1: 0.844341	valid_1's l1: 1.15336
[500]	training's l1: 0.841187	valid_1's l1: 1.15055
[600]	training's l1: 0.838819	valid_1's l1: 1.14838
[700]	training's l1: 0.836539	valid_1's l1: 1.14786
[800]	training's l1: 0.833971	valid_1's l1: 1.14576
[900]	training's l1: 0.832289	valid_1's l1: 1.14695
Early stopping, best iteration is:
[803]	training's l1: 0.833881	valid_1's l1: 1.14571


[32m[I 2021-07-16 03:15:45,453][0m Trial 14 finished with value: -1.1453763793504657 and parameters: {'max_depth': 16, 'min_child_weight': 15, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_lambda': 0.195065570509387, 'reg_alpha': 12.803342323299766, 'feature_fraction': 0.3018534903402847, 'bagging_fraction': 0.7074641164969618, 'bagging_freq': 12, 'num_leaves': 193, 'min_child_samples': 100}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.850679	valid_1's l1: 1.15989
[200]	training's l1: 0.847974	valid_1's l1: 1.15516
[300]	training's l1: 0.843197	valid_1's l1: 1.15166
[400]	training's l1: 0.839548	valid_1's l1: 1.14836
[500]	training's l1: 0.834174	valid_1's l1: 1.14829
Early stopping, best iteration is:
[452]	training's l1: 0.837265	valid_1's l1: 1.14731


[32m[I 2021-07-16 03:16:02,245][0m Trial 15 finished with value: -1.1470461440125512 and parameters: {'max_depth': 11, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_lambda': 0.014333382647731428, 'reg_alpha': 0.007033560120611144, 'feature_fraction': 0.513475308631619, 'bagging_fraction': 0.3959407047017276, 'bagging_freq': 16, 'num_leaves': 192, 'min_child_samples': 85}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.840386	valid_1's l1: 1.16167
[200]	training's l1: 0.83345	valid_1's l1: 1.15577
[300]	training's l1: 0.830629	valid_1's l1: 1.15424
[400]	training's l1: 0.828633	valid_1's l1: 1.15253
[500]	training's l1: 0.82637	valid_1's l1: 1.15058
[600]	training's l1: 0.825244	valid_1's l1: 1.14988
Early stopping, best iteration is:
[562]	training's l1: 0.825572	valid_1's l1: 1.14952


[32m[I 2021-07-16 03:16:27,551][0m Trial 16 finished with value: -1.1492850665498098 and parameters: {'max_depth': 9, 'min_child_weight': 15, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_lambda': 22.795938818779796, 'reg_alpha': 0.21492353155726673, 'feature_fraction': 0.9905152745242837, 'bagging_fraction': 0.5839275682973061, 'bagging_freq': 12, 'num_leaves': 394, 'min_child_samples': 47}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.858412	valid_1's l1: 1.16959
[200]	training's l1: 0.8546	valid_1's l1: 1.16285
[300]	training's l1: 0.850696	valid_1's l1: 1.15764
[400]	training's l1: 0.845494	valid_1's l1: 1.1508
[500]	training's l1: 0.841777	valid_1's l1: 1.14939
[600]	training's l1: 0.838892	valid_1's l1: 1.14921
[700]	training's l1: 0.836701	valid_1's l1: 1.14811
[800]	training's l1: 0.834897	valid_1's l1: 1.14816
Early stopping, best iteration is:
[784]	training's l1: 0.835353	valid_1's l1: 1.14767


[32m[I 2021-07-16 03:16:48,727][0m Trial 17 finished with value: -1.1472589821683363 and parameters: {'max_depth': 15, 'min_child_weight': 6, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_lambda': 0.3983626087173202, 'reg_alpha': 0.005914035251534058, 'feature_fraction': 0.3536591507302146, 'bagging_fraction': 0.346086466959584, 'bagging_freq': 16, 'num_leaves': 151, 'min_child_samples': 89}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.844529	valid_1's l1: 1.15552
[200]	training's l1: 0.840148	valid_1's l1: 1.15207
[300]	training's l1: 0.837092	valid_1's l1: 1.15035
[400]	training's l1: 0.83128	valid_1's l1: 1.14815
[500]	training's l1: 0.827503	valid_1's l1: 1.14751
[600]	training's l1: 0.8237	valid_1's l1: 1.14606
[700]	training's l1: 0.820472	valid_1's l1: 1.14644
Early stopping, best iteration is:
[626]	training's l1: 0.822614	valid_1's l1: 1.14587


[32m[I 2021-07-16 03:17:17,848][0m Trial 18 finished with value: -1.145375135311367 and parameters: {'max_depth': 18, 'min_child_weight': 13, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_lambda': 0.016145214039615684, 'reg_alpha': 0.22273607137858872, 'feature_fraction': 0.47568277828490246, 'bagging_fraction': 0.4796659649650338, 'bagging_freq': 11, 'num_leaves': 316, 'min_child_samples': 56}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.864354	valid_1's l1: 1.18101
[200]	training's l1: 0.861452	valid_1's l1: 1.17419
[300]	training's l1: 0.859186	valid_1's l1: 1.17023
[400]	training's l1: 0.85582	valid_1's l1: 1.16305
[500]	training's l1: 0.851346	valid_1's l1: 1.15622
[600]	training's l1: 0.848952	valid_1's l1: 1.15511
[700]	training's l1: 0.845521	valid_1's l1: 1.15162
[800]	training's l1: 0.843293	valid_1's l1: 1.15023
Early stopping, best iteration is:
[771]	training's l1: 0.843384	valid_1's l1: 1.15013


[32m[I 2021-07-16 03:17:48,078][0m Trial 19 finished with value: -1.149983824194756 and parameters: {'max_depth': 9, 'min_child_weight': 8, 'subsample': 0.8, 'colsample_bytree': 0.7, 'reg_lambda': 16.821751816948378, 'reg_alpha': 0.001090473507521142, 'feature_fraction': 0.6107398938534028, 'bagging_fraction': 0.6399057882501817, 'bagging_freq': 7, 'num_leaves': 87, 'min_child_samples': 78}. Best is trial 12 with value: -1.1433744915859865.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.889938	valid_1's l1: 1.21116
[200]	training's l1: 0.888592	valid_1's l1: 1.20767
[300]	training's l1: 0.887581	valid_1's l1: 1.20617
[400]	training's l1: 0.886467	valid_1's l1: 1.20273
[500]	training's l1: 0.884503	valid_1's l1: 1.19764
[600]	training's l1: 0.88267	valid_1's l1: 1.19221
[700]	training's l1: 0.881783	valid_1's l1: 1.19107


[32m[I 2021-07-16 03:18:01,457][0m Trial 20 finished with value: -1.1907437667359801 and parameters: {'max_depth': 17, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.6, 'reg_lambda': 0.5388595803099507, 'reg_alpha': 828.0802132324297, 'feature_fraction': 0.7826431760322485, 'bagging_fraction': 0.330223644118916, 'bagging_freq': 5, 'num_leaves': 428, 'min_child_samples': 99}. Best is trial 12 with value: -1.1433744915859865.[0m


Early stopping, best iteration is:
[699]	training's l1: 0.881785	valid_1's l1: 1.19106
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.842993	valid_1's l1: 1.15299
[200]	training's l1: 0.838695	valid_1's l1: 1.14982
[300]	training's l1: 0.836864	valid_1's l1: 1.14842
[400]	training's l1: 0.832529	valid_1's l1: 1.14586
[500]	training's l1: 0.828199	valid_1's l1: 1.14488
[600]	training's l1: 0.824036	valid_1's l1: 1.14531


In [23]:
print(f'best_score = {-study1.best_value}')
study1.best_trial.params

best_score = 1.1377244952432373


{'max_depth': 20,
 'min_child_weight': 6,
 'subsample': 0.6,
 'colsample_bytree': 0.7,
 'reg_lambda': 0.08135847923727514,
 'reg_alpha': 1.49433391288005,
 'feature_fraction': 0.5197130924180956,
 'bagging_fraction': 0.7648977376363217,
 'bagging_freq': 13,
 'num_leaves': 181,
 'min_child_samples': 17}

In [24]:
print(f'best_score = {-study2.best_value}')
study2.best_trial.params

best_score = 2.1926348140676515


{'max_depth': 16,
 'min_child_weight': 3,
 'subsample': 0.7,
 'colsample_bytree': 0.5,
 'reg_lambda': 0.004528826382334437,
 'reg_alpha': 1.9645054145486578,
 'feature_fraction': 0.9015395368511052,
 'bagging_fraction': 0.9958830097531277,
 'bagging_freq': 12,
 'num_leaves': 761,
 'min_child_samples': 96}

In [25]:
print(f'best_score = {-study3.best_value}')
study3.best_trial.params

best_score = 0.9225472847558365


{'max_depth': 5,
 'min_child_weight': 18,
 'subsample': 0.5,
 'colsample_bytree': 0.5,
 'reg_lambda': 0.02821111013127755,
 'reg_alpha': 0.0046946396301503586,
 'feature_fraction': 0.9780515566739537,
 'bagging_fraction': 0.7202933086435114,
 'bagging_freq': 11,
 'num_leaves': 137,
 'min_child_samples': 45}

In [26]:
print(f'best_score = {-study4.best_value}')
study4.best_trial.params

best_score = 1.5576931483677667


{'max_depth': 9,
 'min_child_weight': 8,
 'subsample': 0.6,
 'colsample_bytree': 0.8,
 'reg_lambda': 24.80468829330036,
 'reg_alpha': 0.3674254895182708,
 'feature_fraction': 0.8268416192212926,
 'bagging_fraction': 0.36802486339139545,
 'bagging_freq': 19,
 'num_leaves': 909,
 'min_child_samples': 62}