In [1]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)
import pickle
import optuna
from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta

from sklearn.metrics import mean_absolute_error

import statistics as st
import lightgbm as lgbm
from scipy.stats import norm

In [2]:
sys.path.append('../../')
import src.utils as utils

## Parameters

In [3]:
# Presumably the experiment number of this notebook run — TODO confirm; it is not
# referenced by the visible cells.
EXP_NUM = 92
# NOTE(review): not referenced by the visible cells — confirm it is still needed.
NFOLDS = 5
# Passed as `random_state` to the LightGBM regressors built in fit_lgbm.
SEED = 420
# pandas `query` expression applied to the training CSVs when they are loaded.
TRAIN_DATE = 'date < 20210701'

In [4]:
# def set_seed(seed: int = 42):
#     random.seed(seed)
#     np.random.seed(seed)
#     os.environ["PYTHONHASHSEED"] = str(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)  # type: ignore
#     torch.backends.cudnn.deterministic = True  # type: ignore
#     torch.backends.cudnn.benchmark = False  # type: ignore
# set_seed(SEED)

## Directories

In [5]:
# Root of the competition data. Set the MLB_DATA_DIR environment variable to point
# the notebook at another location; when it is unset the original author's path is
# used, so existing runs behave exactly as before.
DATA_DIR = Path(
    os.environ.get(
        "MLB_DATA_DIR",
        "/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/",
    )
)
# Competition dataset folder and its `train` sub-folder holding the dated CSVs.
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting-update'
TRAIN_DIR = MAIN_DATA_DIR / 'train'
# Output directory, relative to the notebook's working directory.
OUTPUT_DIR = Path('./output/')

In [6]:
def _read_train_table(file_name):
    """Read one CSV from TRAIN_DIR and keep only the rows selected by TRAIN_DATE."""
    return pd.read_csv(TRAIN_DIR / file_name).query(TRAIN_DATE)


# Tables read from the dataset root; no date filter is applied to them.
players = pd.read_csv(MAIN_DATA_DIR / 'players.csv')
seasons = pd.read_csv(MAIN_DATA_DIR / 'seasons.csv')
teams = pd.read_csv(MAIN_DATA_DIR / 'teams.csv')

# Dated training tables, each restricted by TRAIN_DATE.
rosters = _read_train_table('rosters_train.csv')
targets = _read_train_table('nextDayPlayerEngagement_train.csv')
# Box scores are collapsed to one row per (playerId, date) by summing the columns.
scores = (
    _read_train_table('playerBoxScores_train.csv')
    .groupby(['playerId', 'date'])
    .sum()
    .reset_index()
)
standings = _read_train_table('standings_train.csv')
playerTwitterFollowers = _read_train_table('playerTwitterFollowers_train.csv')
awards = _read_train_table('awards_train.csv')


In [7]:
# Bundle the loaded tables by name; this dict is the input of
# Rt4kaidoTrain.make_feature.
train_elements_dict = dict(
    players=players,
    rosters=rosters,
    targets=targets,
    scores=scores,
    seasons=seasons,
    teams=teams,
    standings=standings,
    awards=awards,
)

In [8]:
def map_team_name(name):
    """Turn a hyphen-separated team slug into a space-separated display name.

    Every hyphen-separated part is capitalised (first character upper-case, the
    rest lower-case); a part that is exactly ``'st'`` becomes ``'St.'``.

    Args:
        name: Slug such as ``'st-louis-cardinals'``.

    Returns:
        The display name, e.g. ``'St. Louis Cardinals'``.
    """
    parts = ('st.' if part == 'st' else part for part in name.split('-'))
    return ' '.join(part.capitalize() for part in parts)

In [9]:
def calc_corr(df):
    """Return the pairwise Pearson correlations between the columns of ``df``.

    Args:
        df: DataFrame whose columns are correlated with ``DataFrame.corr``.

    Returns:
        A list with the entries strictly below the diagonal of the correlation
        matrix, read row by row: (col1, col0), (col2, col0), (col2, col1), ...
        A frame with a single column yields an empty list.
    """
    # Build the correlation matrix.
    corr_matrix = df.corr(method='pearson')

    # Number of rows (equal to the number of columns).
    size = corr_matrix.shape[0]

    return [
        corr_matrix.iloc[row, col]
        for row in range(size)
        for col in range(row)
    ]

In [10]:
def calc_probs(pid, df, temp):
    """Append one row of target summary statistics for ``pid`` to ``df``.

    The row holds, in order: ``pid``; then for each of ``target1`` .. ``target4``
    the mean, median, standard deviation, minimum, maximum, skewness and
    kurtosis of that column of ``temp``; then the pairwise correlations between
    the four targets as returned by ``calc_corr``.

    The mean and standard deviation come from ``np.mean`` / ``np.std`` (NumPy's
    defaults), the median from ``statistics.median`` and the skewness and
    kurtosis from the pandas ``Series.skew`` / ``Series.kurt`` methods.

    Args:
        pid: Identifier stored in the first cell of the row (a player id or a
            team id at the call sites in this notebook).
        df: Accumulator frame; it is modified in place. Its number of columns
            must equal the length of the row (35 for four targets).
        temp: Frame with the columns ``target1`` .. ``target4`` for this ``pid``.

    Returns:
        ``df`` itself, with the new row stored under the label ``len(df)``.
    """
    target_names = ['target1', 'target2', 'target3', 'target4']

    row = [pid]
    for target in target_names:
        values = temp[target].tolist()
        row.extend([
            np.mean(values),
            st.median(values),
            np.std(values),
            min(values),
            max(values),
            temp[target].skew(),
            temp[target].kurt(),
        ])

    # Lower-triangle correlations between the four targets close the row.
    row.extend(calc_corr(temp[target_names]))

    # Appending by label keeps the accumulator object shared with the caller.
    df.loc[len(df)] = row
    return df

In [11]:
def count_consecutive_items_n_cols(df, col_name_list, output_col):
    """Number the rows inside each run of unchanged values, starting at 1.

    For every column in ``col_name_list`` a run id is derived that increases each
    time the value differs from the previous row. Rows that share the run ids of
    all listed columns form one group, and ``output_col`` receives the 1-based
    position of each row inside its group.

    Args:
        df: Frame to annotate; it is modified in place. The result depends on
            the current row order.
        col_name_list: Columns whose value changes start a new run.
        output_col: Name of the column that receives the counter.

    Returns:
        ``df`` itself, with ``output_col`` added.
    """
    run_ids = []
    for col_name in col_name_list:
        changed = df[col_name] != df[col_name].shift(1)
        run_ids.append(changed.cumsum().tolist())

    # One string key per row, e.g. "3_7" = third run of col A, seventh run of col B.
    group_keys = ["_".join(str(run_id) for run_id in row_ids) for row_ids in zip(*run_ids)]

    df[output_col] = df.groupby(group_keys).cumcount() + 1
    return df

In [12]:
def extract_season(date_raw, season_start_end):
    """Count, for every date, how many (start, end) rows contain it.

    Args:
        date_raw: Series of dates, compared with ``>=`` / ``<=`` against the
            bounds.
        season_start_end: Frame whose first column is the inclusive start and
            whose second column is the inclusive end of a period, one period per
            row.

    Returns:
        A Series aligned with ``date_raw`` holding the number of periods that
        contain each date, or the integer ``0`` when ``season_start_end`` has no
        rows.
    """
    periods = (season_start_end.iloc[pos] for pos in range(len(season_start_end)))
    return sum(
        ((date_raw >= period.iloc[0]) & (date_raw <= period.iloc[1])) * 1
        for period in periods
    )

In [13]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, verbose=100):
    """Build an Optuna objective that tunes a LightGBM regressor with MAE loss.

    Args:
        x_train, y_train: Training features and target.
        x_valid, y_valid: Validation features and target; they drive early
            stopping and the reported score.
        verbose: Used both as the early-stopping patience and as the logging
            period of ``fit``.

    Returns:
        A function ``opt(trial)`` that samples one parameter set, fits a model and
        returns the negated validation MAE of the predictions clipped to
        ``[0, 100]`` (so the study has to maximise it).

    NOTE(review): ``subsample`` / ``bagging_fraction`` and ``colsample_bytree`` /
    ``feature_fraction`` are listed as aliases in the LightGBM parameter docs, so
    each pair is presumably one effective parameter sampled twice — confirm which
    one LightGBM honours before trimming the search space.
    """
    def opt(trial):
        # The suggestion order is kept as is; it is part of the sampling sequence.
        search_space = dict(
            random_state=SEED,
            objective='mae',
            n_estimators=10000,
            learning_rate=0.1,
            max_depth=trial.suggest_int('max_depth', 1, 20),
            min_child_weight=trial.suggest_int('min_child_weight', 1, 20),
            subsample=trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
            colsample_bytree=trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
            reg_lambda=trial.suggest_loguniform('reg_lambda', 1e-3, 1e3),
            reg_alpha=trial.suggest_loguniform('reg_alpha', 1e-3, 1e3),
            feature_fraction=trial.suggest_uniform('feature_fraction', 0.2, 1.0),
            bagging_fraction=trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
            bagging_freq=trial.suggest_int('bagging_freq', 1, 20),
            num_leaves=trial.suggest_int('num_leaves', 10, 1000),
            min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
        )

        regressor = lgbm.LGBMRegressor(**search_space)
        regressor.fit(
            x_train,
            y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            early_stopping_rounds=verbose,
            verbose=verbose,
        )

        valid_pred = np.clip(regressor.predict(x_valid), 0, 100)
        return -mean_absolute_error(valid_pred, y_valid)

    return opt

In [14]:
# def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
#     oof_pred = np.zeros(len(y_valid), dtype=np.float32)
#     model = lgbm.LGBMRegressor(**params)
#     model.fit(x_train, y_train, 
#         eval_set=[(x_valid, y_valid)],  
#         early_stopping_rounds=verbose, 
#         verbose=verbose)
#     oof_pred = model.predict(x_valid)
#     oof_pred = np.clip(oof_pred, 0, 100)
#     score = mean_absolute_error(oof_pred, y_valid)
#     print('mae:', score)
#     return oof_pred, model, score

In [15]:
def my_timeseries_fold(train, fold_windows=None):
    """Build boolean train / validation masks for expanding-window time folds.

    For every ``(val_start, val_end)`` window the training mask selects the rows
    dated before ``val_start`` and the validation mask selects the rows with
    ``val_start <= date < val_end``.

    Args:
        train: Frame with a ``date`` column convertible to ``int`` (YYYYMMDD).
        fold_windows: Optional iterable of ``(val_start, val_end)`` pairs. When
            omitted, the five monthly windows originally hard-coded here are
            used: Aug 2020, Sep 2020, Apr 2021, May 2021 and Jun 2021.

    Returns:
        ``(tr_idxs, val_idxs)``: two lists of boolean Series aligned with
        ``train``, one entry per fold.
    """
    if fold_windows is None:
        fold_windows = (
            (20200801, 20200901),
            (20200901, 20201001),
            (20210401, 20210501),
            (20210501, 20210601),
            (20210601, 20210701),
        )

    # Convert once instead of once per comparison.
    dates = train['date'].astype(int)

    tr_idxs = []
    val_idxs = []
    for val_start, val_end in fold_windows:
        tr_idxs.append(dates < val_start)
        val_idxs.append((dates >= val_start) & (dates < val_end))

    return tr_idxs, val_idxs

In [16]:
class Rt4kaidoTrain:
    """Build the training feature table and tune one LightGBM model per target.

    ``make_feature`` joins the raw tables into one frame and extends the four
    per-target feature lists (``feature_cols1`` .. ``feature_cols4``);
    ``train_and_evaluate`` then runs one Optuna study per target on that frame.

    ``params1`` .. ``params4`` hold fixed LightGBM parameter sets; no method of
    this class reads them. ``salaries_cols`` and ``transactions_cols`` are likewise
    not read by any method of this class.
    """

    # Half-open date window [start, end) whose targets feed the per-player and
    # per-team target statistics.
    STAT_WINDOW = (20210601, 20210701)
    # Rows dated before this value are used for fitting, all other rows for validation.
    VALID_START = 20210401
    # Fill value for the "last date" of players without any game / roster row.
    NO_HISTORY_DATE = 20171231

    def __init__(self, usetimelinefeature=False):
        """Define the column selections, feature lists and fixed parameter sets.

        Args:
            usetimelinefeature: When true, ``make_feature`` also adds the
                ``daysSinceLastGame`` / ``daysSinceLastRoster`` features.
        """
        self.usetimelinefeature = usetimelinefeature
        self.targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
        self.players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'mlbDebutDate', 'DebutAge', 'heightInches', 'weight', 'playerForTestSetAndFuturePreds']
        self.rosters_cols = ['playerId', 'teamId', 'status', 'date']
        self.salaries_cols = ['teamId', 'salary', 'year']
        self.standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack', 'date']
        self.transactions_cols = ['playerId', 'transaction_flag', 'date']

        # playerId, then 7 statistics for each of the 4 targets, then the 6
        # pairwise target correlations (same order as calc_probs fills them).
        stat_names = ('mean', 'median', 'std', 'min', 'max', 'skew', 'kurt')
        self.stat_cols = (
            ['playerId']
            + [f'target{target_no}_{stat}' for target_no in range(1, 5) for stat in stat_names]
            + ['tgt1_2_corr', 'tgt1_3_corr', 'tgt2_3_corr', 'tgt1_4_corr', 'tgt2_4_corr', 'tgt3_4_corr']
        )

        # Box-score columns shared by the score selection and by every feature list.
        box_score_stats = [
            'battingOrder', 'gamesPlayedBatting', 'flyOuts',
            'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
            'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
            'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
            'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
            'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
            'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
            'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
            'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
            'groundOutsPitching', 'runsPitching', 'doublesPitching',
            'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
            'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
            'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
            'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
            'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
            'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
            'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
            'inheritedRunnersScored', 'catchersInterferencePitching',
            'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
            'assists', 'putOuts', 'errors', 'chances',
        ]
        self.scores_cols = ['playerId'] + box_score_stats + ['date']

        id_labels = ['label_playerId', 'label_primaryPositionName', 'label_teamId']
        plain_tail = ['season_info', 'wildCardRank', 'award_flag']
        debut_tail = ['season_info', 'wildCardRank', 'diffmlbDebutDateflag',
                      'sincemlbDebutDateflag', 'award_flag']

        # One feature list per target; each expression builds a separate list object.
        self.feature_cols1 = ['week_day'] + id_labels + ['label_status'] + box_score_stats + plain_tail
        self.feature_cols2 = id_labels + ['label_status'] + box_score_stats + plain_tail
        self.feature_cols3 = ['week_day'] + id_labels + ['label_status'] + box_score_stats + debut_tail
        self.feature_cols4 = (
            ['week_day', 'annual_day', 'month']
            + id_labels
            + ['label_birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight',
               'label_status']
            + box_score_stats
            + debut_tail
        )

        # lightgbm
        self.params1 = {'objective':'mae',
                       'reg_alpha': 0.14947461820098767,
                       'reg_lambda': 0.10185644384043743,
                       'n_estimators': 3633,
                       'learning_rate': 0.08046301304430488,
                       'num_leaves': 674,
                       'feature_fraction': 0.9101240539122566,
                       'bagging_fraction': 0.9884451442950513,
                       'bagging_freq': 8,
                       'min_child_samples': 51}

        self.params2 = {'objective':'mae',
                       'reg_alpha': 0.1,
                       'reg_lambda': 0.1,
                       'n_estimators': 80,
                       'learning_rate': 0.1,
                       'random_state': 42,
                       "num_leaves": 22}

        self.params3 = {'objective':'mae',
                       'reg_alpha': 0.1,
                       'reg_lambda': 0.1,
                       'n_estimators': 10000,
                       'learning_rate': 0.1,
                       'random_state': 42,
                       "num_leaves": 100}

        self.params4 = {'objective':'mae',
                       'reg_alpha': 0.016468100279441976,
                       'reg_lambda': 0.09128335764019105,
                       'n_estimators': 9868,
                       'learning_rate': 0.10528150510326864,
                       'num_leaves': 157,
                       'feature_fraction': 0.5419185713426886,
                       'bagging_fraction': 0.2637405128936662,
                       'bagging_freq': 19,
                       'min_child_samples': 71}

    @staticmethod
    def _extend_unique(columns, new_columns):
        """Append to ``columns`` (in place) every name of ``new_columns`` it lacks."""
        for name in new_columns:
            if name not in columns:
                columns.append(name)

    def _feature_col_lists(self):
        """Return the four per-target feature lists (the live list objects)."""
        return (self.feature_cols1, self.feature_cols2, self.feature_cols3, self.feature_cols4)

    def _add_days_since_feature(self, train, source_col, flag_col, feature_col, last_col):
        """Add a run-length counter for rows where ``source_col`` is missing.

        ``flag_col`` is set to 1 where ``source_col`` is present. After sorting by
        player and date, ``feature_col`` numbers the rows inside each run of equal
        ``(playerId, flag_col)`` values and is forced to 0 on flagged rows.

        Returns:
            ``(train, last_frame)`` where ``last_frame`` has one row per player
            with the date of the player's last flagged row in ``last_col``
            (``NO_HISTORY_DATE`` when the player has none).
        """
        train[flag_col] = ~train[source_col].isna() * 1
        train = train.sort_values(by=['playerId', 'date'], ascending=True)

        train = count_consecutive_items_n_cols(train, ['playerId', flag_col], feature_col)
        train.loc[train[flag_col] == 1, feature_col] = 0

        flagged = train[train[flag_col] == 1]
        last_rows = flagged[~flagged.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
        last_rows.columns = ['playerId', last_col]

        all_players = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
        last_frame = pd.merge(all_players, last_rows, on=['playerId'], how='left')
        last_frame = last_frame.fillna(self.NO_HISTORY_DATE)
        return train, last_frame

    def make_feature(self, train_elements_dict):
        """Join the raw tables into the training frame and register its features.

        The caller's tables are not modified (``players`` is copied before derived
        columns are added), and the feature lists are only extended with names
        they do not contain yet, so the method can be called repeatedly.

        Args:
            train_elements_dict: Dict with the frames ``players``, ``rosters``,
                ``targets``, ``scores``, ``seasons``, ``standings`` and
                ``awards``.

        Returns:
            ``(train, train_features_dict)``. ``train`` holds one row per
            target row that falls inside a season (more if the awards join
            matches several rows). ``train_features_dict`` holds the processed
            ``players`` / ``seasons`` tables, the player and team target
            statistics, the label-encoding dicts, the four feature lists, the
            full column lists of the dated input tables and, with
            ``usetimelinefeature``, the ``train_last_game`` /
            ``train_last_roster`` frames.

        NOTE(review): the statistics window ``STAT_WINDOW`` lies inside the
        validation period of ``train_and_evaluate`` and the statistics are merged
        onto every row by ``playerId`` / ``teamId`` — confirm this is intended.
        """
        # Copy: derived columns are added below and mlbDebutDate is converted to
        # a number, which must not leak into the caller's frame.
        players = train_elements_dict['players'].copy()
        rosters = train_elements_dict['rosters']
        targets = train_elements_dict['targets']
        scores = train_elements_dict['scores']
        seasons = train_elements_dict['seasons']
        standings = train_elements_dict['standings']
        awards = train_elements_dict['awards']

        print('calc target stat ... ', end="")

        ## target stats
        window_start, window_end = self.STAT_WINDOW
        targets_train = targets.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        targets_train = targets_train[(targets_train['date'] >= window_start) & (targets_train['date'] < window_end)]

        player_target_stats = pd.DataFrame(columns=self.stat_cols)
        for pid in tqdm(targets_train['playerId'].unique()):
            temp = targets_train[targets_train['playerId'] == pid]
            player_target_stats = calc_probs(pid, player_target_stats, temp)

        # The team frame is filled under the player column names and renamed
        # afterwards; its first column stays 'playerId' although it holds team ids.
        team_target_stats = pd.DataFrame(columns=self.stat_cols)
        for team_id in tqdm(targets_train['teamId'].dropna().unique()):
            temp = targets_train[targets_train['teamId'] == team_id]
            team_target_stats = calc_probs(team_id, team_target_stats, temp)

        team_stat_cols = self.stat_cols[:1] + ["team_" + word for word in self.stat_cols[1:]]
        team_target_stats.columns = team_stat_cols

        # Player statistics without the six correlations, team statistics with them.
        for feature_cols in self._feature_col_lists():
            self._extend_unique(feature_cols, self.stat_cols[1:-6])
            self._extend_unique(feature_cols, team_stat_cols[1:])

        print('done.')

        print('preprocess ... ', end="")

        ## seasons: every date column after the first column becomes a YYYYMMDD
        ## integer; missing dates become 0.
        seasons = seasons.fillna('0000-00-00')
        for c_ in seasons.columns[1:]:
            seasons[c_] = seasons[c_].str.replace('-', '').astype(int)

        ## players
        players['DOY'] = pd.to_datetime(players['DOB'], format="%Y-%m-%d").dt.year
        players['mlbDebutYear'] = pd.to_datetime(players['mlbDebutDate'], format="%Y-%m-%d").dt.year
        players['DebutAge'] = players['mlbDebutYear'] - players['DOY']
        players['mlbDebutDate'] = pd.to_numeric(players['mlbDebutDate'].str.replace('-', ''), errors="coerce")

        print('done.')

        print('creat feature ... ', end="")
        # creat feature
        train = targets[self.targets_cols].merge(players[self.players_cols], on=['playerId'], how='left')
        train = train.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        train = train.merge(scores[self.scores_cols], on=['playerId', 'date'], how='left')
        train = train.merge(player_target_stats, how='inner', left_on=["playerId"], right_on=["playerId"])
        train = train.merge(standings[self.standings_cols], on=['teamId', 'date'], how='left')
        train = train.merge(team_target_stats, how='left', left_on=["teamId"], right_on=["playerId"], suffixes=('', 'team_'))
        # NOTE(review): a player with several award rows on one date would be
        # duplicated by this join — verify against the awards table.
        train = train.merge(awards, on=['playerId', 'date'], how='left')

        date_ = pd.to_datetime(train['date'], format="%Y%m%d")
        # Days elapsed since 1 January of the row's year.
        train['annual_day'] = (date_ - pd.to_datetime(date_.dt.year, format="%Y")) / timedelta(days=1)
        train['week_day'] = date_.dt.weekday
        train['month'] = date_.dt.month
        train['year'] = date_.dt.year
        train['mlbDebutDateflag'] = (train['mlbDebutDate'] == train['date']) * 1
        train['sincemlbDebutDateflag'] = (train['date'] >= train['mlbDebutDate']) * 1
        # Difference of two YYYYMMDD numbers, not a number of days.
        train['diffmlbDebutDateflag'] = (train['date'] - train['mlbDebutDate'])

        # label encoding: codes follow the order of first appearance in `train`.
        player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
        position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
        birthCityn2num = {c: i for i, c in enumerate(train['birthCity'].unique())}
        teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
        status2num = {c: i for i, c in enumerate(train['status'].unique())}
        train['label_playerId'] = train['playerId'].map(player2num)
        train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
        train['label_birthCity'] = train['birthCity'].map(birthCityn2num)
        train['label_teamId'] = train['teamId'].map(teamid2num)
        train['label_status'] = train['status'].map(status2num)
        # 1 when the row has NO matching award (awardSeason is missing).
        # NOTE(review): the name suggests the opposite polarity — kept as is.
        train['award_flag'] = train['awardSeason'].isna()*1

        ## season_info: 1 = pre-season, 2 = regular season, 3 = post-season,
        ## 4 = special day; later assignments override earlier ones.
        on_preseason_idxes = extract_season(train['date'], seasons[['preSeasonStartDate', 'preSeasonEndDate']])
        on_season_idxes = extract_season(train['date'], seasons[['regularSeasonStartDate', 'regularSeasonEndDate']]) * 2
        on_postseason_idxes = extract_season(train['date'], seasons[['postSeasonStartDate', 'postSeasonEndDate']]) * 3

        special_days = seasons['lastDate1stHalf'].to_list() + seasons['allStarDate'].to_list() + seasons['firstDate2ndHalf'].to_list()
        # Membership test: a date listed more than once is still flagged.
        is_special_day = train['date'].isin(special_days)

        on_total_season_idxes = on_preseason_idxes
        on_total_season_idxes[on_season_idxes == 2] = 2
        on_total_season_idxes[on_postseason_idxes == 3] = 3
        on_total_season_idxes[is_special_day] = 4

        train['season_info'] = on_total_season_idxes

        ## only on season
        on_whole_idxes = extract_season(train['date'], seasons[['seasonStartDate', 'seasonEndDate']])
        train = train[on_whole_idxes == 1].reset_index(drop=True)

        print('done.')

        # The feature lists are stored by reference, so the timeline features
        # appended below are visible through this dict as well.
        train_features_dict = {'players': players,
                               'seasons': seasons,
                               'player_target_stats': player_target_stats,
                               'team_target_stats': team_target_stats,
                               'player2num': player2num,
                               'position2num': position2num,
                               'birthCityn2num': birthCityn2num,
                               'teamid2num': teamid2num,
                               'status2num': status2num,
                               'feature_cols1': self.feature_cols1,
                               'feature_cols2': self.feature_cols2,
                               'feature_cols3': self.feature_cols3,
                               'feature_cols4': self.feature_cols4,
                               'rosters_cols_all': list(rosters.columns),
                               'scores_cols_all': list(scores.columns),
                               'standings_cols_all': list(standings.columns),
                               'awards_cols_all': list(awards.columns)
                              }

        if self.usetimelinefeature:
            ## game_info: a row counts as a game day when it has a battingOrder.
            train, train_last_game = self._add_days_since_feature(
                train, 'battingOrder', 'gameday', 'daysSinceLastGame', 'lastdate')
            train_features_dict['train_last_game'] = train_last_game

            ## rosters_info: a row counts as a roster day when it has a status.
            train, train_last_roster = self._add_days_since_feature(
                train, 'status', 'rosterday', 'daysSinceLastRoster', 'lastroster')
            train_features_dict['train_last_roster'] = train_last_roster

            for feature_cols in self._feature_col_lists():
                self._extend_unique(feature_cols, ['daysSinceLastGame', 'daysSinceLastRoster'])

        return train, train_features_dict

    def train_and_evaluate(self, train, isgamedayonly=False, n_trials=100):
        """Run one Optuna hyper-parameter search per target.

        Rows dated before ``VALID_START`` are used for fitting and all other rows
        for validation. Target ``k`` is tuned on ``feature_cols<k>``. Every study
        uses a TPE sampler seeded with the notebook-level ``SEED`` so that a
        re-run samples the same parameter sequence.

        Args:
            train: Feature frame returned by ``make_feature``.
            isgamedayonly: When true, keep only rows with ``gameday == 1``.
            n_trials: Number of Optuna trials per target.

        Returns:
            The tuple ``(study1, study2, study3, study4)``.

        Raises:
            KeyError: If ``isgamedayonly`` is set but ``train`` has no ``gameday``
                column (it is only created with ``usetimelinefeature=True``).
        """
        if isgamedayonly:
            if 'gameday' not in train.columns:
                raise KeyError("gameday column is missing; build the features with usetimelinefeature=True")
            train = train[train['gameday'] == 1].reset_index(drop=True)

        train_y = train[['target1', 'target2', 'target3', 'target4']]

        tr_idx = (train['date'].astype(int) < self.VALID_START)
        val_idx = ~tr_idx

        x_train = train.loc[tr_idx].reset_index(drop=True)
        y_train = train_y.loc[tr_idx].reset_index(drop=True)
        x_valid = train.loc[val_idx].reset_index(drop=True)
        y_valid = train_y.loc[val_idx].reset_index(drop=True)

        studies = []
        for target_no, feature_cols in enumerate(self._feature_col_lists(), start=1):
            target = f'target{target_no}'
            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=SEED),
            )
            study.optimize(
                fit_lgbm(x_train[feature_cols], y_train[target],
                         x_valid[feature_cols], y_valid[target]),
                n_trials=n_trials,
            )
            studies.append(study)

        return tuple(studies)

In [17]:
# Build the trainer with timeline features switched on, then run its feature
# pipeline over `train_elements_dict` (defined in an earlier cell).
# The progress text shown under this cell (target stats, preprocessing,
# feature creation) is emitted while these two statements run.
# `train` is the frame passed to train_and_evaluate in the next cell, which
# reads its 'date', 'gameday' and 'target1'..'target4' columns.
# NOTE(review): `train_features_dict` is presumably the per-source feature
# tables produced alongside `train` — confirm against make_feature.
rt4kaido_train = Rt4kaidoTrain(usetimelinefeature=True)
train, train_features_dict = rt4kaido_train.make_feature(train_elements_dict)

calc target stat ... 

100%|██████████| 2061/2061 [00:46<00:00, 44.71it/s]
100%|██████████| 30/30 [00:00<00:00, 41.93it/s]


done.
preprocess ... done.
creat feature ... done.


In [None]:
# Run the Optuna hyper-parameter search. train_and_evaluate creates one study
# per target (target1..target4, each with direction='maximize') and runs 100
# trials per study on a time-based split: rows with date < 20210401 are used
# for training, the remaining rows for validation.
# isgamedayonly=False keeps every row instead of only those with gameday == 1.
# NOTE: this cell is long-running — the log below shows individual trials
# taking roughly half a minute to two minutes each.
study1, study2, study3, study4 = rt4kaido_train.train_and_evaluate(train, isgamedayonly=False)

[32m[I 2021-07-31 02:02:48,510][0m A new study created in memory with name: no-name-0de3f52f-f9fd-44d1-86bf-73ef530136e2[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.615782	valid_1's l1: 0.655339
[200]	training's l1: 0.609355	valid_1's l1: 0.647056
[300]	training's l1: 0.608204	valid_1's l1: 0.645354
[400]	training's l1: 0.605633	valid_1's l1: 0.641472
[500]	training's l1: 0.604924	valid_1's l1: 0.640355
[600]	training's l1: 0.603417	valid_1's l1: 0.637968
[700]	training's l1: 0.6028	valid_1's l1: 0.637098
[800]	training's l1: 0.601549	valid_1's l1: 0.635987
[900]	training's l1: 0.600908	valid_1's l1: 0.635151
[1000]	training's l1: 0.599576	valid_1's l1: 0.63306
[1100]	training's l1: 0.598647	valid_1's l1: 0.631369
[1200]	training's l1: 0.597713	valid_1's l1: 0.629921
[1300]	training's l1: 0.597425	valid_1's l1: 0.62948
[1400]	training's l1: 0.597143	valid_1's l1: 0.629324
[1500]	training's l1: 0.596943	valid_1's l1: 0.629152
[1600]	training's l1: 0.596672	valid_1's l1: 0.628975
[1700]	training's l1: 0.596479	valid_1's l1: 0.628868
[1800]	training's l1: 0.595992	va

[32m[I 2021-07-31 02:04:31,871][0m Trial 0 finished with value: -0.6265941701196592 and parameters: {'max_depth': 4, 'min_child_weight': 11, 'subsample': 0.6, 'colsample_bytree': 0.7, 'reg_lambda': 0.08637672735574023, 'reg_alpha': 0.09098155986337521, 'feature_fraction': 0.9899377742090449, 'bagging_fraction': 0.8713914162677172, 'bagging_freq': 17, 'num_leaves': 304, 'min_child_samples': 91}. Best is trial 0 with value: -0.6265941701196592.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.600638	valid_1's l1: 0.638294
[200]	training's l1: 0.598293	valid_1's l1: 0.63567
[300]	training's l1: 0.597564	valid_1's l1: 0.635227
[400]	training's l1: 0.596108	valid_1's l1: 0.632889
[500]	training's l1: 0.595395	valid_1's l1: 0.631848
[600]	training's l1: 0.59443	valid_1's l1: 0.630949
[700]	training's l1: 0.593651	valid_1's l1: 0.630075
[800]	training's l1: 0.593235	valid_1's l1: 0.630011
Early stopping, best iteration is:
[769]	training's l1: 0.593397	valid_1's l1: 0.62992


[32m[I 2021-07-31 02:05:21,106][0m Trial 1 finished with value: -0.6297487776092456 and parameters: {'max_depth': 15, 'min_child_weight': 7, 'subsample': 0.6, 'colsample_bytree': 0.5, 'reg_lambda': 553.6474906969003, 'reg_alpha': 294.5973639270887, 'feature_fraction': 0.6091978838609617, 'bagging_fraction': 0.6178186862743644, 'bagging_freq': 14, 'num_leaves': 246, 'min_child_samples': 74}. Best is trial 0 with value: -0.6265941701196592.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.585563	valid_1's l1: 0.624557
[200]	training's l1: 0.581165	valid_1's l1: 0.621646
[300]	training's l1: 0.577095	valid_1's l1: 0.620857
[400]	training's l1: 0.570491	valid_1's l1: 0.620033
[500]	training's l1: 0.563386	valid_1's l1: 0.61879
[600]	training's l1: 0.558946	valid_1's l1: 0.618972
Early stopping, best iteration is:
[526]	training's l1: 0.561098	valid_1's l1: 0.618707


[32m[I 2021-07-31 02:06:27,237][0m Trial 2 finished with value: -0.6184027729146927 and parameters: {'max_depth': 13, 'min_child_weight': 12, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_lambda': 6.8783402579314314, 'reg_alpha': 0.2033644766268691, 'feature_fraction': 0.9878789645035975, 'bagging_fraction': 0.6602886451811982, 'bagging_freq': 4, 'num_leaves': 631, 'min_child_samples': 25}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.598261	valid_1's l1: 0.634992
[200]	training's l1: 0.594925	valid_1's l1: 0.630405
[300]	training's l1: 0.592051	valid_1's l1: 0.626943
[400]	training's l1: 0.590363	valid_1's l1: 0.626512
[500]	training's l1: 0.588921	valid_1's l1: 0.625815
[600]	training's l1: 0.586302	valid_1's l1: 0.624804
[700]	training's l1: 0.58397	valid_1's l1: 0.6235
[800]	training's l1: 0.581327	valid_1's l1: 0.622837
[900]	training's l1: 0.578217	valid_1's l1: 0.622317
[1000]	training's l1: 0.575707	valid_1's l1: 0.62218
Early stopping, best iteration is:
[975]	training's l1: 0.576231	valid_1's l1: 0.622057


[32m[I 2021-07-31 02:07:22,810][0m Trial 3 finished with value: -0.6216142138316754 and parameters: {'max_depth': 10, 'min_child_weight': 20, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_lambda': 177.05403857009262, 'reg_alpha': 0.7024580165225603, 'feature_fraction': 0.5584500529922273, 'bagging_fraction': 0.3985856919070816, 'bagging_freq': 15, 'num_leaves': 241, 'min_child_samples': 36}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.586552	valid_1's l1: 0.627265
[200]	training's l1: 0.584529	valid_1's l1: 0.625265
[300]	training's l1: 0.581754	valid_1's l1: 0.62358
[400]	training's l1: 0.577379	valid_1's l1: 0.622447
[500]	training's l1: 0.575246	valid_1's l1: 0.622579
Early stopping, best iteration is:
[423]	training's l1: 0.576881	valid_1's l1: 0.622319


[32m[I 2021-07-31 02:07:49,314][0m Trial 4 finished with value: -0.6220370881852941 and parameters: {'max_depth': 10, 'min_child_weight': 12, 'subsample': 0.6, 'colsample_bytree': 0.7, 'reg_lambda': 1.3597595250166703, 'reg_alpha': 0.020174808357093777, 'feature_fraction': 0.4868260487377721, 'bagging_fraction': 0.34850824318353607, 'bagging_freq': 17, 'num_leaves': 905, 'min_child_samples': 51}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.602487	valid_1's l1: 0.640902
[200]	training's l1: 0.598634	valid_1's l1: 0.636
[300]	training's l1: 0.598386	valid_1's l1: 0.635622
[400]	training's l1: 0.59705	valid_1's l1: 0.634357
[500]	training's l1: 0.596496	valid_1's l1: 0.633764
[600]	training's l1: 0.596147	valid_1's l1: 0.633654
[700]	training's l1: 0.594007	valid_1's l1: 0.631247
[800]	training's l1: 0.591964	valid_1's l1: 0.628995
[900]	training's l1: 0.591579	valid_1's l1: 0.628958
Early stopping, best iteration is:
[830]	training's l1: 0.591809	valid_1's l1: 0.628861


[32m[I 2021-07-31 02:08:59,749][0m Trial 5 finished with value: -0.6287278034597471 and parameters: {'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_lambda': 0.04958412431282435, 'reg_alpha': 169.10884229487104, 'feature_fraction': 0.5389590296141149, 'bagging_fraction': 0.9789781909641766, 'bagging_freq': 12, 'num_leaves': 376, 'min_child_samples': 39}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.648337	valid_1's l1: 0.695793
[200]	training's l1: 0.646536	valid_1's l1: 0.692955
[300]	training's l1: 0.643712	valid_1's l1: 0.688468
[400]	training's l1: 0.642902	valid_1's l1: 0.687331
[500]	training's l1: 0.641031	valid_1's l1: 0.683789
[600]	training's l1: 0.636153	valid_1's l1: 0.677585
[700]	training's l1: 0.634021	valid_1's l1: 0.674384
[800]	training's l1: 0.632204	valid_1's l1: 0.671739
[900]	training's l1: 0.632132	valid_1's l1: 0.671649
[1000]	training's l1: 0.631048	valid_1's l1: 0.670298
[1100]	training's l1: 0.63081	valid_1's l1: 0.67013
[1200]	training's l1: 0.630763	valid_1's l1: 0.67007
[1300]	training's l1: 0.629927	valid_1's l1: 0.668682
[1400]	training's l1: 0.629178	valid_1's l1: 0.667733
[1500]	training's l1: 0.628131	valid_1's l1: 0.666509
[1600]	training's l1: 0.627997	valid_1's l1: 0.666448
[1700]	training's l1: 0.627701	valid_1's l1: 0.666108
[1800]	training's l1: 0.627227	v

[32m[I 2021-07-31 02:10:11,120][0m Trial 6 finished with value: -0.6628611615442653 and parameters: {'max_depth': 2, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.9, 'reg_lambda': 613.9316604867469, 'reg_alpha': 476.23006817425966, 'feature_fraction': 0.6554805082249651, 'bagging_fraction': 0.6527663844756415, 'bagging_freq': 9, 'num_leaves': 982, 'min_child_samples': 84}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.648268	valid_1's l1: 0.696008
[200]	training's l1: 0.644075	valid_1's l1: 0.689385
[300]	training's l1: 0.641799	valid_1's l1: 0.686065
[400]	training's l1: 0.641053	valid_1's l1: 0.684622
[500]	training's l1: 0.640077	valid_1's l1: 0.682776
[600]	training's l1: 0.639973	valid_1's l1: 0.682686
[700]	training's l1: 0.639224	valid_1's l1: 0.681428
[800]	training's l1: 0.638674	valid_1's l1: 0.680602
[900]	training's l1: 0.63774	valid_1's l1: 0.67889
[1000]	training's l1: 0.637711	valid_1's l1: 0.678824
[1100]	training's l1: 0.636136	valid_1's l1: 0.67702
[1200]	training's l1: 0.633318	valid_1's l1: 0.67293
[1300]	training's l1: 0.631167	valid_1's l1: 0.669708
[1400]	training's l1: 0.630003	valid_1's l1: 0.667892
[1500]	training's l1: 0.62935	valid_1's l1: 0.667006
[1600]	training's l1: 0.628684	valid_1's l1: 0.665894
[1700]	training's l1: 0.628142	valid_1's l1: 0.665113
[1800]	training's l1: 0.627471	val

[32m[I 2021-07-31 02:11:17,341][0m Trial 7 finished with value: -0.6643088892433565 and parameters: {'max_depth': 2, 'min_child_weight': 8, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_lambda': 552.4535220226975, 'reg_alpha': 0.05555571498062016, 'feature_fraction': 0.6945709707798722, 'bagging_fraction': 0.8736058086368363, 'bagging_freq': 5, 'num_leaves': 492, 'min_child_samples': 8}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.579021	valid_1's l1: 0.622845
[200]	training's l1: 0.57452	valid_1's l1: 0.621911
[300]	training's l1: 0.56987	valid_1's l1: 0.622216
Early stopping, best iteration is:
[247]	training's l1: 0.572508	valid_1's l1: 0.62144


[32m[I 2021-07-31 02:11:59,167][0m Trial 8 finished with value: -0.6210863509429909 and parameters: {'max_depth': 19, 'min_child_weight': 6, 'subsample': 0.8, 'colsample_bytree': 0.7, 'reg_lambda': 0.10740212683038969, 'reg_alpha': 0.013169568451018975, 'feature_fraction': 0.5706982588829461, 'bagging_fraction': 0.34423214660771784, 'bagging_freq': 1, 'num_leaves': 989, 'min_child_samples': 32}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.590235	valid_1's l1: 0.629282
[200]	training's l1: 0.586028	valid_1's l1: 0.62536
[300]	training's l1: 0.585177	valid_1's l1: 0.62475
[400]	training's l1: 0.584503	valid_1's l1: 0.623982
[500]	training's l1: 0.583565	valid_1's l1: 0.623787
[600]	training's l1: 0.582636	valid_1's l1: 0.623285
[700]	training's l1: 0.581014	valid_1's l1: 0.622076
[800]	training's l1: 0.579877	valid_1's l1: 0.621655
[900]	training's l1: 0.578798	valid_1's l1: 0.62083
[1000]	training's l1: 0.57789	valid_1's l1: 0.62053
[1100]	training's l1: 0.576405	valid_1's l1: 0.620074
[1200]	training's l1: 0.575247	valid_1's l1: 0.620389
Early stopping, best iteration is:
[1100]	training's l1: 0.576405	valid_1's l1: 0.620074


[32m[I 2021-07-31 02:13:02,873][0m Trial 9 finished with value: -0.6197806337073454 and parameters: {'max_depth': 9, 'min_child_weight': 10, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_lambda': 1.6004101588986503, 'reg_alpha': 0.0038601298875795493, 'feature_fraction': 0.8001616716424675, 'bagging_fraction': 0.4674338143085763, 'bagging_freq': 4, 'num_leaves': 384, 'min_child_samples': 56}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.585294	valid_1's l1: 0.626653
[200]	training's l1: 0.579816	valid_1's l1: 0.623458
[300]	training's l1: 0.577171	valid_1's l1: 0.62287
[400]	training's l1: 0.574113	valid_1's l1: 0.622165
[500]	training's l1: 0.570438	valid_1's l1: 0.622169
[600]	training's l1: 0.567304	valid_1's l1: 0.621676
[700]	training's l1: 0.564565	valid_1's l1: 0.621394
Early stopping, best iteration is:
[675]	training's l1: 0.565014	valid_1's l1: 0.621252


[32m[I 2021-07-31 02:14:10,208][0m Trial 10 finished with value: -0.6209318051655174 and parameters: {'max_depth': 15, 'min_child_weight': 16, 'subsample': 0.5, 'colsample_bytree': 0.6, 'reg_lambda': 0.00125765707081413, 'reg_alpha': 12.681992077599057, 'feature_fraction': 0.3386625569884528, 'bagging_fraction': 0.7316473844564899, 'bagging_freq': 8, 'num_leaves': 714, 'min_child_samples': 8}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.585965	valid_1's l1: 0.625168
[200]	training's l1: 0.579914	valid_1's l1: 0.62177
[300]	training's l1: 0.576172	valid_1's l1: 0.621231
[400]	training's l1: 0.574234	valid_1's l1: 0.620572
[500]	training's l1: 0.571765	valid_1's l1: 0.620361
[600]	training's l1: 0.568788	valid_1's l1: 0.620407
Early stopping, best iteration is:
[502]	training's l1: 0.571687	valid_1's l1: 0.620204


[32m[I 2021-07-31 02:15:25,516][0m Trial 11 finished with value: -0.6199215411446811 and parameters: {'max_depth': 14, 'min_child_weight': 15, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_lambda': 5.337903316054354, 'reg_alpha': 0.0011181713531662229, 'feature_fraction': 0.9296310911782587, 'bagging_fraction': 0.4945786276822872, 'bagging_freq': 1, 'num_leaves': 670, 'min_child_samples': 62}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.603519	valid_1's l1: 0.639381
[200]	training's l1: 0.601158	valid_1's l1: 0.636569
[300]	training's l1: 0.600843	valid_1's l1: 0.636447
[400]	training's l1: 0.599324	valid_1's l1: 0.633858
[500]	training's l1: 0.59854	valid_1's l1: 0.632811
[600]	training's l1: 0.597128	valid_1's l1: 0.631237
[700]	training's l1: 0.595424	valid_1's l1: 0.629637
[800]	training's l1: 0.594005	valid_1's l1: 0.627285
[900]	training's l1: 0.59292	valid_1's l1: 0.626235
[1000]	training's l1: 0.5917	valid_1's l1: 0.625564
[1100]	training's l1: 0.590747	valid_1's l1: 0.625044
[1200]	training's l1: 0.58878	valid_1's l1: 0.623923
[1300]	training's l1: 0.587724	valid_1's l1: 0.623028
[1400]	training's l1: 0.586568	valid_1's l1: 0.622517
[1500]	training's l1: 0.585943	valid_1's l1: 0.622145
[1600]	training's l1: 0.585373	valid_1's l1: 0.621786
Early stopping, best iteration is:
[1544]	training's l1: 0.585618	valid_1's l1: 0.621698

[32m[I 2021-07-31 02:16:09,443][0m Trial 12 finished with value: -0.621416410732455 and parameters: {'max_depth': 6, 'min_child_weight': 14, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 24.377750942222562, 'reg_alpha': 0.0010682797245818987, 'feature_fraction': 0.8425276844976527, 'bagging_fraction': 0.21751999345247947, 'bagging_freq': 3, 'num_leaves': 637, 'min_child_samples': 20}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.617818	valid_1's l1: 0.656961
[200]	training's l1: 0.616604	valid_1's l1: 0.655524
[300]	training's l1: 0.612839	valid_1's l1: 0.650219
[400]	training's l1: 0.610595	valid_1's l1: 0.646755
[500]	training's l1: 0.607198	valid_1's l1: 0.641587
[600]	training's l1: 0.605644	valid_1's l1: 0.639417
[700]	training's l1: 0.602765	valid_1's l1: 0.636463
[800]	training's l1: 0.601027	valid_1's l1: 0.634335
[900]	training's l1: 0.59837	valid_1's l1: 0.631927
[1000]	training's l1: 0.596162	valid_1's l1: 0.629628
[1100]	training's l1: 0.595558	valid_1's l1: 0.629116
[1200]	training's l1: 0.59415	valid_1's l1: 0.628015
[1300]	training's l1: 0.592993	valid_1's l1: 0.626608
[1400]	training's l1: 0.59201	valid_1's l1: 0.62585
[1500]	training's l1: 0.591259	valid_1's l1: 0.625685
[1600]	training's l1: 0.590925	valid_1's l1: 0.625436
[1700]	training's l1: 0.590511	valid_1's l1: 0.624984
[1800]	training's l1: 0.589614	va

[32m[I 2021-07-31 02:17:18,494][0m Trial 13 finished with value: -0.6242358690059697 and parameters: {'max_depth': 13, 'min_child_weight': 18, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_lambda': 21.68468211856283, 'reg_alpha': 0.8424559879895793, 'feature_fraction': 0.8108013534706288, 'bagging_fraction': 0.49559437821034236, 'bagging_freq': 6, 'num_leaves': 51, 'min_child_samples': 57}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.590007	valid_1's l1: 0.628416
[200]	training's l1: 0.587713	valid_1's l1: 0.626477
[300]	training's l1: 0.587052	valid_1's l1: 0.626342
[400]	training's l1: 0.58567	valid_1's l1: 0.62562
[500]	training's l1: 0.584118	valid_1's l1: 0.624064
[600]	training's l1: 0.581365	valid_1's l1: 0.623349
[700]	training's l1: 0.578721	valid_1's l1: 0.622479
[800]	training's l1: 0.575794	valid_1's l1: 0.62159
[900]	training's l1: 0.570605	valid_1's l1: 0.620759
[1000]	training's l1: 0.564068	valid_1's l1: 0.620559
[1100]	training's l1: 0.557607	valid_1's l1: 0.620628
[1200]	training's l1: 0.552172	valid_1's l1: 0.620717
Early stopping, best iteration is:
[1162]	training's l1: 0.554657	valid_1's l1: 0.620487


[32m[I 2021-07-31 02:19:26,377][0m Trial 14 finished with value: -0.6201124711634325 and parameters: {'max_depth': 19, 'min_child_weight': 10, 'subsample': 0.9, 'colsample_bytree': 0.6, 'reg_lambda': 0.5599907897224119, 'reg_alpha': 0.005914242962169384, 'feature_fraction': 0.8157066178150348, 'bagging_fraction': 0.7714815179734043, 'bagging_freq': 4, 'num_leaves': 554, 'min_child_samples': 21}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.583297	valid_1's l1: 0.623795
[200]	training's l1: 0.578506	valid_1's l1: 0.621473
[300]	training's l1: 0.574954	valid_1's l1: 0.621191
[400]	training's l1: 0.572306	valid_1's l1: 0.62051
[500]	training's l1: 0.568963	valid_1's l1: 0.6203
[600]	training's l1: 0.567328	valid_1's l1: 0.620525
Early stopping, best iteration is:
[521]	training's l1: 0.568682	valid_1's l1: 0.620194


[32m[I 2021-07-31 02:20:21,780][0m Trial 15 finished with value: -0.6199192300479605 and parameters: {'max_depth': 12, 'min_child_weight': 3, 'subsample': 0.5, 'colsample_bytree': 0.6, 'reg_lambda': 0.006516237180317465, 'reg_alpha': 4.369122047202967, 'feature_fraction': 0.9970547226855334, 'bagging_fraction': 0.48408925922260326, 'bagging_freq': 7, 'num_leaves': 837, 'min_child_samples': 48}. Best is trial 2 with value: -0.6184027729146927.[0m


Training until validation scores don't improve for 100 rounds


In [23]:
# Summary of the target1 search. The study was created with
# direction='maximize' and its trial values are negative, so the sign is
# flipped to print a positive score.
# NOTE(review): the value looks like the validation L1/MAE — confirm against fit_lgbm.
print(f'best_score = {-study1.best_value}')
# Hyper-parameters of the winning trial, left as the last expression so the
# notebook renders the dict.
study1.best_params

best_score = 0.6180401083217533


{'max_depth': 9,
 'min_child_weight': 9,
 'subsample': 0.7,
 'colsample_bytree': 0.9,
 'reg_lambda': 109.49725993851872,
 'reg_alpha': 0.2842054931824245,
 'feature_fraction': 0.7470211486482958,
 'bagging_fraction': 0.7586513780339648,
 'bagging_freq': 7,
 'num_leaves': 434,
 'min_child_samples': 48}

In [24]:
# Summary of the target2 search. The study was created with
# direction='maximize' and the printed score is positive only after negation,
# so the sign is flipped for display.
# NOTE(review): the value looks like the validation L1/MAE — confirm against fit_lgbm.
print(f'best_score = {-study2.best_value}')
# Hyper-parameters of the winning trial, left as the last expression so the
# notebook renders the dict.
study2.best_params

best_score = 1.2181516540452384


{'max_depth': 16,
 'min_child_weight': 14,
 'subsample': 0.8,
 'colsample_bytree': 0.6,
 'reg_lambda': 0.05904093176873739,
 'reg_alpha': 0.14959954586688842,
 'feature_fraction': 0.8826130697812171,
 'bagging_fraction': 0.45193998198950835,
 'bagging_freq': 10,
 'num_leaves': 972,
 'min_child_samples': 54}

In [25]:
# Summary of the target3 search. The study was created with
# direction='maximize' and the printed score is positive only after negation,
# so the sign is flipped for display.
# NOTE(review): the value looks like the validation L1/MAE — confirm against fit_lgbm.
print(f'best_score = {-study3.best_value}')
# Hyper-parameters of the winning trial, left as the last expression so the
# notebook renders the dict.
study3.best_params

best_score = 0.5133204252104776


{'max_depth': 9,
 'min_child_weight': 6,
 'subsample': 0.7,
 'colsample_bytree': 0.6,
 'reg_lambda': 0.05877749819391776,
 'reg_alpha': 0.0038642165449063654,
 'feature_fraction': 0.3062434024695542,
 'bagging_fraction': 0.2965667421406502,
 'bagging_freq': 20,
 'num_leaves': 392,
 'min_child_samples': 47}

In [26]:
# Summary of the target4 search. The study was created with
# direction='maximize' and the printed score is positive only after negation,
# so the sign is flipped for display.
# NOTE(review): the value looks like the validation L1/MAE — confirm against fit_lgbm.
print(f'best_score = {-study4.best_value}')
# Hyper-parameters of the winning trial, left as the last expression so the
# notebook renders the dict.
study4.best_params

best_score = 0.9780929550957058


{'max_depth': 9,
 'min_child_weight': 16,
 'subsample': 0.9,
 'colsample_bytree': 0.9,
 'reg_lambda': 110.02569202000056,
 'reg_alpha': 0.06097900312536251,
 'feature_fraction': 0.9428289620551138,
 'bagging_fraction': 0.3437630483550768,
 'bagging_freq': 9,
 'num_leaves': 315,
 'min_child_samples': 35}