In [1]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)
from functools import reduce
import optuna

import pickle
from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta

from sklearn.metrics import mean_absolute_error

import statistics as st
import lightgbm as lgbm
from scipy.stats import norm

In [2]:
sys.path.append('../../')
import src.utils as utils

## Param

In [3]:
EXP_NUM = 68
NFOLDS = 5
SEED = 42
LAGS = list(range(1, 21))

In [4]:
utils.set_seed(SEED)

## Dir

In [5]:
DATA_DIR = Path("/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/")
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = MAIN_DATA_DIR / 'train'
OUTPUT_DIR = Path('./output/')

In [6]:
players = pd.read_csv(MAIN_DATA_DIR / 'players.csv')

rosters = pd.read_csv(TRAIN_DIR / 'rosters_train.csv')
targets = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv')
scores = pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv')
scores = scores.groupby(['playerId', 'date']).sum().reset_index()
seasons = pd.read_csv(MAIN_DATA_DIR / 'seasons.csv')
salaries = pd.read_csv(MAIN_DATA_DIR / 'mlbSalaries.csv')
teams = pd.read_csv(MAIN_DATA_DIR / 'teams.csv')

standings = pd.read_csv(TRAIN_DIR / 'standings_train.csv')
playerTwitterFollowers = pd.read_csv(TRAIN_DIR / 'playerTwitterFollowers_train.csv')

# events = pd.read_csv(TRAIN_DIR / 'events_train.csv')
# events = events.groupby(['gameDate']).sum().reset_index()


In [7]:
train_elements_dict = {"players":players, 
                       "rosters":rosters, 
                       "targets":targets, 
                       "scores":scores, 
                       "seasons":seasons, 
                       "teams":teams, 
                       "standings":standings}

In [8]:
def map_team_name(name):
    names = name.split('-')
    result = ''
    for n in names:
        if n == 'st':
            n = 'st.'
        result += f' {n.capitalize()}'

    return result[1:]

In [9]:
def calc_corr(df):
    # 相関係数行列を作成
    corr_mat = df.corr(method='pearson')

    # 行（列）サイズを取得
    n = corr_mat.shape[0]
    corr_ary = []

    for i in range(n):
        for j in range(i):
            if i == j:
                continue
            corr_ary.append(corr_mat.iloc[i,j])

    return corr_ary

In [10]:
def calc_probs(pid,df,temp):
    to_append=[pid,'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','']
    targets=['target1','target2','target3','target4']
    z=1
    for target in targets:
        target_prob = temp[target].tolist()
        mean = np.mean(target_prob)
        std = np.std(target_prob)
        median = st.median(target_prob)
        distribution = norm(mean, std)
        min_weight = min(target_prob)
        max_weight = max(target_prob)
        values = list(np.linspace(min_weight, max_weight))
        probabilities = [distribution.pdf(v) for v in values]
        max_value = max(probabilities)
        max_index = probabilities.index(max_value)
        to_append[z]=mean
        to_append[z+1]=median
        to_append[z+2]=std
        to_append[z+3]=min_weight
        to_append[z+4]=max_weight
        to_append[z+5]=temp[target].skew()
        to_append[z+6]=temp[target].kurt()

        z=z+7
    corr_ = calc_corr(temp[['target1', 'target2', 'target3', 'target4']])
    to_append[z:] = corr_  
    df_length = len(df)
    df.loc[df_length] = to_append
    return df

In [11]:
def count_consecutive_items_n_cols(df, col_name_list, output_col):
    cum_sum_list = [
        (df[col_name] != df[col_name].shift(1)).cumsum().tolist() for col_name in col_name_list
    ]
    df[output_col] = df.groupby(
        ["_".join(map(str, x)) for x in zip(*cum_sum_list)]
    ).cumcount() + 1
    return df

In [12]:
def extract_season(date_raw, season_start_end):
    idxes = 0
    for raw in season_start_end.iloc():
        idx_ = ((date_raw >= raw.iloc[0]) & (date_raw <= raw.iloc[1])) * 1
        idxes += idx_
    return idxes

In [13]:
# def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
#     oof_pred = np.zeros(len(y_valid), dtype=np.float32)
#     model = lgbm.LGBMRegressor(**params)
#     model.fit(x_train, y_train, 
#         eval_set=[(x_valid, y_valid)],  
#         early_stopping_rounds=verbose, 
#         verbose=verbose)
#     oof_pred = model.predict(x_valid)
#     oof_pred = np.clip(oof_pred, 0, 100)
#     score = mean_absolute_error(oof_pred, y_valid)
#     print('mae:', score)
#     return oof_pred, model, score

In [14]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, verbose=100):
    def opt(trial):
        params = {
                'random_state': SEED,
                'objective':'mae',
                'n_estimators': 10000,
                'learning_rate': 0.1,
                'max_depth': trial.suggest_int('max_depth', 1, 20),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
                'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
                'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e3),
                'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e3),
                'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
                'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
        }

        model_opt = lgbm.LGBMRegressor(**params)

        model_opt.fit(x_train, y_train, 
            eval_set=[(x_train, y_train), (x_valid, y_valid)],  
            early_stopping_rounds=verbose, 
            verbose=verbose)
        oof_pred = model_opt.predict(x_valid)
        oof_pred = np.clip(oof_pred, 0, 100)
        score = mean_absolute_error(oof_pred, y_valid)
        return -score
    return opt

In [15]:
def calc_target_stat(targets_train, stat_cols, tgt_column='playerId'):
    
    tgtId_list = targets_train[tgt_column].dropna().unique()
    target_probs = pd.DataFrame(columns = stat_cols)  
    for pid in tqdm(tgtId_list):
        temp = targets_train[targets_train[tgt_column] == pid]
        target_stats=calc_probs(pid, target_probs, temp)
        
    tgt_stat_cols = stat_cols
    tgt_stat_cols = tgt_stat_cols[:1] + [tgt_column + "_" + word for word in tgt_stat_cols[1:]]
    target_stats.columns = tgt_stat_cols
    
    return target_stats

In [16]:
def flatten(df, col):
    du = (df.pivot(index='playerId', columns='EvalDate', 
               values=col).add_prefix(f'{col}_').
      rename_axis(None, axis=1).reset_index())
    return du


def reducer(left, right):
    return left.merge(right, on='playerId')


def make_train_lag(df, lags):
    df['EvalDate'] = pd.to_datetime(df['date'], format="%Y%m%d")
    for lag in tqdm(lags):
        dp = df[['playerId','EvalDate'] + ['target1', 'target2', 'target3', 'target4']].copy()
        dp['EvalDate']  =dp['EvalDate'] + timedelta(days=lag) 
        df = df.merge(dp, on=['playerId', 'EvalDate'], suffixes=['',f'_{lag}'], how='left')
        gc.collect()
    df = df.sort_values(by=['playerId', 'EvalDate'])
    df = df.dropna()
    return df

def make_test_lag(sub, last):
    sub['playerId'] = sub['date_playerId'].apply(lambda s: int(  s.split('_')[1]  ) )
    assert sub.date.nunique() == 1
    dte = sub['date'].unique()[0]
    
    eval_dt = pd.to_datetime(dte, format='%Y%m%d')
    dtes = [eval_dt + timedelta(days = -k) for k in LAGS]
    mp_dtes = {eval_dt + timedelta(days = -k): k for k in LAGS}
    
    sl = last.loc[last['EvalDate'].between(dtes[-1], dtes[0]), ['EvalDate','playerId'] + ['target1', 'target2', 'target3', 'target4']].copy()
    sl['EvalDate'] = sl['EvalDate'].map(mp_dtes)
    du = [flatten(sl, col) for col in ['target1', 'target2', 'target3', 'target4']]
    du = reduce(reducer, du)
    return du, eval_dt

In [17]:
class Rt4kaidoTrain:
    def __init__(self, usetimelinefeature=False):
        
        self.usetimelinefeature = usetimelinefeature
        self.targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
        self.players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight', 'playerForTestSetAndFuturePreds']
        self.rosters_cols = ['playerId', 'teamId', 'status', 'date']
        self.salaries_cols = ['teamId', 'salary', 'year']
        self.standings_cols = ['teamId', 'wildCardRank', 'sportGamesBack', 'date']
        self.transactions_cols = ['playerId', 'transaction_flag', 'date']
        self.stat_cols = ["playerId", "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
                        "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
                        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
                        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt",
                        'tgt1_2_corr', 'tgt1_3_corr', 'tgt2_3_corr', 'tgt1_4_corr', 'tgt2_4_corr', 'tgt3_4_corr']
        self.lag_cols = [f'{col}_{lag}' for lag in reversed(LAGS) for col in ['target1', 'target2', 'target3', 'target4']] + ['playerId', 'date']


        self.scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances', 'date']

        self.feature_cols1 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols2 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols3 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 

        self.feature_cols4 = ['week_day', 'annual_day', 'month', 'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_birthCity',
                        'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight',
               'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
               'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
               'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
               'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
               'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
               'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
               'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
               'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
               'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
               'groundOutsPitching', 'runsPitching', 'doublesPitching',
               'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
               'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
               'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
               'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
               'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
               'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
               'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
               'inheritedRunnersScored', 'catchersInterferencePitching',
               'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
               'assists', 'putOuts', 'errors', 'chances',
                'season_info', 'wildCardRank'] 
        
        
        # lightgbm
        self.params1 = {
                    'random_state': SEED,
                    'objective':'mae',
                    'n_estimators': 10000,
                    'learning_rate': 0.1,
                    'max_depth': 5,
                    'min_child_weight': 2,
                    'subsample': 0.8,
                    'colsample_bytree': 0.7,
                    'reg_lambda': 0.2219989029856233,
                    'reg_alpha': 0.003911382788589902,
                    'feature_fraction': 0.8972775633184858,
                    'bagging_fraction': 0.9080490178534212,
                    'bagging_freq': 1
        }


        self.params2 = {
                    'random_state': SEED,
                    'objective':'mae',
                    'n_estimators': 10000,
                    'learning_rate': 0.1,
                    'max_depth': 16,
                    'min_child_weight': 16,
                    'subsample': 0.8,
                    'colsample_bytree': 0.8,
                    'reg_lambda': 0.14788207783997104,
                    'reg_alpha': 0.03142098321958639,
                    'feature_fraction': 0.7277188813412163,
                    'bagging_fraction': 0.7968075536678682,
                    'bagging_freq': 7
        }


        self.params3 = {
                    'random_state': SEED,
                    'objective':'mae',
                    'n_estimators': 10000,
                    'learning_rate': 0.1,
                    'max_depth': 4,
                    'min_child_weight': 8,
                    'subsample': 0.9,
                    'colsample_bytree': 0.8,
                    'reg_lambda': 0.06237286017377033,
                    'reg_alpha': 0.025330255862048567,
                    'feature_fraction': 0.95826235847118,
                    'bagging_fraction': 0.613475530706581,
                    'bagging_freq': 8
        }

        self.params4 = {
                    'random_state': SEED,
                    'objective':'mae',
                    'n_estimators': 10000,
                    'learning_rate': 0.1,
                    'max_depth': 11,
                    'min_child_weight': 4,
                    'subsample': 0.7,
                    'colsample_bytree': 0.6,
                    'reg_lambda': 0.009794699765699343,
                    'reg_alpha': 2.394314077427591,
                    'feature_fraction': 0.5417549198679318,
                    'bagging_fraction': 0.5603631731851066,
                    'bagging_freq': 5
        }

    def make_feature(self, train_elements_dict):

        players = train_elements_dict['players']
        rosters = train_elements_dict['rosters']
        targets = train_elements_dict['targets']
        scores = train_elements_dict['scores']
        seasons = train_elements_dict['seasons']
        teams = train_elements_dict['teams']
        standings = train_elements_dict['standings']

        print('calc target stat ... ', end="")

        ## target stats
        targets_train = targets.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
#         targets_train = targets_train.merge(scores[self.scores_cols], on=['playerId', 'date'], how='left')
#         targets_train['gameday'] = ~targets_train['battingOrder'].isna()*1
        targets_train = targets_train[(targets_train['date'] >= 20210401)]

        player_target_stats = calc_target_stat(targets_train, self.stat_cols, tgt_column='playerId')
        player_stat_cols = list(player_target_stats.columns)
#         player_stat_cols = player_stat_cols[:1] + ['gameday' + "_" + word for word in player_stat_cols[1:]]
#         player_target_stats_gameday.columns = player_stat_cols
        self.feature_cols1 += player_stat_cols[1:-6]
        self.feature_cols2 += player_stat_cols[1:-6]
        self.feature_cols3 += player_stat_cols[1:-6]
        self.feature_cols4 += player_stat_cols[1:-6]
        
        team_target_stats = calc_target_stat(targets_train, self.stat_cols, tgt_column='teamId')
        team_stat_cols = list(team_target_stats.columns)
#         team_stat_cols = team_stat_cols[:1] + ['gameday' + "_" + word for word in team_stat_cols[1:]]
#         team_target_stats_gameday.columns = team_stat_cols
        self.feature_cols1 += team_stat_cols[1:]
        self.feature_cols2 += team_stat_cols[1:]
        self.feature_cols3 += team_stat_cols[1:]
        self.feature_cols4 += team_stat_cols[1:]
                

        print('done.')

        print('preprocess ... ', end="")
        ## salaries
        # salaries = salaries.groupby(['year', 'team']).sum()['salary'].reset_index()
        # salaries['team'] = salaries['team'].apply(map_team_name)
        # salaries = salaries.merge(teams, left_on='team', right_on='name', how='inner')
        # salaries = salaries.rename(columns={'id': 'teamId'})

        ## seasons
        seasons = seasons.fillna('0000-00-00')
        for c_ in seasons.columns[1:]:
            seasons[c_] = seasons[c_].str.replace('-', '').astype(int)

        ## players
        players['DOY'] = pd.to_datetime(players['DOB'], format="%Y-%m-%d").dt.year
        players['mlbDebutYear'] = pd.to_datetime(players['mlbDebutDate'], format="%Y-%m-%d").dt.year
        players['DebutAge'] = players['mlbDebutYear'] - players['DOY']

        print('done.')

        print('creat feature ... ', end="")
        # creat feature
        train = targets[self.targets_cols].merge(players[self.players_cols], on=['playerId'], how='left')
        train = train.merge(rosters[self.rosters_cols], on=['playerId', 'date'], how='left')
        train = train.merge(scores[self.scores_cols], on=['playerId', 'date'], how='left')
        train = train.merge(standings[self.standings_cols], on=['teamId', 'date'], how='left')
        train = train.merge(player_target_stats, how='left', left_on=["playerId"],right_on=["playerId"])
        train = train.merge(team_target_stats, how='left', left_on=["teamId"],right_on=["playerId"], suffixes=('', 'team_'))
        date_ = pd.to_datetime(train['date'], format="%Y%m%d")
        train['annual_day'] = (date_ - pd.to_datetime(date_.dt.year, format="%Y")) /  timedelta(days=1)
        train['week_day'] = date_.dt.weekday
        train['month'] = date_.dt.month
        train['year'] = date_.dt.year

        # label encoding
        player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
        position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
        birthCityn2num = {c: i for i, c in enumerate(train['birthCity'].unique())}
        teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
        status2num = {c: i for i, c in enumerate(train['status'].unique())}
        train['label_playerId'] = train['playerId'].map(player2num)
        train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
        train['label_birthCity'] = train['birthCity'].map(birthCityn2num)
        train['label_teamId'] = train['teamId'].map(teamid2num)
        train['label_status'] = train['status'].map(status2num)

        ## season_info
        on_preseason_idxes = extract_season(train['date'], seasons[['preSeasonStartDate', 'preSeasonEndDate']])
        on_season_idxes = extract_season(train['date'], seasons[['regularSeasonStartDate', 'regularSeasonEndDate']]) * 2
        on_postseason_idxes = extract_season(train['date'], seasons[['postSeasonStartDate', 'postSeasonEndDate']]) * 3

        special_days = seasons['lastDate1stHalf'].to_list() + seasons['allStarDate'].to_list() + seasons['firstDate2ndHalf'].to_list()
        special_idxes = 0
        for day in special_days:
            special_idxes += (train['date'] == day) * 4

        on_total_season_idxes = on_preseason_idxes
        on_total_season_idxes[on_season_idxes==2] = 2
        on_total_season_idxes[on_postseason_idxes==3] = 3
        on_total_season_idxes[special_idxes==4] = 4

        train['season_info'] = on_total_season_idxes

        ## only on season
        on_whole_idxes = extract_season(train['date'], seasons[['seasonStartDate', 'seasonEndDate']])
        train = train[on_whole_idxes == 1].reset_index(drop=True)

        # train = train.merge(playerTwitterFollowers, how='left', on=["playerId", 'date'])

        ## game_info
        train['gameday'] = ~train['battingOrder'].isna()*1
        train.sort_values(by=['playerId','date'],inplace=True,ascending=True)

        train=count_consecutive_items_n_cols(train,['playerId','gameday'],'daysSinceLastGame')
        train.loc[train['gameday']==1,'daysSinceLastGame'] = 0

        train_game = train[train['gameday']==1]
        train_last_game = train_game[~train_game.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
        train_last_game.columns = ['playerId', 'lastdate']
        train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
        train_last_game = pd.merge(train_player_unique, train_last_game, on=['playerId'], how='left' )
        train_last_game = train_last_game.fillna(20171231)

        
        train_features_dict = {'players': players,
                                'player_target_stats': player_target_stats,
                                'team_target_stats': team_target_stats,
                                'train_last_game': train_last_game, 
                                'player2num': player2num, 
                                'position2num': position2num, 
                                'birthCityn2num': birthCityn2num,
                                'teamid2num': teamid2num,
                                'status2num': status2num,
                                'feature_cols1': self.feature_cols1,
                                'feature_cols2': self.feature_cols2,
                                'feature_cols3': self.feature_cols3,
                                'feature_cols4': self.feature_cols4
                              }
        
        if self.usetimelinefeature:
            train_features_dict['train_last_game'] = train_last_game
            self.feature_cols1 += ['daysSinceLastGame']
            self.feature_cols2 += ['daysSinceLastGame']
            self.feature_cols3 += ['daysSinceLastGame']
            self.feature_cols4 += ['daysSinceLastGame']
            
            targets_lags = make_train_lag(targets, LAGS)
            train = train.merge(targets_lags[self.lag_cols], on=['playerId', 'date'], how='left')
            targets_lags = targets_lags[targets_lags['date'] >= 20210101].reset_index(drop=True)
            del targets_lags['index']
            train_features_dict['targets_lags'] = targets_lags
            self.feature_cols1 += [f'{col}_{lag}' for lag in reversed(LAGS) for col in ['target1', 'target2', 'target3', 'target4']]
            self.feature_cols2 += [f'{col}_{lag}' for lag in reversed(LAGS) for col in ['target1', 'target2', 'target3', 'target4']]
            self.feature_cols3 += [f'{col}_{lag}' for lag in reversed(LAGS) for col in ['target1', 'target2', 'target3', 'target4']]
            self.feature_cols4 += [f'{col}_{lag}' for lag in reversed(LAGS) for col in ['target1', 'target2', 'target3', 'target4']]
            
            print('done.')

        return train, train_features_dict
    
    def train_and_evaluate(self, train, isgamedayonly=False):
        
        if isgamedayonly:
            train = train[train['gameday'] == 1].reset_index(drop=True)
            
        train_X = train
        train_y = train[['target1', 'target2', 'target3', 'target4']]

        oof = np.zeros(train_y.shape) - 1.0
        y_valids = np.zeros(train_y.shape) - 1.0

        tr_idx = (train['date'].astype(int) < 20210401)
        val_idx = ~tr_idx

        x_train = train_X.loc[tr_idx].reset_index(drop=True)
        y_train = train_y.loc[tr_idx].reset_index(drop=True)
        x_valid = train_X.loc[val_idx].reset_index(drop=True)
        y_valid = train_y.loc[val_idx].reset_index(drop=True)

        study1 = optuna.create_study(direction='maximize')
        study1.optimize(fit_lgbm(x_train[self.feature_cols1], y_train['target1'], 
                                 x_valid[self.feature_cols1], y_valid['target1']), n_trials=100)
        
        study2 = optuna.create_study(direction='maximize')
        study2.optimize(fit_lgbm(x_train[self.feature_cols2], y_train['target2'], 
                                 x_valid[self.feature_cols2], y_valid['target2']), n_trials=100)
        
        study3 = optuna.create_study(direction='maximize')
        study3.optimize(fit_lgbm(x_train[self.feature_cols3], y_train['target3'], 
                                 x_valid[self.feature_cols3], y_valid['target3']), n_trials=100)
        
        study4 = optuna.create_study(direction='maximize')
        study4.optimize(fit_lgbm(x_train[self.feature_cols4], y_train['target4'], 
                                 x_valid[self.feature_cols4], y_valid['target4']), n_trials=100)

#         oof1, model1, score1 = fit_lgbm(
#             x_train[self.feature_cols1], y_train['target1'],
#             x_valid[self.feature_cols1], y_valid['target1'],
# #             self.params1
#         )
#         oof2, model2, score2 = fit_lgbm(
#             x_train[self.feature_cols2], y_train['target2'],
#             x_valid[self.feature_cols2], y_valid['target2'],
# #             self.params2
#         )
#         oof3, model3, score3 = fit_lgbm(
#             x_train[self.feature_cols3], y_train['target3'],
#             x_valid[self.feature_cols3], y_valid['target3'],
# #             self.params3
#         )
#         oof4, model4, score4 = fit_lgbm(
#             x_train[self.feature_cols4], y_train['target4'],
#             x_valid[self.feature_cols4], y_valid['target4'],
# #             self.params4
#         )

#         score = (score1+score2+score3+score4) / 4
#         print(f'score: {score}')

#         oof[val_idx, 0] = oof1
#         oof[val_idx, 1] = oof2
#         oof[val_idx, 2] = oof3
#         oof[val_idx, 3] = oof4
#         y_valids[val_idx, 0] = y_valid['target1'].values
#         y_valids[val_idx, 1] = y_valid['target2'].values
#         y_valids[val_idx, 2] = y_valid['target3'].values
#         y_valids[val_idx, 3] = y_valid['target4'].values

#         mae = mean_absolute_error(y_valids[val_idx, :], oof[val_idx, :])
#         print("mae:", mae)

#         val_idx_num = val_idx[val_idx==True].index.to_list()

#         oof_df = train[self.targets_cols]
#         oof_df.iloc[val_idx_num, 1:5] = oof[val_idx_num, :]

#         models = np.array([model1, model2, model3, model4])

        return study1, study2, study3, study4
            

In [18]:
rt4kaido_train = Rt4kaidoTrain(usetimelinefeature=True)
train, train_features_dict = rt4kaido_train.make_feature(train_elements_dict)

calc target stat ... 

100%|██████████| 2061/2061 [00:46<00:00, 44.59it/s]
100%|██████████| 30/30 [00:00<00:00, 41.67it/s]


done.
preprocess ... done.
creat feature ... 

100%|██████████| 20/20 [00:25<00:00,  1.27s/it]


done.


In [None]:
study1, study2, study3, study4 = rt4kaido_train.train_and_evaluate(train, isgamedayonly=False)

[32m[I 2021-07-14 01:44:50,000][0m A new study created in memory with name: no-name-9667ca66-655b-41c7-a2a0-c4f39d82c7ba[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.564894	valid_1's l1: 0.655065
[200]	training's l1: 0.561139	valid_1's l1: 0.650701
[300]	training's l1: 0.558349	valid_1's l1: 0.647436
[400]	training's l1: 0.557348	valid_1's l1: 0.645762
[500]	training's l1: 0.556111	valid_1's l1: 0.644732
[600]	training's l1: 0.555216	valid_1's l1: 0.643532
[700]	training's l1: 0.553992	valid_1's l1: 0.642313
Early stopping, best iteration is:
[646]	training's l1: 0.554114	valid_1's l1: 0.642199


[32m[I 2021-07-14 01:45:27,053][0m Trial 0 finished with value: -0.6418715981213559 and parameters: {'max_depth': 14, 'min_child_weight': 12, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_lambda': 834.5697475037534, 'reg_alpha': 0.007162174027091825, 'feature_fraction': 0.6413594113637033, 'bagging_fraction': 0.4187285868648857, 'bagging_freq': 4}. Best is trial 0 with value: -0.6418715981213559.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.572223	valid_1's l1: 0.66587
[200]	training's l1: 0.5626	valid_1's l1: 0.653162
[300]	training's l1: 0.559452	valid_1's l1: 0.648411
[400]	training's l1: 0.557575	valid_1's l1: 0.645997
[500]	training's l1: 0.555805	valid_1's l1: 0.642919
[600]	training's l1: 0.5551	valid_1's l1: 0.641973
[700]	training's l1: 0.553632	valid_1's l1: 0.639662
[800]	training's l1: 0.551562	valid_1's l1: 0.637548
[900]	training's l1: 0.550329	valid_1's l1: 0.636287
[1000]	training's l1: 0.549571	valid_1's l1: 0.635713
[1100]	training's l1: 0.549089	valid_1's l1: 0.635008
[1200]	training's l1: 0.548387	valid_1's l1: 0.634335
[1300]	training's l1: 0.547725	valid_1's l1: 0.63404
[1400]	training's l1: 0.547373	valid_1's l1: 0.633531
[1500]	training's l1: 0.547089	valid_1's l1: 0.633033
[1600]	training's l1: 0.546502	valid_1's l1: 0.632454
[1700]	training's l1: 0.546134	valid_1's l1: 0.631954
[1800]	training's l1: 0.545859	vali

[32m[I 2021-07-14 01:49:10,662][0m Trial 1 finished with value: -0.6239653851646876 and parameters: {'max_depth': 3, 'min_child_weight': 17, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_lambda': 0.002261020270215396, 'reg_alpha': 0.001429661411472613, 'feature_fraction': 0.9045488369053092, 'bagging_fraction': 0.5534054979530708, 'bagging_freq': 2}. Best is trial 1 with value: -0.6239653851646876.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.571713	valid_1's l1: 0.663952
[200]	training's l1: 0.56919	valid_1's l1: 0.660752
[300]	training's l1: 0.568091	valid_1's l1: 0.659276
[400]	training's l1: 0.567158	valid_1's l1: 0.657954
[500]	training's l1: 0.566159	valid_1's l1: 0.656914
[600]	training's l1: 0.565487	valid_1's l1: 0.656119
[700]	training's l1: 0.564834	valid_1's l1: 0.655335
[800]	training's l1: 0.564334	valid_1's l1: 0.654904
[900]	training's l1: 0.563931	valid_1's l1: 0.654232
[1000]	training's l1: 0.563329	valid_1's l1: 0.653435
[1100]	training's l1: 0.562903	valid_1's l1: 0.652861
[1200]	training's l1: 0.562483	valid_1's l1: 0.652192
[1300]	training's l1: 0.562118	valid_1's l1: 0.651852
[1400]	training's l1: 0.561947	valid_1's l1: 0.651386
[1500]	training's l1: 0.561613	valid_1's l1: 0.650959
[1600]	training's l1: 0.561443	valid_1's l1: 0.650556
[1700]	training's l1: 0.561237	valid_1's l1: 0.650388
[1800]	training's l1: 0.56117	

[32m[I 2021-07-14 01:52:59,659][0m Trial 2 finished with value: -0.6474282275366788 and parameters: {'max_depth': 12, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 0.5, 'reg_lambda': 13.605939848807163, 'reg_alpha': 595.2725244497872, 'feature_fraction': 0.5014238621099723, 'bagging_fraction': 0.4138373416692759, 'bagging_freq': 1}. Best is trial 1 with value: -0.6239653851646876.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.56314	valid_1's l1: 0.653926
[200]	training's l1: 0.562761	valid_1's l1: 0.653028
[300]	training's l1: 0.562336	valid_1's l1: 0.652694
[400]	training's l1: 0.56188	valid_1's l1: 0.651978


[32m[I 2021-07-14 01:53:28,580][0m Trial 3 finished with value: -0.6519432470220304 and parameters: {'max_depth': 19, 'min_child_weight': 15, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 0.012203366097362991, 'reg_alpha': 41.00687396115279, 'feature_fraction': 0.7980985941925297, 'bagging_fraction': 0.4013868071115955, 'bagging_freq': 5}. Best is trial 1 with value: -0.6239653851646876.[0m


Early stopping, best iteration is:
[398]	training's l1: 0.56188	valid_1's l1: 0.651978
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.563631	valid_1's l1: 0.655944
[200]	training's l1: 0.562103	valid_1's l1: 0.653616
[300]	training's l1: 0.560534	valid_1's l1: 0.651274
[400]	training's l1: 0.559974	valid_1's l1: 0.650647
[500]	training's l1: 0.55906	valid_1's l1: 0.649545
[600]	training's l1: 0.558613	valid_1's l1: 0.648848
[700]	training's l1: 0.558605	valid_1's l1: 0.648846
[800]	training's l1: 0.558293	valid_1's l1: 0.648586
[900]	training's l1: 0.55826	valid_1's l1: 0.648554
[1000]	training's l1: 0.557578	valid_1's l1: 0.647607
[1100]	training's l1: 0.556922	valid_1's l1: 0.64641
[1200]	training's l1: 0.554803	valid_1's l1: 0.643401
[1300]	training's l1: 0.553656	valid_1's l1: 0.641042
[1400]	training's l1: 0.553349	valid_1's l1: 0.640656
[1500]	training's l1: 0.552911	valid_1's l1: 0.640044
[1600]	training's l1: 0.552726	valid_1's l1: 0.639643

[32m[I 2021-07-14 01:58:08,098][0m Trial 4 finished with value: -0.6256323916835402 and parameters: {'max_depth': 14, 'min_child_weight': 3, 'subsample': 0.6, 'colsample_bytree': 0.9, 'reg_lambda': 0.23783754125367532, 'reg_alpha': 0.005228631604598618, 'feature_fraction': 0.9697364370008478, 'bagging_fraction': 0.6216540109598578, 'bagging_freq': 7}. Best is trial 1 with value: -0.6239653851646876.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.563645	valid_1's l1: 0.655883
[200]	training's l1: 0.563645	valid_1's l1: 0.655883


[32m[I 2021-07-14 01:58:29,456][0m Trial 5 finished with value: -0.65585371030115 and parameters: {'max_depth': 20, 'min_child_weight': 10, 'subsample': 0.5, 'colsample_bytree': 0.9, 'reg_lambda': 0.31616868309908347, 'reg_alpha': 0.17971922890884978, 'feature_fraction': 0.633631603752297, 'bagging_fraction': 0.9361907240990511, 'bagging_freq': 1}. Best is trial 1 with value: -0.6239653851646876.[0m


Early stopping, best iteration is:
[112]	training's l1: 0.563645	valid_1's l1: 0.655883
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.67177	valid_1's l1: 0.782915
[200]	training's l1: 0.67177	valid_1's l1: 0.782915
[300]	training's l1: 0.67177	valid_1's l1: 0.782915
[400]	training's l1: 0.67177	valid_1's l1: 0.782915


[32m[I 2021-07-14 01:58:46,724][0m Trial 6 finished with value: -0.7829151722430991 and parameters: {'max_depth': 1, 'min_child_weight': 19, 'subsample': 0.6, 'colsample_bytree': 0.8, 'reg_lambda': 369.5874986083517, 'reg_alpha': 80.04152897444875, 'feature_fraction': 0.6989933727038948, 'bagging_fraction': 0.6438050786678788, 'bagging_freq': 5}. Best is trial 1 with value: -0.6239653851646876.[0m


Early stopping, best iteration is:
[310]	training's l1: 0.67177	valid_1's l1: 0.782915
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.561572	valid_1's l1: 0.651645
[200]	training's l1: 0.559988	valid_1's l1: 0.649001
[300]	training's l1: 0.559298	valid_1's l1: 0.648028
[400]	training's l1: 0.559241	valid_1's l1: 0.647951
[500]	training's l1: 0.558559	valid_1's l1: 0.647017
[600]	training's l1: 0.558104	valid_1's l1: 0.646402
[700]	training's l1: 0.557634	valid_1's l1: 0.645546
[800]	training's l1: 0.557554	valid_1's l1: 0.645346
[900]	training's l1: 0.557554	valid_1's l1: 0.645346
[1000]	training's l1: 0.557429	valid_1's l1: 0.645153
[1100]	training's l1: 0.557429	valid_1's l1: 0.64515
Early stopping, best iteration is:
[1020]	training's l1: 0.557429	valid_1's l1: 0.64515


[32m[I 2021-07-14 02:00:24,504][0m Trial 7 finished with value: -0.6451325219210037 and parameters: {'max_depth': 16, 'min_child_weight': 6, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_lambda': 0.48567083665353084, 'reg_alpha': 0.02601863517960579, 'feature_fraction': 0.9290788717625078, 'bagging_fraction': 0.7292433267691041, 'bagging_freq': 3}. Best is trial 1 with value: -0.6239653851646876.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.594655	valid_1's l1: 0.695237
[200]	training's l1: 0.587732	valid_1's l1: 0.686958
[300]	training's l1: 0.583346	valid_1's l1: 0.681046
[400]	training's l1: 0.580821	valid_1's l1: 0.677624
[500]	training's l1: 0.579717	valid_1's l1: 0.676124
[600]	training's l1: 0.578637	valid_1's l1: 0.674495
[700]	training's l1: 0.577795	valid_1's l1: 0.673282
[800]	training's l1: 0.577383	valid_1's l1: 0.672602
[900]	training's l1: 0.577022	valid_1's l1: 0.671909
[1000]	training's l1: 0.576388	valid_1's l1: 0.671035
[1100]	training's l1: 0.576217	valid_1's l1: 0.670753
[1200]	training's l1: 0.576012	valid_1's l1: 0.670309
[1300]	training's l1: 0.5758	valid_1's l1: 0.669987
[1400]	training's l1: 0.57561	valid_1's l1: 0.669586
[1500]	training's l1: 0.575554	valid_1's l1: 0.669408
[1600]	training's l1: 0.575465	valid_1's l1: 0.669274
[1700]	training's l1: 0.575183	valid_1's l1: 0.668739
[1800]	training's l1: 0.57506	va

[32m[I 2021-07-14 02:01:48,844][0m Trial 8 finished with value: -0.6676663200014623 and parameters: {'max_depth': 2, 'min_child_weight': 16, 'subsample': 0.5, 'colsample_bytree': 0.5, 'reg_lambda': 0.0016522117248058369, 'reg_alpha': 0.0036426296047951247, 'feature_fraction': 0.7980183685342417, 'bagging_fraction': 0.5544189451550221, 'bagging_freq': 10}. Best is trial 1 with value: -0.6239653851646876.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.561005	valid_1's l1: 0.650978
[200]	training's l1: 0.560824	valid_1's l1: 0.650759
[300]	training's l1: 0.560824	valid_1's l1: 0.650759


[32m[I 2021-07-14 02:02:20,251][0m Trial 9 finished with value: -0.650708088315562 and parameters: {'max_depth': 9, 'min_child_weight': 7, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_lambda': 0.17938753504847493, 'reg_alpha': 0.0301580806761221, 'feature_fraction': 0.4895360095320663, 'bagging_fraction': 0.7599217102478333, 'bagging_freq': 1}. Best is trial 1 with value: -0.6239653851646876.[0m


Early stopping, best iteration is:
[233]	training's l1: 0.560824	valid_1's l1: 0.650759
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.554588	valid_1's l1: 0.644235
[200]	training's l1: 0.548944	valid_1's l1: 0.63659
[300]	training's l1: 0.545042	valid_1's l1: 0.63209
[400]	training's l1: 0.541914	valid_1's l1: 0.628839
[500]	training's l1: 0.539203	valid_1's l1: 0.6267
[600]	training's l1: 0.537709	valid_1's l1: 0.625484
[700]	training's l1: 0.535884	valid_1's l1: 0.623598
[800]	training's l1: 0.534283	valid_1's l1: 0.62216
[900]	training's l1: 0.533577	valid_1's l1: 0.621552
[1000]	training's l1: 0.532788	valid_1's l1: 0.620849
[1100]	training's l1: 0.531934	valid_1's l1: 0.620364
[1200]	training's l1: 0.530927	valid_1's l1: 0.619599
[1300]	training's l1: 0.53041	valid_1's l1: 0.619268
[1400]	training's l1: 0.530087	valid_1's l1: 0.619026
[1500]	training's l1: 0.529154	valid_1's l1: 0.618564
[1600]	training's l1: 0.528931	valid_1's l1: 0.618486
[

[32m[I 2021-07-14 02:05:13,280][0m Trial 10 finished with value: -0.616955853313987 and parameters: {'max_depth': 6, 'min_child_weight': 20, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_lambda': 0.0025590610175593173, 'reg_alpha': 3.339466913120142, 'feature_fraction': 0.8656147754738719, 'bagging_fraction': 0.8554769813752094, 'bagging_freq': 8}. Best is trial 10 with value: -0.616955853313987.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.550094	valid_1's l1: 0.64104
[200]	training's l1: 0.543424	valid_1's l1: 0.634362
[300]	training's l1: 0.538744	valid_1's l1: 0.629251
[400]	training's l1: 0.536239	valid_1's l1: 0.626088
[500]	training's l1: 0.534059	valid_1's l1: 0.624187
[600]	training's l1: 0.532742	valid_1's l1: 0.623226
[700]	training's l1: 0.531526	valid_1's l1: 0.622083
[800]	training's l1: 0.530482	valid_1's l1: 0.6214
[900]	training's l1: 0.529769	valid_1's l1: 0.620861
[1000]	training's l1: 0.529078	valid_1's l1: 0.620495
[1100]	training's l1: 0.528364	valid_1's l1: 0.620021
[1200]	training's l1: 0.527873	valid_1's l1: 0.619455
[1300]	training's l1: 0.527479	valid_1's l1: 0.619166
[1400]	training's l1: 0.52693	valid_1's l1: 0.618992
[1500]	training's l1: 0.526377	valid_1's l1: 0.618624
[1600]	training's l1: 0.526235	valid_1's l1: 0.618489
[1700]	training's l1: 0.526167	valid_1's l1: 0.618465
Early stopping, best iteration is

[32m[I 2021-07-14 02:07:47,115][0m Trial 11 finished with value: -0.6183607250801365 and parameters: {'max_depth': 5, 'min_child_weight': 20, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_lambda': 0.0010822208179335896, 'reg_alpha': 3.0872427383723338, 'feature_fraction': 0.862154012955894, 'bagging_fraction': 0.8974846173733539, 'bagging_freq': 8}. Best is trial 10 with value: -0.616955853313987.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.555682	valid_1's l1: 0.64463
[200]	training's l1: 0.549821	valid_1's l1: 0.636102
[300]	training's l1: 0.543921	valid_1's l1: 0.629852
[400]	training's l1: 0.538805	valid_1's l1: 0.625186
[500]	training's l1: 0.536513	valid_1's l1: 0.623027
[600]	training's l1: 0.532064	valid_1's l1: 0.619818
[700]	training's l1: 0.530235	valid_1's l1: 0.618817
[800]	training's l1: 0.529581	valid_1's l1: 0.618325
[900]	training's l1: 0.528628	valid_1's l1: 0.61787
[1000]	training's l1: 0.528072	valid_1's l1: 0.617643
Early stopping, best iteration is:
[972]	training's l1: 0.528175	valid_1's l1: 0.61764


[32m[I 2021-07-14 02:09:29,262][0m Trial 12 finished with value: -0.6175423421626197 and parameters: {'max_depth': 7, 'min_child_weight': 20, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_lambda': 0.01418368233926691, 'reg_alpha': 2.9749566007234756, 'feature_fraction': 0.8262902623711217, 'bagging_fraction': 0.928217715641226, 'bagging_freq': 8}. Best is trial 10 with value: -0.616955853313987.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.556701	valid_1's l1: 0.645686
[200]	training's l1: 0.550977	valid_1's l1: 0.637322
[300]	training's l1: 0.548285	valid_1's l1: 0.634255
[400]	training's l1: 0.54505	valid_1's l1: 0.630893
[500]	training's l1: 0.543116	valid_1's l1: 0.629339
[600]	training's l1: 0.54243	valid_1's l1: 0.628667
[700]	training's l1: 0.538782	valid_1's l1: 0.625378
[800]	training's l1: 0.537617	valid_1's l1: 0.6244
[900]	training's l1: 0.536874	valid_1's l1: 0.624104
[1000]	training's l1: 0.535595	valid_1's l1: 0.622765
[1100]	training's l1: 0.532752	valid_1's l1: 0.620083
[1200]	training's l1: 0.532274	valid_1's l1: 0.620032
[1300]	training's l1: 0.531985	valid_1's l1: 0.619846
[1400]	training's l1: 0.53134	valid_1's l1: 0.619182
[1500]	training's l1: 0.531063	valid_1's l1: 0.618972
[1600]	training's l1: 0.530847	valid_1's l1: 0.618798
[1700]	training's l1: 0.530709	valid_1's l1: 0.618772
[1800]	training's l1: 0.530287	val

[32m[I 2021-07-14 02:13:33,293][0m Trial 13 finished with value: -0.6158202647043306 and parameters: {'max_depth': 7, 'min_child_weight': 20, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_lambda': 0.022462361137322865, 'reg_alpha': 2.6255301153084414, 'feature_fraction': 0.8023180419033457, 'bagging_fraction': 0.8520203461668931, 'bagging_freq': 10}. Best is trial 13 with value: -0.6158202647043306.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.559574	valid_1's l1: 0.649486
[200]	training's l1: 0.554049	valid_1's l1: 0.642045
[300]	training's l1: 0.550613	valid_1's l1: 0.63783
[400]	training's l1: 0.547462	valid_1's l1: 0.633896
[500]	training's l1: 0.545021	valid_1's l1: 0.630737
[600]	training's l1: 0.543693	valid_1's l1: 0.629928
[700]	training's l1: 0.541408	valid_1's l1: 0.627575
[800]	training's l1: 0.540928	valid_1's l1: 0.627235
[900]	training's l1: 0.540633	valid_1's l1: 0.626761
[1000]	training's l1: 0.539021	valid_1's l1: 0.624737
[1100]	training's l1: 0.537542	valid_1's l1: 0.623943
[1200]	training's l1: 0.536195	valid_1's l1: 0.623347
[1300]	training's l1: 0.535917	valid_1's l1: 0.623155
[1400]	training's l1: 0.535384	valid_1's l1: 0.62272
[1500]	training's l1: 0.534676	valid_1's l1: 0.621932
[1600]	training's l1: 0.534408	valid_1's l1: 0.621745
[1700]	training's l1: 0.534024	valid_1's l1: 0.621411
[1800]	training's l1: 0.53382	v

[32m[I 2021-07-14 02:16:44,091][0m Trial 14 finished with value: -0.6193693381901162 and parameters: {'max_depth': 8, 'min_child_weight': 13, 'subsample': 0.9, 'colsample_bytree': 0.6, 'reg_lambda': 0.019801835208016566, 'reg_alpha': 0.8455362994412494, 'feature_fraction': 0.7349683093755618, 'bagging_fraction': 0.8284793328132405, 'bagging_freq': 10}. Best is trial 13 with value: -0.6158202647043306.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.552185	valid_1's l1: 0.642862
[200]	training's l1: 0.546359	valid_1's l1: 0.635546
[300]	training's l1: 0.543493	valid_1's l1: 0.632574
[400]	training's l1: 0.541687	valid_1's l1: 0.630246


In [24]:
print(f'best_score = {-study1.best_value}')
study1.best_trial.params

best_score = 0.6101065358576837


{'max_depth': 14,
 'min_child_weight': 6,
 'subsample': 0.6,
 'colsample_bytree': 0.9,
 'reg_lambda': 0.0018838140538476684,
 'reg_alpha': 0.08556914175201971,
 'feature_fraction': 0.4986705214389158,
 'bagging_fraction': 0.9999233378452145,
 'bagging_freq': 3}

In [25]:
print(f'best_score = {-study2.best_value}')
study2.best_trial.params

best_score = 1.1102271090816225


{'max_depth': 12,
 'min_child_weight': 17,
 'subsample': 0.9,
 'colsample_bytree': 0.6,
 'reg_lambda': 4.743867972743305,
 'reg_alpha': 5.017984477063532,
 'feature_fraction': 0.6654500771944255,
 'bagging_fraction': 0.8327191872719868,
 'bagging_freq': 3}

In [26]:
print(f'best_score = {-study3.best_value}')
study3.best_trial.params

best_score = 0.4839488001560449


{'max_depth': 8,
 'min_child_weight': 15,
 'subsample': 0.6,
 'colsample_bytree': 0.5,
 'reg_lambda': 1.630635008220753,
 'reg_alpha': 0.35226775968235935,
 'feature_fraction': 0.8501148986355881,
 'bagging_fraction': 0.7412745592610628,
 'bagging_freq': 10}

In [27]:
print(f'best_score = {-study4.best_value}')
study4.best_trial.params

best_score = 0.9808755908476768


{'max_depth': 6,
 'min_child_weight': 16,
 'subsample': 0.7,
 'colsample_bytree': 0.9,
 'reg_lambda': 180.23191595306406,
 'reg_alpha': 3.7328095545719884,
 'feature_fraction': 0.8956959834948048,
 'bagging_fraction': 0.7762124521203693,
 'bagging_freq': 2}