In [1]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)
import optuna
from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta

from sklearn.metrics import mean_absolute_error

import statistics as st
import lightgbm as lgbm
from scipy.stats import norm

In [2]:
sys.path.append('../../')
import src.utils as utils

## Param

In [3]:
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight']
rosters_cols = ['playerId', 'teamId', 'status', 'date']
transactions_cols = ['playerId', 'transaction_flag', 'date']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances', 'date']

feature_cols1 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info'] 

feature_cols2 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info'] 

feature_cols3 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info'] 

feature_cols4 = ['week_day', 'annual_day', 'month', 'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_birthCity',
                'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info'] 

In [4]:
# training lightgbm
params1 = {'objective':'mae',
           'reg_alpha': 0.14947461820098767, 
           'reg_lambda': 0.10185644384043743, 
           'n_estimators': 3633, 
           'learning_rate': 0.08046301304430488, 
           'num_leaves': 674, 
           'feature_fraction': 0.9101240539122566, 
           'bagging_fraction': 0.9884451442950513, 
           'bagging_freq': 8, 
           'min_child_samples': 51}

params2 = {'objective':'mae',
           'reg_alpha': 0.1,
           'reg_lambda': 0.1, 
           'n_estimators': 80,
           'learning_rate': 0.1,
           'random_state': 42,
           "num_leaves": 22}

params3 = {'objective':'mae',
           'reg_alpha': 0.1,
           'reg_lambda': 0.1, 
           'n_estimators': 10000,
           'learning_rate': 0.1,
           'random_state': 42,
           "num_leaves": 100}

params4 = {'objective':'mae',
           'reg_alpha': 0.016468100279441976, 
           'reg_lambda': 0.09128335764019105, 
           'n_estimators': 9868, 
           'learning_rate': 0.10528150510326864, 
           'num_leaves': 157, 
           'feature_fraction': 0.5419185713426886, 
           'bagging_fraction': 0.2637405128936662, 
           'bagging_freq': 19, 
           'min_child_samples': 71}

In [5]:
EXP_NUM = 24
NFOLDS = 5
SEED = 42

In [6]:
utils.set_seed(SEED)

## Dir

In [7]:
DATA_DIR = Path("/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/")
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = MAIN_DATA_DIR / 'train'
OUTPUT_DIR = Path('./output/')

In [8]:
players = pd.read_csv(MAIN_DATA_DIR / 'players.csv')

rosters = pd.read_csv(TRAIN_DIR / 'rosters_train.csv')
targets = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv')
scores = pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv')
scores = scores.groupby(['playerId', 'date']).sum().reset_index()
seasons = pd.read_csv(MAIN_DATA_DIR / 'seasons.csv')

# events = pd.read_csv(TRAIN_DIR / 'events_train.csv')
# events = events.groupby(['gameDate']).sum().reset_index()


In [9]:
scores

Unnamed: 0,playerId,date,home,gamePk,teamId,jerseyNum,positionCode,battingOrder,gamesPlayedBatting,flyOuts,...,sacBuntsPitching,sacFliesPitching,saves,holds,blownSaves,assists,putOuts,errors,chances,index
0,112526,20180402,0,529469,140,40.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,91
1,112526,20180408,1,529546,140,40.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,97
2,112526,20180410,1,529565,140,40.0,1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,99
3,112526,20180415,0,529640,140,40.0,1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,104
4,112526,20180421,1,529718,140,40.0,1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182945,685503,20210409,1,634478,140,35.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1194
182946,685503,20210414,0,634496,140,35.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1199
182947,685503,20210419,0,634536,140,35.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1204
182948,685503,20210425,0,634393,140,35.0,1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1210


In [10]:
seasons = seasons.fillna('0000-00-00')
for c_ in seasons.columns[1:]:
    seasons[c_] = seasons[c_].str.replace('-', '').astype(int)

In [11]:
seasons

Unnamed: 0,seasonId,seasonStartDate,seasonEndDate,preSeasonStartDate,preSeasonEndDate,regularSeasonStartDate,regularSeasonEndDate,lastDate1stHalf,allStarDate,firstDate2ndHalf,postSeasonStartDate,postSeasonEndDate
0,2017,20170402,20171101,20170222,20170401,20170402,20171001,20170709,20170711,20170714,20171003,20171101
1,2018,20180329,20181028,20180221,20180327,20180329,20181001,20180715,20180717,20180719,20181002,20181028
2,2019,20190320,20191030,20190221,20190326,20190320,20190929,20190707,20190709,20190711,20191001,20191030
3,2020,20200723,20201028,20200221,20200722,20200723,20200927,20200825,0,20200826,20200929,20201028
4,2021,20210228,20211031,20210228,20210330,20210401,20211003,20210711,20210713,20210715,20211004,20211031


In [12]:
targets_train = targets[(targets['date'] >= 20210401)]

In [13]:
targets_train

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,index,date
2444346,2021-04-02,624415,2.810640,3.998462,0.136137,10.012438,1186,20210401
2444347,2021-04-02,656548,0.000000,0.153787,0.068889,0.062189,1186,20210401
2444348,2021-04-02,400284,0.000971,0.010985,0.021323,3.182007,1186,20210401
2444349,2021-04-02,606157,0.002736,1.642226,0.021323,0.549337,1186,20210401
2444350,2021-04-02,665620,0.020473,0.411929,0.549469,0.435323,1186,20210401
...,...,...,...,...,...,...,...,...
2506171,2021-05-01,451661,0.000000,0.013314,0.000000,0.625925,1215,20210430
2506172,2021-05-01,519301,0.000131,0.003329,0.000000,0.216229,1215,20210430
2506173,2021-05-01,527055,0.000000,0.019971,0.000000,0.273131,1215,20210430
2506174,2021-05-01,543484,0.000131,0.056586,0.000000,1.024240,1215,20210430


In [14]:
playerId_list = targets_train['playerId'].unique()

In [15]:
def calc_corr(df):
    # 相関係数行列を作成
    corr_mat = df.corr(method='pearson')

    # 行（列）サイズを取得
    n = corr_mat.shape[0]
    corr_ary = []

    for i in range(n):
        for j in range(i):
            if i == j:
                continue
            corr_ary.append(corr_mat.iloc[i,j])

    return corr_ary

In [16]:
def calc_probs(pid,df,temp):
    to_append=[pid,'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','']
    targets=['target1','target2','target3','target4']
    z=1
    for target in targets:
        target_prob = temp[target].tolist()
        mean = np.mean(target_prob)
        std = np.std(target_prob)
        median = st.median(target_prob)
        distribution = norm(mean, std)
        min_weight = min(target_prob)
        max_weight = max(target_prob)
        values = list(np.linspace(min_weight, max_weight))
        probabilities = [distribution.pdf(v) for v in values]
        max_value = max(probabilities)
        max_index = probabilities.index(max_value)
        to_append[z]=mean
        to_append[z+1]=median
        to_append[z+2]=std
        to_append[z+3]=min_weight
        to_append[z+4]=max_weight
        to_append[z+5]=target_prob[max_index]
        to_append[z+6]=temp[target].skew()
        to_append[z+7]=temp[target].kurt()

        z=z+8
    corr_ = calc_corr(temp[['target1', 'target2', 'target3', 'target4']])
    to_append[z:] = corr_  
    df_length = len(df)
    df.loc[df_length] = to_append
    return df
    

### CREATE DATAFRAME to store probabilities
column_names = ["playerId", "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_prob","target1_skew","target1_kurt",
                "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_prob","target2_skew","target2_kurt",
                "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_prob","target3_skew","target3_kurt",
                "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_prob","target4_skew","target4_kurt",
                'tgt1_2_corr', 'tgt1_3_corr', 'tgt2_3_corr', 'tgt1_4_corr', 'tgt2_4_corr', 'tgt3_4_corr']
player_target_probs = pd.DataFrame(columns = column_names)
    
for pid in tqdm(playerId_list):
    temp = targets_train[targets_train['playerId'] == pid]
    player_target_stats=calc_probs(pid,player_target_probs,temp)

100%|██████████| 2061/2061 [00:45<00:00, 44.96it/s]


In [17]:
player_target_stats

Unnamed: 0,playerId,target1_mean,target1_median,target1_std,target1_min,target1_max,target1_prob,target1_skew,target1_kurt,target2_mean,...,target4_max,target4_prob,target4_skew,target4_kurt,tgt1_2_corr,tgt1_3_corr,tgt2_3_corr,tgt1_4_corr,tgt2_4_corr,tgt3_4_corr
0,624415.0,1.193031,0.489252,1.578499,0.008327,7.267988,0.785380,2.208850,6.071897,5.398585,...,26.025186,10.464387,1.695464,3.831404,0.408492,-0.009458,-0.101143,0.227373,0.694767,-0.039737
1,656548.0,0.110672,0.001015,0.428984,0.000000,2.392265,0.000000,5.272732,28.385845,0.445853,...,0.423573,0.032978,1.294196,1.140084,0.566411,0.588373,0.527886,0.231260,0.525065,0.132559
2,400284.0,0.001807,0.001301,0.001164,0.000437,0.004770,0.001127,1.155729,0.582752,0.073071,...,8.909871,7.882136,0.388255,-0.703079,0.102451,0.429495,0.046450,0.006606,0.277479,-0.016683
3,606157.0,0.748247,0.024461,1.318711,0.002736,4.840508,0.548613,2.003263,3.235374,2.027612,...,3.931330,0.231750,2.053818,4.653597,0.641382,0.093253,-0.002014,0.404372,0.626199,0.249109
4,665620.0,0.483331,0.020388,1.415708,0.004932,6.322554,0.128698,3.507992,11.893073,1.913634,...,8.474798,0.314951,4.910653,25.637658,0.807310,0.810125,0.965737,0.778467,0.954381,0.959152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2056,621350.0,0.000020,0.000000,0.000051,0.000000,0.000175,0.000175,2.331868,3.822563,0.007298,...,0.206009,0.017662,0.992179,0.627281,0.278321,,,-0.041837,0.060094,
2057,665759.0,0.000095,0.000000,0.000174,0.000000,0.000701,0.000000,2.033176,3.899046,0.238824,...,0.332631,0.019864,2.366135,7.230272,0.676003,,,0.678376,0.848313,
2058,448178.0,0.000020,0.000000,0.000052,0.000000,0.000174,0.000000,2.345681,3.902834,0.005605,...,0.350299,0.188265,2.416382,8.149779,-0.053275,-0.134373,-0.260662,-0.106479,-0.172380,-0.078929
2059,488681.0,0.000074,0.000000,0.000142,0.000000,0.000506,0.000000,2.153297,3.865929,0.012644,...,0.453328,0.075379,1.055577,1.166366,-0.114731,0.364257,0.095866,0.216720,0.301329,0.054098


In [18]:
data_names=player_target_stats.columns.values.tolist()

In [19]:
players['DOY'] = pd.to_datetime(players['DOB'], format="%Y-%m-%d").dt.year
players['mlbDebutYear'] = pd.to_datetime(players['mlbDebutDate'], format="%Y-%m-%d").dt.year
players['DebutAge'] = players['mlbDebutYear'] - players['DOY']

In [20]:
# creat dataset
train = targets[targets_cols].merge(players[players_cols], on=['playerId'], how='left')
train = train.merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
train = train.merge(scores[scores_cols], on=['playerId', 'date'], how='left')
train = train.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])


In [21]:
# label encoding
player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
birthCityn2num = {c: i for i, c in enumerate(train['birthCity'].unique())}
teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
status2num = {c: i for i, c in enumerate(train['status'].unique())}
train['label_playerId'] = train['playerId'].map(player2num)
train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
train['label_birthCity'] = train['birthCity'].map(birthCityn2num)
train['label_teamId'] = train['teamId'].map(teamid2num)
train['label_status'] = train['status'].map(status2num)

In [22]:
date_ = pd.to_datetime(train['date'], format="%Y%m%d")
train['annual_day'] = (date_ - pd.to_datetime(date_.dt.year, format="%Y")) /  timedelta(days=1)
train['week_day'] = date_.dt.weekday
train['month'] = date_.dt.month

In [23]:
train['gameday'] = ~train['battingOrder'].isna()*1

In [24]:
train.sort_values(by=['playerId','date'],inplace=True,ascending=True)

In [25]:
def count_consecutive_items_n_cols(df, col_name_list, output_col):
    cum_sum_list = [
        (df[col_name] != df[col_name].shift(1)).cumsum().tolist() for col_name in col_name_list
    ]
    df[output_col] = df.groupby(
        ["_".join(map(str, x)) for x in zip(*cum_sum_list)]
    ).cumcount() + 1
    return df

In [26]:
train=count_consecutive_items_n_cols(train,['playerId','gameday'],'daysSinceLastGame')
train.loc[train['gameday']==1,'daysSinceLastGame']=0

In [27]:
train_game = train[train['gameday']==1]
train_last_game = train_game[~train_game.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
train_last_game.columns = ['playerId', 'lastdate']
train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
train_last_game = pd.merge(train_player_unique, train_last_game, on=['playerId'], how='left' )
train_last_game = train_last_game.fillna(20171231)

In [28]:
train_last_game.sort_values('lastdate')

Unnamed: 0,playerId,lastdate
856,593590,20171231.0
1995,670462,20171231.0
2000,670764,20171231.0
1765,661269,20171231.0
1932,667674,20171231.0
...,...,...
473,542932,20210430.0
1480,641856,20210430.0
1481,641857,20210430.0
483,543037,20210430.0


In [29]:
def extract_season(date_raw, season_start_end):
    idxes = 0
    for raw in season_start_end.iloc():
        idx_ = ((date_raw >= raw.iloc[0]) & (date_raw <= raw.iloc[1])) * 1
        idxes += idx_
    return idxes

In [30]:
## special_daysに2たてんのあやしいけど，軽く見た感じ全部シーズン中っぽいからいいかなと．
on_preseason_idxes = extract_season(train['date'], seasons[['preSeasonStartDate', 'preSeasonEndDate']])
on_season_idxes = extract_season(train['date'], seasons[['regularSeasonStartDate', 'regularSeasonEndDate']]) * 2
on_postseason_idxes = extract_season(train['date'], seasons[['postSeasonStartDate', 'postSeasonEndDate']]) * 3

special_days = seasons['lastDate1stHalf'].to_list() + seasons['allStarDate'].to_list() + seasons['firstDate2ndHalf'].to_list()
special_idxes = 0
for day in special_days:
    special_idxes += (train['date'] == day) * 2
on_total_season_idxes = on_preseason_idxes + on_season_idxes + on_postseason_idxes + special_idxes

train['season_info'] = on_total_season_idxes

In [31]:
## only on season
on_whole_idxes = extract_season(train['date'], seasons[['seasonStartDate', 'seasonEndDate']])
train = train[on_whole_idxes == 1].reset_index(drop=True)

In [32]:
train

Unnamed: 0,playerId,target1,target2,target3,target4,date,primaryPositionName,birthCity,DOY,mlbDebutYear,...,label_primaryPositionName,label_birthCity,label_teamId,label_status,annual_day,week_day,month,gameday,daysSinceLastGame,season_info
0,112526,0.031761,2.731418,0.388556,6.349412,20180329,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,87.0,3,3,0,88,2
1,112526,0.025906,4.622162,0.408017,11.508375,20180330,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,88.0,4,3,0,89,2
2,112526,0.053185,4.767842,0.275408,14.600851,20180331,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,89.0,5,3,0,90,2
3,112526,0.771100,63.601677,7.566316,100.000000,20180401,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,90.0,6,4,0,91,2
4,112526,5.957846,22.427930,33.900803,38.857939,20180402,Pitcher,Altamira,1973,1997.0,...,0,588,7,0,91.0,0,4,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234534,685503,0.044617,1.224728,0.009437,0.737463,20210426,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,115.0,0,4,0,1,2
1234535,685503,0.019123,1.178880,0.013161,0.790301,20210427,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,116.0,1,4,0,2,2
1234536,685503,0.015799,4.323489,0.002350,0.970273,20210428,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,117.0,2,4,0,3,2
1234537,685503,0.018770,31.946021,0.305491,5.938273,20210429,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,118.0,3,4,0,4,2


In [33]:
# def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
#     oof_pred = np.zeros(len(y_valid), dtype=np.float32)
#     model = lgbm.LGBMRegressor(**params)
#     model.fit(x_train, y_train, 
#         eval_set=[(X_train, y_train), (x_valid, y_valid)],  
#         early_stopping_rounds=verbose, 
#         verbose=verbose)
#     oof_pred = model.predict(x_valid)
#     oof_pred = np.clip(oof_pred, 0, 100)
#     score = mean_absolute_error(oof_pred, y_valid)
#     print('mae:', score)
#     return oof_pred, model, score


In [34]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, verbose=100):
    def opt(trial):
        params = {
                'random_state': SEED,
                'objective':'mae',
                'n_estimators': 10000,
                'learning_rate': 0.1,
                'max_depth': trial.suggest_int('max_depth', 1, 20),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
                'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
                'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e3),
                'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e3),
                'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
                'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
        }

        model_opt = lgbm.LGBMRegressor(**params)

        model_opt.fit(x_train, y_train, 
            eval_set=[(X_train, y_train), (x_valid, y_valid)],  
            early_stopping_rounds=verbose, 
            verbose=verbose)
        oof_pred = model_opt.predict(x_valid)
        oof_pred = np.clip(oof_pred, 0, 100)
        score = mean_absolute_error(oof_pred, y_valid)
        return -score
    return opt

In [35]:
train_X = train
train_y = train[['target1', 'target2', 'target3', 'target4']]
tr_idxs = []
val_idxs = []

In [36]:
# tr_idx = (train['date'].astype(int) < 20200801)
# val_idx = (train['date'].astype(int) >= 20200801) & (train['date'].astype(int) < 20200901)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

# tr_idx = (train['date'].astype(int) < 20200901)
# val_idx = (train['date'].astype(int) >= 20200901) & (train['date'].astype(int) < 20201001)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

# tr_idx = (train['date'].astype(int) < 20201001)
# val_idx = (train['date'].astype(int) >= 20201001) & (train['date'].astype(int) < 20201028)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

# tr_idx = (train['date'].astype(int) < 20210228)
# val_idx = (train['date'].astype(int) >= 20210228) & (train['date'].astype(int) < 20210401)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

tr_idx = (train['date'].astype(int) < 20210401)
val_idx = ~tr_idx
tr_idxs.append(tr_idx)
val_idxs.append(val_idx)

In [37]:
idx = 0

tr_idx = tr_idxs[idx]
val_idx = val_idxs[idx]

x_train = train_X.loc[tr_idx].reset_index(drop=True)
y_train = train_y.loc[tr_idx].reset_index(drop=True)
x_valid = train_X.loc[val_idx].reset_index(drop=True)
y_valid = train_y.loc[val_idx].reset_index(drop=True)

In [None]:
study1 = optuna.create_study(direction='maximize')
study1.optimize(fit_lgbm(x_train[feature_cols1], y_train['target1'], x_valid[feature_cols1], y_valid['target1']), n_trials=100)

print('Number of finished trials:', len(study1.trials))
print('Best trial:', study1.best_trial.params)

[32m[I 2021-07-02 02:06:04,433][0m A new study created in memory with name: no-name-d0e5dada-999a-4e87-b1ea-09c1254270e7[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.716239
[200]	valid_0's l1: 0.714169
[300]	valid_0's l1: 0.712504
[400]	valid_0's l1: 0.710493
[500]	valid_0's l1: 0.707794
[600]	valid_0's l1: 0.70778
[700]	valid_0's l1: 0.704761
[800]	valid_0's l1: 0.702415
[900]	valid_0's l1: 0.701296
[1000]	valid_0's l1: 0.698081
[1100]	valid_0's l1: 0.693775
[1200]	valid_0's l1: 0.688719
[1300]	valid_0's l1: 0.687582
[1400]	valid_0's l1: 0.686349
[1500]	valid_0's l1: 0.686124
[1600]	valid_0's l1: 0.685821
[1700]	valid_0's l1: 0.683795
[1800]	valid_0's l1: 0.681825
[1900]	valid_0's l1: 0.681295
[2000]	valid_0's l1: 0.679707
[2100]	valid_0's l1: 0.677527
[2200]	valid_0's l1: 0.675775
[2300]	valid_0's l1: 0.673572
[2400]	valid_0's l1: 0.672637
[2500]	valid_0's l1: 0.67153
[2600]	valid_0's l1: 0.671553
[2700]	valid_0's l1: 0.670683
[2800]	valid_0's l1: 0.670045
[2900]	valid_0's l1: 0.669631
[3000]	valid_0's l1: 0.669481
[3100]	valid_0's l1: 0.669117
[3200]	valid_0's l1

[32m[I 2021-07-02 02:08:27,648][0m Trial 0 finished with value: -0.6643726270378127 and parameters: {'max_depth': 16, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_lambda': 0.18721155679236948, 'reg_alpha': 6.760665957617438, 'feature_fraction': 0.4728386646482762, 'bagging_fraction': 0.6903464369843542, 'bagging_freq': 2}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.713632
[200]	valid_0's l1: 0.710933
[300]	valid_0's l1: 0.710449
[400]	valid_0's l1: 0.709768
[500]	valid_0's l1: 0.706776
[600]	valid_0's l1: 0.705969
[700]	valid_0's l1: 0.705712
[800]	valid_0's l1: 0.703325
[900]	valid_0's l1: 0.699664
[1000]	valid_0's l1: 0.699353
[1100]	valid_0's l1: 0.696771
[1200]	valid_0's l1: 0.694589
[1300]	valid_0's l1: 0.690713
[1400]	valid_0's l1: 0.687487
[1500]	valid_0's l1: 0.687048
[1600]	valid_0's l1: 0.686564
[1700]	valid_0's l1: 0.683787
[1800]	valid_0's l1: 0.68225
[1900]	valid_0's l1: 0.680813
[2000]	valid_0's l1: 0.678795
[2100]	valid_0's l1: 0.678146
[2200]	valid_0's l1: 0.67697
[2300]	valid_0's l1: 0.675825
[2400]	valid_0's l1: 0.67501
[2500]	valid_0's l1: 0.672246
[2600]	valid_0's l1: 0.671557
[2700]	valid_0's l1: 0.671395
[2800]	valid_0's l1: 0.670826
[2900]	valid_0's l1: 0.670418
[3000]	valid_0's l1: 0.669893
[3100]	valid_0's l1: 0.669734
Early stopping, best

[32m[I 2021-07-02 02:09:55,712][0m Trial 1 finished with value: -0.6695878958797954 and parameters: {'max_depth': 16, 'min_child_weight': 18, 'subsample': 0.7, 'colsample_bytree': 0.5, 'reg_lambda': 67.38258051750081, 'reg_alpha': 0.5877315572347379, 'feature_fraction': 0.6325413100609413, 'bagging_fraction': 0.453509870573666, 'bagging_freq': 4}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.712495
[200]	valid_0's l1: 0.71139
[300]	valid_0's l1: 0.709912
[400]	valid_0's l1: 0.707888
[500]	valid_0's l1: 0.706045
[600]	valid_0's l1: 0.703453
[700]	valid_0's l1: 0.701834
[800]	valid_0's l1: 0.697411
Early stopping, best iteration is:
[775]	valid_0's l1: 0.697406


[32m[I 2021-07-02 02:10:33,877][0m Trial 2 finished with value: -0.6973891131182498 and parameters: {'max_depth': 17, 'min_child_weight': 18, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_lambda': 25.9303597447385, 'reg_alpha': 0.3738614738130248, 'feature_fraction': 0.5311469966197495, 'bagging_fraction': 0.8409681847808443, 'bagging_freq': 5}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.714802
[200]	valid_0's l1: 0.71394
[300]	valid_0's l1: 0.712765
[400]	valid_0's l1: 0.712704
[500]	valid_0's l1: 0.710115
[600]	valid_0's l1: 0.710071
[700]	valid_0's l1: 0.709728
[800]	valid_0's l1: 0.708782
[900]	valid_0's l1: 0.708663
Early stopping, best iteration is:
[861]	valid_0's l1: 0.70865


[32m[I 2021-07-02 02:11:17,839][0m Trial 3 finished with value: -0.7086303845809134 and parameters: {'max_depth': 14, 'min_child_weight': 2, 'subsample': 0.5, 'colsample_bytree': 0.5, 'reg_lambda': 0.025744666668719895, 'reg_alpha': 392.0557868702861, 'feature_fraction': 0.6518683020567371, 'bagging_fraction': 0.4415334713130477, 'bagging_freq': 1}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.800625
[200]	valid_0's l1: 0.800425
[300]	valid_0's l1: 0.800425


[32m[I 2021-07-02 02:11:27,228][0m Trial 4 finished with value: -0.8004251356571601 and parameters: {'max_depth': 1, 'min_child_weight': 4, 'subsample': 0.9, 'colsample_bytree': 0.8, 'reg_lambda': 2.9992022605716544, 'reg_alpha': 0.6051910599174318, 'feature_fraction': 0.9978124475997151, 'bagging_fraction': 0.8310551887531437, 'bagging_freq': 6}. Best is trial 0 with value: -0.6643726270378127.[0m


Early stopping, best iteration is:
[227]	valid_0's l1: 0.800425
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.717678
[200]	valid_0's l1: 0.717138
[300]	valid_0's l1: 0.717136
[400]	valid_0's l1: 0.716215
[500]	valid_0's l1: 0.715375
[600]	valid_0's l1: 0.71226
[700]	valid_0's l1: 0.712074
[800]	valid_0's l1: 0.708757
[900]	valid_0's l1: 0.707882
[1000]	valid_0's l1: 0.707883
Early stopping, best iteration is:
[902]	valid_0's l1: 0.707882


[32m[I 2021-07-02 02:12:01,532][0m Trial 5 finished with value: -0.7078543771892414 and parameters: {'max_depth': 12, 'min_child_weight': 9, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 0.06190181438808629, 'reg_alpha': 466.50028733948915, 'feature_fraction': 0.8889874382789595, 'bagging_fraction': 0.7750549497544781, 'bagging_freq': 7}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.71501
[200]	valid_0's l1: 0.71321
[300]	valid_0's l1: 0.709871
[400]	valid_0's l1: 0.70532
[500]	valid_0's l1: 0.702955
[600]	valid_0's l1: 0.702092
[700]	valid_0's l1: 0.7006
[800]	valid_0's l1: 0.696848
[900]	valid_0's l1: 0.696851
[1000]	valid_0's l1: 0.696328
[1100]	valid_0's l1: 0.695961
[1200]	valid_0's l1: 0.693658
[1300]	valid_0's l1: 0.690498
[1400]	valid_0's l1: 0.687805
[1500]	valid_0's l1: 0.68597
[1600]	valid_0's l1: 0.685155
[1700]	valid_0's l1: 0.684662
[1800]	valid_0's l1: 0.683005
[1900]	valid_0's l1: 0.681447
[2000]	valid_0's l1: 0.68076
[2100]	valid_0's l1: 0.679097
[2200]	valid_0's l1: 0.675783
[2300]	valid_0's l1: 0.67545
Early stopping, best iteration is:
[2243]	valid_0's l1: 0.675444


[32m[I 2021-07-02 02:13:07,474][0m Trial 6 finished with value: -0.6753154656432027 and parameters: {'max_depth': 16, 'min_child_weight': 6, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_lambda': 0.0959688476136587, 'reg_alpha': 19.013543546942625, 'feature_fraction': 0.4344977070045717, 'bagging_fraction': 0.5721527355576272, 'bagging_freq': 9}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.714229
[200]	valid_0's l1: 0.711221


[32m[I 2021-07-02 02:13:18,256][0m Trial 7 finished with value: -0.7112153044026099 and parameters: {'max_depth': 12, 'min_child_weight': 18, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 0.0030975558001931688, 'reg_alpha': 0.0026088651550572927, 'feature_fraction': 0.6214811069749684, 'bagging_fraction': 0.7276914828141826, 'bagging_freq': 5}. Best is trial 0 with value: -0.6643726270378127.[0m


Early stopping, best iteration is:
[146]	valid_0's l1: 0.711219
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.719631
[200]	valid_0's l1: 0.716579
[300]	valid_0's l1: 0.716575
[400]	valid_0's l1: 0.716523
[500]	valid_0's l1: 0.716521
[600]	valid_0's l1: 0.715664
[700]	valid_0's l1: 0.715639
[800]	valid_0's l1: 0.715586
[900]	valid_0's l1: 0.715582
[1000]	valid_0's l1: 0.714677
[1100]	valid_0's l1: 0.713605
[1200]	valid_0's l1: 0.713574
[1300]	valid_0's l1: 0.713014
[1400]	valid_0's l1: 0.712759
[1500]	valid_0's l1: 0.711171
[1600]	valid_0's l1: 0.710927
[1700]	valid_0's l1: 0.710306
[1800]	valid_0's l1: 0.70945
[1900]	valid_0's l1: 0.707454
[2000]	valid_0's l1: 0.703829
[2100]	valid_0's l1: 0.70322
[2200]	valid_0's l1: 0.702496
[2300]	valid_0's l1: 0.701335
[2400]	valid_0's l1: 0.701266
[2500]	valid_0's l1: 0.699445
[2600]	valid_0's l1: 0.698799
[2700]	valid_0's l1: 0.698614
[2800]	valid_0's l1: 0.696716
[2900]	valid_0's l1: 0.696688
[3000]	valid_0'

[32m[I 2021-07-02 02:14:43,468][0m Trial 8 finished with value: -0.6901005227659539 and parameters: {'max_depth': 14, 'min_child_weight': 9, 'subsample': 0.5, 'colsample_bytree': 0.5, 'reg_lambda': 221.48276892915197, 'reg_alpha': 670.4265992599105, 'feature_fraction': 0.9180738622718437, 'bagging_fraction': 0.5166636416469121, 'bagging_freq': 1}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.71383
[200]	valid_0's l1: 0.712925
[300]	valid_0's l1: 0.710635
[400]	valid_0's l1: 0.708674
[500]	valid_0's l1: 0.704316
[600]	valid_0's l1: 0.702584
[700]	valid_0's l1: 0.702486
[800]	valid_0's l1: 0.702437
[900]	valid_0's l1: 0.700855
[1000]	valid_0's l1: 0.69902
[1100]	valid_0's l1: 0.695683
[1200]	valid_0's l1: 0.695213
[1300]	valid_0's l1: 0.692193
[1400]	valid_0's l1: 0.689103
[1500]	valid_0's l1: 0.689104
Early stopping, best iteration is:
[1402]	valid_0's l1: 0.689051


[32m[I 2021-07-02 02:15:35,995][0m Trial 9 finished with value: -0.6890095955669474 and parameters: {'max_depth': 11, 'min_child_weight': 9, 'subsample': 0.6, 'colsample_bytree': 0.5, 'reg_lambda': 0.1258837791715338, 'reg_alpha': 69.69365926605171, 'feature_fraction': 0.5202385892944571, 'bagging_fraction': 0.6871621408414655, 'bagging_freq': 8}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.699288
[200]	valid_0's l1: 0.698219
[300]	valid_0's l1: 0.697708
[400]	valid_0's l1: 0.696835
[500]	valid_0's l1: 0.694173
[600]	valid_0's l1: 0.691998
[700]	valid_0's l1: 0.690755
[800]	valid_0's l1: 0.68836
[900]	valid_0's l1: 0.68799
[1000]	valid_0's l1: 0.685772
[1100]	valid_0's l1: 0.682854
[1200]	valid_0's l1: 0.682416
[1300]	valid_0's l1: 0.681304
[1400]	valid_0's l1: 0.679428
[1500]	valid_0's l1: 0.678849
[1600]	valid_0's l1: 0.677781
[1700]	valid_0's l1: 0.676855
[1800]	valid_0's l1: 0.675941
[1900]	valid_0's l1: 0.674061
[2000]	valid_0's l1: 0.673539
[2100]	valid_0's l1: 0.672945
Early stopping, best iteration is:
[2016]	valid_0's l1: 0.672843


[32m[I 2021-07-02 02:17:14,190][0m Trial 10 finished with value: -0.6727365147016211 and parameters: {'max_depth': 6, 'min_child_weight': 14, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_lambda': 2.9905701868192196, 'reg_alpha': 0.015139084077234459, 'feature_fraction': 0.40454891385018676, 'bagging_fraction': 0.9862582444257333, 'bagging_freq': 3}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.714572
[200]	valid_0's l1: 0.711938
[300]	valid_0's l1: 0.711673
[400]	valid_0's l1: 0.71074
[500]	valid_0's l1: 0.709838
[600]	valid_0's l1: 0.709718
[700]	valid_0's l1: 0.707826
[800]	valid_0's l1: 0.70692
[900]	valid_0's l1: 0.70457
[1000]	valid_0's l1: 0.702479
[1100]	valid_0's l1: 0.701681
[1200]	valid_0's l1: 0.701123
[1300]	valid_0's l1: 0.70107
[1400]	valid_0's l1: 0.700456
[1500]	valid_0's l1: 0.696164
[1600]	valid_0's l1: 0.695942
[1700]	valid_0's l1: 0.693858
[1800]	valid_0's l1: 0.691569
[1900]	valid_0's l1: 0.689941
[2000]	valid_0's l1: 0.687108
[2100]	valid_0's l1: 0.686118
[2200]	valid_0's l1: 0.684611
[2300]	valid_0's l1: 0.683038
[2400]	valid_0's l1: 0.68208
[2500]	valid_0's l1: 0.681769
[2600]	valid_0's l1: 0.681053
[2700]	valid_0's l1: 0.68008
[2800]	valid_0's l1: 0.678246
[2900]	valid_0's l1: 0.676769
[3000]	valid_0's l1: 0.676485
[3100]	valid_0's l1: 0.675951
Early stopping, best it

[32m[I 2021-07-02 02:18:59,661][0m Trial 11 finished with value: -0.6757464055015447 and parameters: {'max_depth': 20, 'min_child_weight': 14, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_lambda': 429.30018236548045, 'reg_alpha': 5.208379627522624, 'feature_fraction': 0.7416171923018715, 'bagging_fraction': 0.602122723270735, 'bagging_freq': 3}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.715466
[200]	valid_0's l1: 0.710855


[32m[I 2021-07-02 02:19:09,213][0m Trial 12 finished with value: -0.7108548401447013 and parameters: {'max_depth': 20, 'min_child_weight': 1, 'subsample': 0.6, 'colsample_bytree': 0.6, 'reg_lambda': 24.432775484637766, 'reg_alpha': 0.05345824694675905, 'feature_fraction': 0.7853726077036347, 'bagging_fraction': 0.41541773021082545, 'bagging_freq': 3}. Best is trial 0 with value: -0.6643726270378127.[0m


Early stopping, best iteration is:
[197]	valid_0's l1: 0.710855
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.713167
[200]	valid_0's l1: 0.711845
[300]	valid_0's l1: 0.708322
[400]	valid_0's l1: 0.707881
[500]	valid_0's l1: 0.706436
[600]	valid_0's l1: 0.705162
[700]	valid_0's l1: 0.698546
[800]	valid_0's l1: 0.698541
[900]	valid_0's l1: 0.697304
[1000]	valid_0's l1: 0.694657
[1100]	valid_0's l1: 0.694056
[1200]	valid_0's l1: 0.692772
[1300]	valid_0's l1: 0.691049
[1400]	valid_0's l1: 0.686304
[1500]	valid_0's l1: 0.682768
[1600]	valid_0's l1: 0.681872
[1700]	valid_0's l1: 0.680342
[1800]	valid_0's l1: 0.678692
[1900]	valid_0's l1: 0.677153
[2000]	valid_0's l1: 0.676146
[2100]	valid_0's l1: 0.675388
[2200]	valid_0's l1: 0.674322
[2300]	valid_0's l1: 0.673259
[2400]	valid_0's l1: 0.67228
[2500]	valid_0's l1: 0.671708
[2600]	valid_0's l1: 0.671511
[2700]	valid_0's l1: 0.67054
[2800]	valid_0's l1: 0.669986
[2900]	valid_0's l1: 0.669683
[3000]	valid_0'

[32m[I 2021-07-02 02:21:55,656][0m Trial 13 finished with value: -0.6660498635474172 and parameters: {'max_depth': 18, 'min_child_weight': 20, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_lambda': 0.5958250807678849, 'reg_alpha': 3.2103443661105553, 'feature_fraction': 0.5374469924180123, 'bagging_fraction': 0.9898641995299122, 'bagging_freq': 3}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.707273
[200]	valid_0's l1: 0.706467
[300]	valid_0's l1: 0.706461
[400]	valid_0's l1: 0.705805
[500]	valid_0's l1: 0.703224
[600]	valid_0's l1: 0.701832
[700]	valid_0's l1: 0.700926
[800]	valid_0's l1: 0.699839
[900]	valid_0's l1: 0.699737
[1000]	valid_0's l1: 0.699501
[1100]	valid_0's l1: 0.693472
[1200]	valid_0's l1: 0.691961
[1300]	valid_0's l1: 0.684424
[1400]	valid_0's l1: 0.683658
[1500]	valid_0's l1: 0.682172
[1600]	valid_0's l1: 0.680804
[1700]	valid_0's l1: 0.68065
[1800]	valid_0's l1: 0.680285
[1900]	valid_0's l1: 0.677212
[2000]	valid_0's l1: 0.677105
[2100]	valid_0's l1: 0.676157
[2200]	valid_0's l1: 0.674646
[2300]	valid_0's l1: 0.673611
[2400]	valid_0's l1: 0.673138
[2500]	valid_0's l1: 0.671957
[2600]	valid_0's l1: 0.671401
[2700]	valid_0's l1: 0.670737
[2800]	valid_0's l1: 0.670389
[2900]	valid_0's l1: 0.669844
[3000]	valid_0's l1: 0.669749
[3100]	valid_0's l1: 0.668494
[3200]	valid_0's l

[32m[I 2021-07-02 02:23:56,944][0m Trial 14 finished with value: -0.6678726850217395 and parameters: {'max_depth': 7, 'min_child_weight': 13, 'subsample': 0.8, 'colsample_bytree': 0.7, 'reg_lambda': 0.5965256880758975, 'reg_alpha': 4.199819762314512, 'feature_fraction': 0.493661165652356, 'bagging_fraction': 0.9964048634333307, 'bagging_freq': 1}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.718199
[200]	valid_0's l1: 0.715774


[32m[I 2021-07-02 02:24:13,909][0m Trial 15 finished with value: -0.7157712553179746 and parameters: {'max_depth': 19, 'min_child_weight': 6, 'subsample': 0.6, 'colsample_bytree': 0.7, 'reg_lambda': 0.005269000616778802, 'reg_alpha': 46.687786578582696, 'feature_fraction': 0.566281814117628, 'bagging_fraction': 0.9246090914721424, 'bagging_freq': 2}. Best is trial 0 with value: -0.6643726270378127.[0m


Early stopping, best iteration is:
[178]	valid_0's l1: 0.715773
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.712778
[200]	valid_0's l1: 0.711597
[300]	valid_0's l1: 0.711417
[400]	valid_0's l1: 0.710561
[500]	valid_0's l1: 0.709798
[600]	valid_0's l1: 0.709259
[700]	valid_0's l1: 0.706767
[800]	valid_0's l1: 0.704291
[900]	valid_0's l1: 0.701612
[1000]	valid_0's l1: 0.699987
[1100]	valid_0's l1: 0.696262
[1200]	valid_0's l1: 0.694552
[1300]	valid_0's l1: 0.692373
[1400]	valid_0's l1: 0.689629
[1500]	valid_0's l1: 0.689514
[1600]	valid_0's l1: 0.686411
[1700]	valid_0's l1: 0.686154
[1800]	valid_0's l1: 0.682852
[1900]	valid_0's l1: 0.681783
[2000]	valid_0's l1: 0.681263
[2100]	valid_0's l1: 0.676355
[2200]	valid_0's l1: 0.675696
[2300]	valid_0's l1: 0.675257
[2400]	valid_0's l1: 0.674805
[2500]	valid_0's l1: 0.67471
[2600]	valid_0's l1: 0.673315
[2700]	valid_0's l1: 0.672469
[2800]	valid_0's l1: 0.669545
[2900]	valid_0's l1: 0.669241
[3000]	valid_0

[32m[I 2021-07-02 02:26:33,682][0m Trial 16 finished with value: -0.6667532088208816 and parameters: {'max_depth': 18, 'min_child_weight': 5, 'subsample': 0.9, 'colsample_bytree': 0.6, 'reg_lambda': 0.4357458727049655, 'reg_alpha': 5.80706340255427, 'feature_fraction': 0.44652502730879295, 'bagging_fraction': 0.6663659649480198, 'bagging_freq': 2}. Best is trial 0 with value: -0.6643726270378127.[0m


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.710632
[200]	valid_0's l1: 0.710359
[300]	valid_0's l1: 0.705664


In [None]:
study2 = optuna.create_study(direction='maximize')
study2.optimize(fit_lgbm(x_train[feature_cols2], y_train['target2'], x_valid[feature_cols2], y_valid['target2']), n_trials=100)

print('Number of finished trials:', len(study2.trials))
print('Best trial:', study2.best_trial.params)

In [None]:
study3 = optuna.create_study(direction='maximize')
study3.optimize(fit_lgbm(x_train[feature_cols3], y_train['target3'], x_valid[feature_cols3], y_valid['target3']), n_trials=100)

print('Number of finished trials:', len(study3.trials))
print('Best trial:', study3.best_trial.params)

In [None]:
study4 = optuna.create_study(direction='maximize')
study4.optimize(fit_lgbm(x_train[feature_cols4], y_train['target4'], x_valid[feature_cols4], y_valid['target4']), n_trials=100)

print('Number of finished trials:', len(study4.trials))
print('Best trial:', study4.best_trial.params)