In [1]:
import gc
import os
import sys
import math
import random
import warnings
import pickle
from pathlib import Path
from joblib import Parallel, delayed
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)
import optuna
from tqdm import tqdm
import os
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from pandarallel import pandarallel
# pandarallel.initialize()
warnings.simplefilter("ignore")
import ctypes as ct
from datetime import timedelta

from sklearn.metrics import mean_absolute_error

import statistics as st
import lightgbm as lgbm
from scipy.stats import norm

In [2]:
sys.path.append('../../')
import src.utils as utils

## Param

In [3]:
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName', 'birthCity', 'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight']
rosters_cols = ['playerId', 'teamId', 'status', 'date']
transactions_cols = ['playerId', 'transaction_flag', 'date']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances', 'date']

feature_cols1 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info','team_home', 'team_gamePk', 'team_flyOuts',
       'team_groundOuts', 'team_runsScored', 'team_doubles', 'team_triples',
       'team_homeRuns', 'team_strikeOuts', 'team_baseOnBalls',
       'team_intentionalWalks', 'team_hits', 'team_hitByPitch', 'team_atBats',
       'team_caughtStealing', 'team_stolenBases', 'team_groundIntoDoublePlay',
       'team_groundIntoTriplePlay', 'team_plateAppearances', 'team_totalBases',
       'team_rbi', 'team_leftOnBase', 'team_sacBunts', 'team_sacFlies',
       'team_catchersInterference', 'team_pickoffs', 'team_airOutsPitching',
       'team_groundOutsPitching', 'team_runsPitching', 'team_doublesPitching',
       'team_triplesPitching', 'team_homeRunsPitching',
       'team_strikeOutsPitching', 'team_baseOnBallsPitching',
       'team_intentionalWalksPitching', 'team_hitsPitching',
       'team_hitByPitchPitching', 'team_atBatsPitching',
       'team_caughtStealingPitching', 'team_stolenBasesPitching',
       'team_inningsPitched', 'team_earnedRuns', 'team_battersFaced',
       'team_outsPitching', 'team_hitBatsmen', 'team_balks',
       'team_wildPitches', 'team_pickoffsPitching', 'team_rbiPitching',
       'team_inheritedRunners', 'team_inheritedRunnersScored',
       'team_catchersInterferencePitching', 'team_sacBuntsPitching',
       'team_sacFliesPitching'] 

feature_cols2 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info', 'team_home', 'team_gamePk', 'team_flyOuts',
       'team_groundOuts', 'team_runsScored', 'team_doubles', 'team_triples',
       'team_homeRuns', 'team_strikeOuts', 'team_baseOnBalls',
       'team_intentionalWalks', 'team_hits', 'team_hitByPitch', 'team_atBats',
       'team_caughtStealing', 'team_stolenBases', 'team_groundIntoDoublePlay',
       'team_groundIntoTriplePlay', 'team_plateAppearances', 'team_totalBases',
       'team_rbi', 'team_leftOnBase', 'team_sacBunts', 'team_sacFlies',
       'team_catchersInterference', 'team_pickoffs', 'team_airOutsPitching',
       'team_groundOutsPitching', 'team_runsPitching', 'team_doublesPitching',
       'team_triplesPitching', 'team_homeRunsPitching',
       'team_strikeOutsPitching', 'team_baseOnBallsPitching',
       'team_intentionalWalksPitching', 'team_hitsPitching',
       'team_hitByPitchPitching', 'team_atBatsPitching',
       'team_caughtStealingPitching', 'team_stolenBasesPitching',
       'team_inningsPitched', 'team_earnedRuns', 'team_battersFaced',
       'team_outsPitching', 'team_hitBatsmen', 'team_balks',
       'team_wildPitches', 'team_pickoffsPitching', 'team_rbiPitching',
       'team_inheritedRunners', 'team_inheritedRunnersScored',
       'team_catchersInterferencePitching', 'team_sacBuntsPitching',
       'team_sacFliesPitching'] 

feature_cols3 = ['week_day','label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info', 'team_home', 'team_gamePk', 'team_flyOuts',
       'team_groundOuts', 'team_runsScored', 'team_doubles', 'team_triples',
       'team_homeRuns', 'team_strikeOuts', 'team_baseOnBalls',
       'team_intentionalWalks', 'team_hits', 'team_hitByPitch', 'team_atBats',
       'team_caughtStealing', 'team_stolenBases', 'team_groundIntoDoublePlay',
       'team_groundIntoTriplePlay', 'team_plateAppearances', 'team_totalBases',
       'team_rbi', 'team_leftOnBase', 'team_sacBunts', 'team_sacFlies',
       'team_catchersInterference', 'team_pickoffs', 'team_airOutsPitching',
       'team_groundOutsPitching', 'team_runsPitching', 'team_doublesPitching',
       'team_triplesPitching', 'team_homeRunsPitching',
       'team_strikeOutsPitching', 'team_baseOnBallsPitching',
       'team_intentionalWalksPitching', 'team_hitsPitching',
       'team_hitByPitchPitching', 'team_atBatsPitching',
       'team_caughtStealingPitching', 'team_stolenBasesPitching',
       'team_inningsPitched', 'team_earnedRuns', 'team_battersFaced',
       'team_outsPitching', 'team_hitBatsmen', 'team_balks',
       'team_wildPitches', 'team_pickoffsPitching', 'team_rbiPitching',
       'team_inheritedRunners', 'team_inheritedRunnersScored',
       'team_catchersInterferencePitching', 'team_sacBuntsPitching',
       'team_sacFliesPitching'] 

feature_cols4 = ['week_day', 'annual_day', 'month', 'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_birthCity',
                'DOY', 'mlbDebutYear', 'DebutAge', 'heightInches', 'weight',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
        "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_skew","target1_kurt",
         "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_skew","target2_kurt",
        "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_skew","target3_kurt",
        "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_skew","target4_kurt", 
        'daysSinceLastGame', 'season_info', 'team_home', 'team_gamePk', 'team_flyOuts',
       'team_groundOuts', 'team_runsScored', 'team_doubles', 'team_triples',
       'team_homeRuns', 'team_strikeOuts', 'team_baseOnBalls',
       'team_intentionalWalks', 'team_hits', 'team_hitByPitch', 'team_atBats',
       'team_caughtStealing', 'team_stolenBases', 'team_groundIntoDoublePlay',
       'team_groundIntoTriplePlay', 'team_plateAppearances', 'team_totalBases',
       'team_rbi', 'team_leftOnBase', 'team_sacBunts', 'team_sacFlies',
       'team_catchersInterference', 'team_pickoffs', 'team_airOutsPitching',
       'team_groundOutsPitching', 'team_runsPitching', 'team_doublesPitching',
       'team_triplesPitching', 'team_homeRunsPitching',
       'team_strikeOutsPitching', 'team_baseOnBallsPitching',
       'team_intentionalWalksPitching', 'team_hitsPitching',
       'team_hitByPitchPitching', 'team_atBatsPitching',
       'team_caughtStealingPitching', 'team_stolenBasesPitching',
       'team_inningsPitched', 'team_earnedRuns', 'team_battersFaced',
       'team_outsPitching', 'team_hitBatsmen', 'team_balks',
       'team_wildPitches', 'team_pickoffsPitching', 'team_rbiPitching',
       'team_inheritedRunners', 'team_inheritedRunnersScored',
       'team_catchersInterferencePitching', 'team_sacBuntsPitching',
       'team_sacFliesPitching'] 

In [4]:
# training lightgbm
params1 = {'objective':'mae',
           'reg_alpha': 0.14947461820098767, 
           'reg_lambda': 0.10185644384043743, 
           'n_estimators': 3633, 
           'learning_rate': 0.08046301304430488, 
           'num_leaves': 674, 
           'feature_fraction': 0.9101240539122566, 
           'bagging_fraction': 0.9884451442950513, 
           'bagging_freq': 8, 
           'min_child_samples': 51}

params2 = {'objective':'mae',
           'reg_alpha': 0.1,
           'reg_lambda': 0.1, 
           'n_estimators': 80,
           'learning_rate': 0.1,
           'random_state': 42,
           "num_leaves": 22}

params3 = {'objective':'mae',
           'reg_alpha': 0.1,
           'reg_lambda': 0.1, 
           'n_estimators': 10000,
           'learning_rate': 0.1,
           'random_state': 42,
           "num_leaves": 100}

params4 = {'objective':'mae',
           'reg_alpha': 0.016468100279441976, 
           'reg_lambda': 0.09128335764019105, 
           'n_estimators': 9868, 
           'learning_rate': 0.10528150510326864, 
           'num_leaves': 157, 
           'feature_fraction': 0.5419185713426886, 
           'bagging_fraction': 0.2637405128936662, 
           'bagging_freq': 19, 
           'min_child_samples': 71}

In [5]:
EXP_NUM = 24
NFOLDS = 5
SEED = 42

In [6]:
utils.set_seed(SEED)

## Dir

In [7]:
DATA_DIR = Path("/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/")
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = MAIN_DATA_DIR / 'train'
OUTPUT_DIR = Path('./output/')

In [8]:
players = pd.read_csv(MAIN_DATA_DIR / 'players.csv')

rosters = pd.read_csv(TRAIN_DIR / 'rosters_train.csv')
targets = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv')
scores = pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv')
scores = scores.groupby(['playerId', 'date']).sum().reset_index()
team_scores = pd.read_csv(TRAIN_DIR / 'teamBoxScores_train.csv')
team_scores = team_scores.groupby(['teamId', 'date']).sum().reset_index()

seasons = pd.read_csv(MAIN_DATA_DIR / 'seasons.csv')

# events = pd.read_csv(TRAIN_DIR / 'events_train.csv')
# events = events.groupby(['gameDate']).sum().reset_index()


In [9]:
c_1 = list(team_scores.iloc[:, :2].columns)
c_2 = list(team_scores.iloc[:, 2:].add_prefix('team_').columns)
team_scores.columns = c_1 + c_2
team_scores

Unnamed: 0,teamId,date,team_home,team_gamePk,team_flyOuts,team_groundOuts,team_runsScored,team_doubles,team_triples,team_homeRuns,...,team_balks,team_wildPitches,team_pickoffsPitching,team_rbiPitching,team_inheritedRunners,team_inheritedRunnersScored,team_catchersInterferencePitching,team_sacBuntsPitching,team_sacFliesPitching,team_index
0,108,20180329,0,529412,6,11,5,2,1,3,...,0,0,0,6,0,0,0,0,0,87
1,108,20180330,0,529425,5,9,2,0,0,1,...,0,0,0,1,0,0,0,0,0,88
2,108,20180331,0,529436,3,11,8,6,1,0,...,0,0,0,3,0,0,0,0,0,89
3,108,20180401,0,529450,5,12,7,4,0,0,...,0,0,0,4,0,0,0,0,0,90
4,108,20180402,1,529461,5,8,0,1,0,0,...,0,0,0,5,0,0,0,0,0,91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12253,158,20210430,1,634290,5,10,3,1,0,1,...,0,0,0,1,0,0,0,0,0,1215
12254,159,20180717,0,530856,5,7,8,0,0,5,...,0,0,0,6,0,0,0,0,0,197
12255,159,20190709,1,567633,4,10,4,2,0,1,...,0,0,0,3,0,0,0,0,0,554
12256,160,20180717,1,530856,9,7,6,1,0,5,...,0,0,0,8,0,0,0,0,1,197


In [10]:
seasons = seasons.fillna('0000-00-00')
for c_ in seasons.columns[1:]:
    seasons[c_] = seasons[c_].str.replace('-', '').astype(int)

In [11]:
seasons

Unnamed: 0,seasonId,seasonStartDate,seasonEndDate,preSeasonStartDate,preSeasonEndDate,regularSeasonStartDate,regularSeasonEndDate,lastDate1stHalf,allStarDate,firstDate2ndHalf,postSeasonStartDate,postSeasonEndDate
0,2017,20170402,20171101,20170222,20170401,20170402,20171001,20170709,20170711,20170714,20171003,20171101
1,2018,20180329,20181028,20180221,20180327,20180329,20181001,20180715,20180717,20180719,20181002,20181028
2,2019,20190320,20191030,20190221,20190326,20190320,20190929,20190707,20190709,20190711,20191001,20191030
3,2020,20200723,20201028,20200221,20200722,20200723,20200927,20200825,0,20200826,20200929,20201028
4,2021,20210228,20211031,20210228,20210330,20210401,20211003,20210711,20210713,20210715,20211004,20211031


In [12]:
targets_train = targets[(targets['date'] >= 20210401)]

In [13]:
targets_train

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,index,date
2444346,2021-04-02,624415,2.810640,3.998462,0.136137,10.012438,1186,20210401
2444347,2021-04-02,656548,0.000000,0.153787,0.068889,0.062189,1186,20210401
2444348,2021-04-02,400284,0.000971,0.010985,0.021323,3.182007,1186,20210401
2444349,2021-04-02,606157,0.002736,1.642226,0.021323,0.549337,1186,20210401
2444350,2021-04-02,665620,0.020473,0.411929,0.549469,0.435323,1186,20210401
...,...,...,...,...,...,...,...,...
2506171,2021-05-01,451661,0.000000,0.013314,0.000000,0.625925,1215,20210430
2506172,2021-05-01,519301,0.000131,0.003329,0.000000,0.216229,1215,20210430
2506173,2021-05-01,527055,0.000000,0.019971,0.000000,0.273131,1215,20210430
2506174,2021-05-01,543484,0.000131,0.056586,0.000000,1.024240,1215,20210430


In [14]:
playerId_list = targets_train['playerId'].unique()

In [15]:
def calc_corr(df):
    # 相関係数行列を作成
    corr_mat = df.corr(method='pearson')

    # 行（列）サイズを取得
    n = corr_mat.shape[0]
    corr_ary = []

    for i in range(n):
        for j in range(i):
            if i == j:
                continue
            corr_ary.append(corr_mat.iloc[i,j])

    return corr_ary

In [16]:
def calc_probs(pid,df,temp):
    to_append=[pid,'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','']
    targets=['target1','target2','target3','target4']
    z=1
    for target in targets:
        target_prob = temp[target].tolist()
        mean = np.mean(target_prob)
        std = np.std(target_prob)
        median = st.median(target_prob)
        distribution = norm(mean, std)
        min_weight = min(target_prob)
        max_weight = max(target_prob)
        values = list(np.linspace(min_weight, max_weight))
        probabilities = [distribution.pdf(v) for v in values]
        max_value = max(probabilities)
        max_index = probabilities.index(max_value)
        to_append[z]=mean
        to_append[z+1]=median
        to_append[z+2]=std
        to_append[z+3]=min_weight
        to_append[z+4]=max_weight
        to_append[z+5]=target_prob[max_index]
        to_append[z+6]=temp[target].skew()
        to_append[z+7]=temp[target].kurt()

        z=z+8
    corr_ = calc_corr(temp[['target1', 'target2', 'target3', 'target4']])
    to_append[z:] = corr_  
    df_length = len(df)
    df.loc[df_length] = to_append
    return df
    

### CREATE DATAFRAME to store probabilities
column_names = ["playerId", "target1_mean","target1_median","target1_std","target1_min","target1_max","target1_prob","target1_skew","target1_kurt",
                "target2_mean","target2_median","target2_std","target2_min","target2_max","target2_prob","target2_skew","target2_kurt",
                "target3_mean","target3_median","target3_std","target3_min","target3_max","target3_prob","target3_skew","target3_kurt",
                "target4_mean","target4_median","target4_std","target4_min","target4_max","target4_prob","target4_skew","target4_kurt",
                'tgt1_2_corr', 'tgt1_3_corr', 'tgt2_3_corr', 'tgt1_4_corr', 'tgt2_4_corr', 'tgt3_4_corr']
player_target_probs = pd.DataFrame(columns = column_names)
    
for pid in tqdm(playerId_list):
    temp = targets_train[targets_train['playerId'] == pid]
    player_target_stats=calc_probs(pid,player_target_probs,temp)

100%|██████████| 2061/2061 [00:46<00:00, 44.67it/s]


In [17]:
player_target_stats

Unnamed: 0,playerId,target1_mean,target1_median,target1_std,target1_min,target1_max,target1_prob,target1_skew,target1_kurt,target2_mean,...,target4_max,target4_prob,target4_skew,target4_kurt,tgt1_2_corr,tgt1_3_corr,tgt2_3_corr,tgt1_4_corr,tgt2_4_corr,tgt3_4_corr
0,624415.0,1.193031,0.489252,1.578499,0.008327,7.267988,0.785380,2.208850,6.071897,5.398585,...,26.025186,10.464387,1.695464,3.831404,0.408492,-0.009458,-0.101143,0.227373,0.694767,-0.039737
1,656548.0,0.110672,0.001015,0.428984,0.000000,2.392265,0.000000,5.272732,28.385845,0.445853,...,0.423573,0.032978,1.294196,1.140084,0.566411,0.588373,0.527886,0.231260,0.525065,0.132559
2,400284.0,0.001807,0.001301,0.001164,0.000437,0.004770,0.001127,1.155729,0.582752,0.073071,...,8.909871,7.882136,0.388255,-0.703079,0.102451,0.429495,0.046450,0.006606,0.277479,-0.016683
3,606157.0,0.748247,0.024461,1.318711,0.002736,4.840508,0.548613,2.003263,3.235374,2.027612,...,3.931330,0.231750,2.053818,4.653597,0.641382,0.093253,-0.002014,0.404372,0.626199,0.249109
4,665620.0,0.483331,0.020388,1.415708,0.004932,6.322554,0.128698,3.507992,11.893073,1.913634,...,8.474798,0.314951,4.910653,25.637658,0.807310,0.810125,0.965737,0.778467,0.954381,0.959152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2056,621350.0,0.000020,0.000000,0.000051,0.000000,0.000175,0.000175,2.331868,3.822563,0.007298,...,0.206009,0.017662,0.992179,0.627281,0.278321,,,-0.041837,0.060094,
2057,665759.0,0.000095,0.000000,0.000174,0.000000,0.000701,0.000000,2.033176,3.899046,0.238824,...,0.332631,0.019864,2.366135,7.230272,0.676003,,,0.678376,0.848313,
2058,448178.0,0.000020,0.000000,0.000052,0.000000,0.000174,0.000000,2.345681,3.902834,0.005605,...,0.350299,0.188265,2.416382,8.149779,-0.053275,-0.134373,-0.260662,-0.106479,-0.172380,-0.078929
2059,488681.0,0.000074,0.000000,0.000142,0.000000,0.000506,0.000000,2.153297,3.865929,0.012644,...,0.453328,0.075379,1.055577,1.166366,-0.114731,0.364257,0.095866,0.216720,0.301329,0.054098


In [18]:
data_names=player_target_stats.columns.values.tolist()

In [19]:
players['DOY'] = pd.to_datetime(players['DOB'], format="%Y-%m-%d").dt.year
players['mlbDebutYear'] = pd.to_datetime(players['mlbDebutDate'], format="%Y-%m-%d").dt.year
players['DebutAge'] = players['mlbDebutYear'] - players['DOY']

In [20]:
# creat dataset
train = targets[targets_cols].merge(players[players_cols], on=['playerId'], how='left')
train = train.merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
train = train.merge(scores[scores_cols], on=['playerId', 'date'], how='left')
train = train.merge(team_scores, on=['teamId', 'date'], how='left')
train = train.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])


In [21]:
# label encoding
player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
birthCityn2num = {c: i for i, c in enumerate(train['birthCity'].unique())}
teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
status2num = {c: i for i, c in enumerate(train['status'].unique())}
train['label_playerId'] = train['playerId'].map(player2num)
train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
train['label_birthCity'] = train['birthCity'].map(birthCityn2num)
train['label_teamId'] = train['teamId'].map(teamid2num)
train['label_status'] = train['status'].map(status2num)

In [22]:
date_ = pd.to_datetime(train['date'], format="%Y%m%d")
train['annual_day'] = (date_ - pd.to_datetime(date_.dt.year, format="%Y")) /  timedelta(days=1)
train['week_day'] = date_.dt.weekday
train['month'] = date_.dt.month

In [23]:
train['gameday'] = ~train['battingOrder'].isna()*1

In [24]:
train.sort_values(by=['playerId','date'],inplace=True,ascending=True)

In [25]:
def count_consecutive_items_n_cols(df, col_name_list, output_col):
    cum_sum_list = [
        (df[col_name] != df[col_name].shift(1)).cumsum().tolist() for col_name in col_name_list
    ]
    df[output_col] = df.groupby(
        ["_".join(map(str, x)) for x in zip(*cum_sum_list)]
    ).cumcount() + 1
    return df

In [26]:
train=count_consecutive_items_n_cols(train,['playerId','gameday'],'daysSinceLastGame')
train.loc[train['gameday']==1,'daysSinceLastGame']=0

In [27]:
train_game = train[train['gameday']==1]
train_last_game = train_game[~train_game.duplicated(subset='playerId', keep='last')][['playerId', 'date']]
train_last_game.columns = ['playerId', 'lastdate']
train_player_unique = pd.DataFrame(train['playerId'].unique(), columns=['playerId'])
train_last_game = pd.merge(train_player_unique, train_last_game, on=['playerId'], how='left' )
train_last_game = train_last_game.fillna(20171231)

In [28]:
train_last_game.sort_values('lastdate')

Unnamed: 0,playerId,lastdate
856,593590,20171231.0
1995,670462,20171231.0
2000,670764,20171231.0
1765,661269,20171231.0
1932,667674,20171231.0
...,...,...
473,542932,20210430.0
1480,641856,20210430.0
1481,641857,20210430.0
483,543037,20210430.0


In [29]:
def extract_season(date_raw, season_start_end):
    idxes = 0
    for raw in season_start_end.iloc():
        idx_ = ((date_raw >= raw.iloc[0]) & (date_raw <= raw.iloc[1])) * 1
        idxes += idx_
    return idxes

In [30]:
## special_daysに2たてんのあやしいけど，軽く見た感じ全部シーズン中っぽいからいいかなと．
on_preseason_idxes = extract_season(train['date'], seasons[['preSeasonStartDate', 'preSeasonEndDate']])
on_season_idxes = extract_season(train['date'], seasons[['regularSeasonStartDate', 'regularSeasonEndDate']]) * 2
on_postseason_idxes = extract_season(train['date'], seasons[['postSeasonStartDate', 'postSeasonEndDate']]) * 3

special_days = seasons['lastDate1stHalf'].to_list() + seasons['allStarDate'].to_list() + seasons['firstDate2ndHalf'].to_list()
special_idxes = 0
for day in special_days:
    special_idxes += (train['date'] == day) * 2
on_total_season_idxes = on_preseason_idxes + on_season_idxes + on_postseason_idxes + special_idxes

train['season_info'] = on_total_season_idxes

In [31]:
## only on season
on_whole_idxes = extract_season(train['date'], seasons[['seasonStartDate', 'seasonEndDate']])
train = train[on_whole_idxes == 1].reset_index(drop=True)

In [32]:
train

Unnamed: 0,playerId,target1,target2,target3,target4,date,primaryPositionName,birthCity,DOY,mlbDebutYear,...,label_primaryPositionName,label_birthCity,label_teamId,label_status,annual_day,week_day,month,gameday,daysSinceLastGame,season_info
0,112526,0.031761,2.731418,0.388556,6.349412,20180329,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,87.0,3,3,0,88,2
1,112526,0.025906,4.622162,0.408017,11.508375,20180330,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,88.0,4,3,0,89,2
2,112526,0.053185,4.767842,0.275408,14.600851,20180331,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,89.0,5,3,0,90,2
3,112526,0.771100,63.601677,7.566316,100.000000,20180401,Pitcher,Altamira,1973,1997.0,...,0,588,1,3,90.0,6,4,0,91,2
4,112526,5.957846,22.427930,33.900803,38.857939,20180402,Pitcher,Altamira,1973,1997.0,...,0,588,7,0,91.0,0,4,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234534,685503,0.044617,1.224728,0.009437,0.737463,20210426,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,115.0,0,4,0,1,2
1234535,685503,0.019123,1.178880,0.013161,0.790301,20210427,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,116.0,1,4,0,2,2
1234536,685503,0.015799,4.323489,0.002350,0.970273,20210428,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,117.0,2,4,0,3,2
1234537,685503,0.018770,31.946021,0.305491,5.938273,20210429,Pitcher,Hiroshima,1992,2021.0,...,0,902,7,0,118.0,3,4,0,4,2


In [33]:
# def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
#     oof_pred = np.zeros(len(y_valid), dtype=np.float32)
#     model = lgbm.LGBMRegressor(**params)
#     model.fit(x_train, y_train, 
#         eval_set=[(X_train, y_train), (x_valid, y_valid)],  
#         early_stopping_rounds=verbose, 
#         verbose=verbose)
#     oof_pred = model.predict(x_valid)
#     oof_pred = np.clip(oof_pred, 0, 100)
#     score = mean_absolute_error(oof_pred, y_valid)
#     print('mae:', score)
#     return oof_pred, model, score


In [34]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, verbose=100):
    def opt(trial):
        params = {
                'random_state': SEED,
                'objective':'mae',
                'n_estimators': 10000,
                'learning_rate': 0.1,
                'max_depth': trial.suggest_int('max_depth', 1, 20),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
                'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
                'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e3),
                'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e3),
                'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
                'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
        }

        model_opt = lgbm.LGBMRegressor(**params)

        model_opt.fit(x_train, y_train, 
            eval_set=[(x_train, y_train), (x_valid, y_valid)],  
            early_stopping_rounds=verbose, 
            verbose=verbose)
        oof_pred = model_opt.predict(x_valid)
        oof_pred = np.clip(oof_pred, 0, 100)
        score = mean_absolute_error(oof_pred, y_valid)
        return -score
    return opt

In [35]:
train_X = train
train_y = train[['target1', 'target2', 'target3', 'target4']]
tr_idxs = []
val_idxs = []

In [36]:
# tr_idx = (train['date'].astype(int) < 20200801)
# val_idx = (train['date'].astype(int) >= 20200801) & (train['date'].astype(int) < 20200901)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

# tr_idx = (train['date'].astype(int) < 20200901)
# val_idx = (train['date'].astype(int) >= 20200901) & (train['date'].astype(int) < 20201001)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

# tr_idx = (train['date'].astype(int) < 20201001)
# val_idx = (train['date'].astype(int) >= 20201001) & (train['date'].astype(int) < 20201028)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

# tr_idx = (train['date'].astype(int) < 20210228)
# val_idx = (train['date'].astype(int) >= 20210228) & (train['date'].astype(int) < 20210401)
# tr_idxs.append(tr_idx)
# val_idxs.append(val_idx)

tr_idx = (train['date'].astype(int) < 20210401)
val_idx = ~tr_idx
tr_idxs.append(tr_idx)
val_idxs.append(val_idx)

In [37]:
idx = 0

tr_idx = tr_idxs[idx]
val_idx = val_idxs[idx]

x_train = train_X.loc[tr_idx].reset_index(drop=True)
y_train = train_y.loc[tr_idx].reset_index(drop=True)
x_valid = train_X.loc[val_idx].reset_index(drop=True)
y_valid = train_y.loc[val_idx].reset_index(drop=True)

In [None]:
study1 = optuna.create_study(direction='maximize')
study1.optimize(fit_lgbm(x_train[feature_cols1], y_train['target1'], x_valid[feature_cols1], y_valid['target1']), n_trials=100)

print('Number of finished trials:', len(study1.trials))
print('Best trial:', study1.best_trial.params)

[32m[I 2021-07-02 02:28:30,466][0m A new study created in memory with name: no-name-4252d832-6ecb-42b0-87c4-7a3e927ef355[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.63064	valid_1's l1: 0.718319
[200]	training's l1: 0.630082	valid_1's l1: 0.717152
[300]	training's l1: 0.629556	valid_1's l1: 0.715968
[400]	training's l1: 0.629076	valid_1's l1: 0.714722
[500]	training's l1: 0.62873	valid_1's l1: 0.714088
[600]	training's l1: 0.628314	valid_1's l1: 0.712826
[700]	training's l1: 0.62705	valid_1's l1: 0.709616
[800]	training's l1: 0.626287	valid_1's l1: 0.708517
[900]	training's l1: 0.62611	valid_1's l1: 0.708215
[1000]	training's l1: 0.625363	valid_1's l1: 0.706605
[1100]	training's l1: 0.624947	valid_1's l1: 0.705842
[1200]	training's l1: 0.624071	valid_1's l1: 0.703786
[1300]	training's l1: 0.623501	valid_1's l1: 0.701746
[1400]	training's l1: 0.621304	valid_1's l1: 0.697064
[1500]	training's l1: 0.621059	valid_1's l1: 0.696597
[1600]	training's l1: 0.619543	valid_1's l1: 0.692854
[1700]	training's l1: 0.619337	valid_1's l1: 0.692219
[1800]	training's l1: 0.618419	va

[32m[I 2021-07-02 02:30:03,484][0m Trial 0 finished with value: -0.6743400189870076 and parameters: {'max_depth': 15, 'min_child_weight': 9, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_lambda': 477.35679320158636, 'reg_alpha': 0.0496352565411633, 'feature_fraction': 0.911616167791991, 'bagging_fraction': 0.5176471510577777, 'bagging_freq': 6}. Best is trial 0 with value: -0.6743400189870076.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.627416	valid_1's l1: 0.712672
[200]	training's l1: 0.626436	valid_1's l1: 0.71134
[300]	training's l1: 0.62643	valid_1's l1: 0.711332
[400]	training's l1: 0.626204	valid_1's l1: 0.710977
[500]	training's l1: 0.624905	valid_1's l1: 0.707735
[600]	training's l1: 0.624257	valid_1's l1: 0.70614
[700]	training's l1: 0.623018	valid_1's l1: 0.703949
[800]	training's l1: 0.622894	valid_1's l1: 0.703846
[900]	training's l1: 0.622181	valid_1's l1: 0.702216
[1000]	training's l1: 0.619906	valid_1's l1: 0.695334
[1100]	training's l1: 0.618787	valid_1's l1: 0.694051
[1200]	training's l1: 0.618677	valid_1's l1: 0.693928
[1300]	training's l1: 0.617135	valid_1's l1: 0.690434
[1400]	training's l1: 0.615174	valid_1's l1: 0.685647
[1500]	training's l1: 0.615113	valid_1's l1: 0.68566
[1600]	training's l1: 0.613929	valid_1's l1: 0.683248
[1700]	training's l1: 0.612584	valid_1's l1: 0.680901
[1800]	training's l1: 0.612443	va

[32m[I 2021-07-02 02:32:15,023][0m Trial 1 finished with value: -0.6732241222775891 and parameters: {'max_depth': 9, 'min_child_weight': 8, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_lambda': 36.92312255121764, 'reg_alpha': 0.008548531707262996, 'feature_fraction': 0.4983101890857393, 'bagging_fraction': 0.99019312778526, 'bagging_freq': 4}. Best is trial 1 with value: -0.6732241222775891.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.68977	valid_1's l1: 0.800646
[200]	training's l1: 0.689557	valid_1's l1: 0.800448


[32m[I 2021-07-02 02:32:24,511][0m Trial 2 finished with value: -0.8004466675116623 and parameters: {'max_depth': 1, 'min_child_weight': 19, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_lambda': 841.1295399363831, 'reg_alpha': 34.80013149185787, 'feature_fraction': 0.825427708672593, 'bagging_fraction': 0.9379902519095581, 'bagging_freq': 3}. Best is trial 1 with value: -0.6732241222775891.[0m


Early stopping, best iteration is:
[136]	training's l1: 0.689559	valid_1's l1: 0.800447
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.629427	valid_1's l1: 0.715824
[200]	training's l1: 0.627709	valid_1's l1: 0.712038
[300]	training's l1: 0.626873	valid_1's l1: 0.710727
[400]	training's l1: 0.625601	valid_1's l1: 0.708321
[500]	training's l1: 0.624352	valid_1's l1: 0.705966
[600]	training's l1: 0.62403	valid_1's l1: 0.705109
[700]	training's l1: 0.622842	valid_1's l1: 0.702601
[800]	training's l1: 0.622168	valid_1's l1: 0.701278
[900]	training's l1: 0.621588	valid_1's l1: 0.700146
[1000]	training's l1: 0.620295	valid_1's l1: 0.696708
[1100]	training's l1: 0.618409	valid_1's l1: 0.692355
[1200]	training's l1: 0.61803	valid_1's l1: 0.691943
[1300]	training's l1: 0.617768	valid_1's l1: 0.691489
[1400]	training's l1: 0.617324	valid_1's l1: 0.690089
[1500]	training's l1: 0.615081	valid_1's l1: 0.685618
[1600]	training's l1: 0.614629	valid_1's l1: 0.6850

[32m[I 2021-07-02 02:33:50,073][0m Trial 3 finished with value: -0.6772126421315225 and parameters: {'max_depth': 15, 'min_child_weight': 10, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_lambda': 0.1789987224566226, 'reg_alpha': 10.977296224427047, 'feature_fraction': 0.9417886834343793, 'bagging_fraction': 0.8494563375017588, 'bagging_freq': 8}. Best is trial 1 with value: -0.6732241222775891.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.627682	valid_1's l1: 0.713535
[200]	training's l1: 0.626876	valid_1's l1: 0.712253
[300]	training's l1: 0.626321	valid_1's l1: 0.711046
[400]	training's l1: 0.625977	valid_1's l1: 0.710445
[500]	training's l1: 0.625357	valid_1's l1: 0.709409
[600]	training's l1: 0.62466	valid_1's l1: 0.707772
[700]	training's l1: 0.624032	valid_1's l1: 0.706583
[800]	training's l1: 0.621684	valid_1's l1: 0.700777
[900]	training's l1: 0.620805	valid_1's l1: 0.698736
[1000]	training's l1: 0.620448	valid_1's l1: 0.697954
[1100]	training's l1: 0.618913	valid_1's l1: 0.69436
[1200]	training's l1: 0.617588	valid_1's l1: 0.691752
[1300]	training's l1: 0.617246	valid_1's l1: 0.690947
[1400]	training's l1: 0.616855	valid_1's l1: 0.690298
[1500]	training's l1: 0.615994	valid_1's l1: 0.688353
[1600]	training's l1: 0.61598	valid_1's l1: 0.688359
Early stopping, best iteration is:
[1500]	training's l1: 0.615994	valid_1's l1: 0.6883

[32m[I 2021-07-02 02:35:02,040][0m Trial 4 finished with value: -0.6882989711430071 and parameters: {'max_depth': 17, 'min_child_weight': 9, 'subsample': 0.9, 'colsample_bytree': 0.8, 'reg_lambda': 169.79626690924948, 'reg_alpha': 3.757039264226494, 'feature_fraction': 0.512835981522278, 'bagging_fraction': 0.8225332591671457, 'bagging_freq': 3}. Best is trial 1 with value: -0.6732241222775891.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.627645	valid_1's l1: 0.713864
[200]	training's l1: 0.626799	valid_1's l1: 0.712288
[300]	training's l1: 0.625382	valid_1's l1: 0.710006
[400]	training's l1: 0.624351	valid_1's l1: 0.707716
[500]	training's l1: 0.621888	valid_1's l1: 0.702157
[600]	training's l1: 0.621369	valid_1's l1: 0.701152
[700]	training's l1: 0.620533	valid_1's l1: 0.699392
[800]	training's l1: 0.618691	valid_1's l1: 0.695707
[900]	training's l1: 0.617405	valid_1's l1: 0.692754
[1000]	training's l1: 0.617113	valid_1's l1: 0.692208
Early stopping, best iteration is:
[911]	training's l1: 0.617182	valid_1's l1: 0.6922


[32m[I 2021-07-02 02:35:50,968][0m Trial 5 finished with value: -0.6921806299106705 and parameters: {'max_depth': 16, 'min_child_weight': 9, 'subsample': 0.5, 'colsample_bytree': 0.9, 'reg_lambda': 0.0019616497729778005, 'reg_alpha': 0.0010228898756659201, 'feature_fraction': 0.43936176812268196, 'bagging_fraction': 0.8458134286441017, 'bagging_freq': 2}. Best is trial 1 with value: -0.6732241222775891.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.613879	valid_1's l1: 0.689737
[200]	training's l1: 0.609426	valid_1's l1: 0.681033
[300]	training's l1: 0.608373	valid_1's l1: 0.679391


[32m[I 2021-07-02 02:36:04,661][0m Trial 6 finished with value: -0.6793678965998671 and parameters: {'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_lambda': 7.554044196726264, 'reg_alpha': 0.21827888765578224, 'feature_fraction': 0.6548129557604225, 'bagging_fraction': 0.5787341939423877, 'bagging_freq': 4}. Best is trial 1 with value: -0.6732241222775891.[0m


[400]	training's l1: 0.608355	valid_1's l1: 0.67941
Early stopping, best iteration is:
[300]	training's l1: 0.608373	valid_1's l1: 0.679391
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.629592	valid_1's l1: 0.717905
[200]	training's l1: 0.627894	valid_1's l1: 0.714997
[300]	training's l1: 0.626243	valid_1's l1: 0.710931


[32m[I 2021-07-02 02:36:17,451][0m Trial 7 finished with value: -0.7109168654809476 and parameters: {'max_depth': 20, 'min_child_weight': 16, 'subsample': 0.6, 'colsample_bytree': 0.8, 'reg_lambda': 0.282727938064432, 'reg_alpha': 19.625433168588472, 'feature_fraction': 0.857699679395576, 'bagging_fraction': 0.6695958114186136, 'bagging_freq': 5}. Best is trial 1 with value: -0.6732241222775891.[0m


Early stopping, best iteration is:
[241]	training's l1: 0.626251	valid_1's l1: 0.710923
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.629873	valid_1's l1: 0.717337
[200]	training's l1: 0.626594	valid_1's l1: 0.711843
[300]	training's l1: 0.625254	valid_1's l1: 0.709212
[400]	training's l1: 0.62524	valid_1's l1: 0.709137
[500]	training's l1: 0.625066	valid_1's l1: 0.708818
[600]	training's l1: 0.62482	valid_1's l1: 0.708451
[700]	training's l1: 0.622985	valid_1's l1: 0.704709
[800]	training's l1: 0.62256	valid_1's l1: 0.703548
[900]	training's l1: 0.620447	valid_1's l1: 0.699071
[1000]	training's l1: 0.620062	valid_1's l1: 0.69855
[1100]	training's l1: 0.616984	valid_1's l1: 0.691688
[1200]	training's l1: 0.614816	valid_1's l1: 0.686866
[1300]	training's l1: 0.614557	valid_1's l1: 0.686542
[1400]	training's l1: 0.613705	valid_1's l1: 0.685067
[1500]	training's l1: 0.611905	valid_1's l1: 0.682156
[1600]	training's l1: 0.611352	valid_1's l1: 0.681664

[32m[I 2021-07-02 02:37:34,308][0m Trial 8 finished with value: -0.6676589325408833 and parameters: {'max_depth': 15, 'min_child_weight': 8, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_lambda': 6.891885288333164, 'reg_alpha': 0.7470737264497586, 'feature_fraction': 0.8035365872809268, 'bagging_fraction': 0.5742210995535293, 'bagging_freq': 5}. Best is trial 8 with value: -0.6676589325408833.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.620568	valid_1's l1: 0.70199
[200]	training's l1: 0.620269	valid_1's l1: 0.701474


[32m[I 2021-07-02 02:37:48,289][0m Trial 9 finished with value: -0.7014627827555353 and parameters: {'max_depth': 6, 'min_child_weight': 20, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_lambda': 0.009678671431832134, 'reg_alpha': 0.8564403634745802, 'feature_fraction': 0.5931960444231068, 'bagging_fraction': 0.818150635860305, 'bagging_freq': 7}. Best is trial 8 with value: -0.6676589325408833.[0m


Early stopping, best iteration is:
[199]	training's l1: 0.620269	valid_1's l1: 0.701474
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.628535	valid_1's l1: 0.715866
[200]	training's l1: 0.625776	valid_1's l1: 0.710743
[300]	training's l1: 0.625234	valid_1's l1: 0.709665
[400]	training's l1: 0.624839	valid_1's l1: 0.70855
[500]	training's l1: 0.624487	valid_1's l1: 0.707959
[600]	training's l1: 0.62316	valid_1's l1: 0.70541
[700]	training's l1: 0.621205	valid_1's l1: 0.700989
[800]	training's l1: 0.620429	valid_1's l1: 0.698783
[900]	training's l1: 0.619266	valid_1's l1: 0.696188
[1000]	training's l1: 0.618817	valid_1's l1: 0.694924
[1100]	training's l1: 0.617642	valid_1's l1: 0.692011
[1200]	training's l1: 0.615556	valid_1's l1: 0.686618
[1300]	training's l1: 0.615385	valid_1's l1: 0.686377
[1400]	training's l1: 0.613969	valid_1's l1: 0.684719
[1500]	training's l1: 0.61377	valid_1's l1: 0.684415
[1600]	training's l1: 0.612411	valid_1's l1: 0.681889

[32m[I 2021-07-02 02:38:50,481][0m Trial 10 finished with value: -0.6681227049315465 and parameters: {'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.5, 'colsample_bytree': 0.7, 'reg_lambda': 3.262941677893662, 'reg_alpha': 0.01410372377536586, 'feature_fraction': 0.7517588873594511, 'bagging_fraction': 0.434417589024782, 'bagging_freq': 10}. Best is trial 8 with value: -0.6676589325408833.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.628866	valid_1's l1: 0.715119
[200]	training's l1: 0.625396	valid_1's l1: 0.708971
[300]	training's l1: 0.624824	valid_1's l1: 0.707824
[400]	training's l1: 0.624204	valid_1's l1: 0.706757
[500]	training's l1: 0.623887	valid_1's l1: 0.706004
[600]	training's l1: 0.623722	valid_1's l1: 0.705731
[700]	training's l1: 0.622121	valid_1's l1: 0.703504
[800]	training's l1: 0.621848	valid_1's l1: 0.702529
[900]	training's l1: 0.619494	valid_1's l1: 0.697421
[1000]	training's l1: 0.618502	valid_1's l1: 0.695093
[1100]	training's l1: 0.618334	valid_1's l1: 0.694928
[1200]	training's l1: 0.615449	valid_1's l1: 0.687775
[1300]	training's l1: 0.614377	valid_1's l1: 0.685916
[1400]	training's l1: 0.613608	valid_1's l1: 0.684377
[1500]	training's l1: 0.613019	valid_1's l1: 0.683824
[1600]	training's l1: 0.612436	valid_1's l1: 0.68338
[1700]	training's l1: 0.611535	valid_1's l1: 0.681166
[1800]	training's l1: 0.610987

[32m[I 2021-07-02 02:39:52,314][0m Trial 11 finished with value: -0.6657013826674919 and parameters: {'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.5, 'colsample_bytree': 0.7, 'reg_lambda': 4.151155087167255, 'reg_alpha': 0.0023910194071052504, 'feature_fraction': 0.7519515015307318, 'bagging_fraction': 0.41916309454116524, 'bagging_freq': 9}. Best is trial 11 with value: -0.6657013826674919.[0m


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.633153	valid_1's l1: 0.722307
[200]	training's l1: 0.631649	valid_1's l1: 0.718379
[300]	training's l1: 0.631643	valid_1's l1: 0.718376
[400]	training's l1: 0.631452	valid_1's l1: 0.717709
[500]	training's l1: 0.631238	valid_1's l1: 0.717339
[600]	training's l1: 0.631091	valid_1's l1: 0.717232
[700]	training's l1: 0.631005	valid_1's l1: 0.717145
[800]	training's l1: 0.630489	valid_1's l1: 0.716009
[900]	training's l1: 0.630421	valid_1's l1: 0.715881
[1000]	training's l1: 0.630287	valid_1's l1: 0.715554
[1100]	training's l1: 0.629396	valid_1's l1: 0.713383
[1200]	training's l1: 0.628919	valid_1's l1: 0.712215
[1300]	training's l1: 0.628717	valid_1's l1: 0.711598
[1400]	training's l1: 0.628039	valid_1's l1: 0.710437
[1500]	training's l1: 0.627871	valid_1's l1: 0.710242
[1600]	training's l1: 0.627332	valid_1's l1: 0.708872
[1700]	training's l1: 0.626678	valid_1's l1: 0.70693
[1800]	training's l1: 0.626497

In [None]:
study2 = optuna.create_study(direction='maximize')
study2.optimize(fit_lgbm(x_train[feature_cols2], y_train['target2'], x_valid[feature_cols2], y_valid['target2']), n_trials=100)

print('Number of finished trials:', len(study2.trials))
print('Best trial:', study2.best_trial.params)

In [None]:
study3 = optuna.create_study(direction='maximize')
study3.optimize(fit_lgbm(x_train[feature_cols3], y_train['target3'], x_valid[feature_cols3], y_valid['target3']), n_trials=100)

print('Number of finished trials:', len(study3.trials))
print('Best trial:', study3.best_trial.params)

In [None]:
study4 = optuna.create_study(direction='maximize')
study4.optimize(fit_lgbm(x_train[feature_cols4], y_train['target4'], x_valid[feature_cols4], y_valid['target4']), n_trials=100)

print('Number of finished trials:', len(study4.trials))
print('Best trial:', study4.best_trial.params)

In [46]:
study1.best_trial.params

{'max_depth': 15,
 'min_child_weight': 4,
 'subsample': 0.8,
 'colsample_bytree': 0.9,
 'reg_lambda': 0.08816445137092141,
 'reg_alpha': 0.2934263924023268,
 'feature_fraction': 0.434947226813941,
 'bagging_fraction': 0.45804964684844063,
 'bagging_freq': 2}

In [47]:
study2.best_trial.params

{'max_depth': 11,
 'min_child_weight': 13,
 'subsample': 0.8,
 'colsample_bytree': 0.7,
 'reg_lambda': 1.9419823934346758,
 'reg_alpha': 2.7037110710804404,
 'feature_fraction': 0.822626250898438,
 'bagging_fraction': 0.8245871025164341,
 'bagging_freq': 5}

In [48]:
study3.best_trial.params

{'max_depth': 5,
 'min_child_weight': 4,
 'subsample': 0.5,
 'colsample_bytree': 0.7,
 'reg_lambda': 0.028252289984728996,
 'reg_alpha': 0.0023278552931023455,
 'feature_fraction': 0.6504234701019251,
 'bagging_fraction': 0.9640411868574804,
 'bagging_freq': 3}

In [49]:
study4.best_trial.params

{'max_depth': 14,
 'min_child_weight': 4,
 'subsample': 0.9,
 'colsample_bytree': 0.8,
 'reg_lambda': 102.09294512298051,
 'reg_alpha': 0.664112467203109,
 'feature_fraction': 0.6207866754007707,
 'bagging_fraction': 0.813578665176614,
 'bagging_freq': 2}