## About Dataset

In [None]:
# Folder name under ../input/ used by the (commented-out) model-loading cells.
modelsver="july-modelsver2"
# Version tag embedded in the pickle file names of the saved models.
version = "july_ver2"

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column minimum and maximum (limits inclusive).  Float
    columns are cast to float32 when all values lie strictly inside the
    float32 range and to float64 otherwise; float16 is never produced.
    Columns whose dtype is not listed in ``numerics`` are left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print the final memory usage and the relative
            reduction.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # An empty column has NaN min/max, fails every test and keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extremes fail the comparison and fall back to float64.
            if c_min > float32_info.min and c_max < float32_info.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame does not produce a nan percentage.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
def unpack_json(df,var):
    """Flatten a column of JSON-encoded record lists into one row per record.

    Every non-null cell of ``df[var]`` is parsed with ``json.loads`` and is
    expected to be a list of dicts.  Each dict becomes one output row, and the
    ``date`` value of the source row is repeated for every record it yields.

    Args:
        df: DataFrame that has a ``date`` column and the column named ``var``.
        var: Name of the column holding the JSON strings.

    Returns:
        A new DataFrame with one column per key seen in the records (in order
        of first appearance) followed by ``date``.  A key missing from a
        record is filled with ``None``.  If there is nothing to unpack, the
        result is an empty frame that only has a ``date`` column.
    """
    name = var
    tmp = df[['date']+[name]].dropna(how='any')

    var_val_dict = {}
    date_list = []

    for d, raw in zip(tmp['date'].values, tmp[name].values):  # one source row at a time
        for record in json.loads(raw):  # one dict per element of the cell's list
            rows_so_far = len(date_list)
            date_list.append(d)
            for key, value in record.items():
                if key not in var_val_dict:
                    # Key first seen now: back-fill the rows emitted before it.
                    var_val_dict[key] = [None] * rows_so_far
                var_val_dict[key].append(value)
            if len(record) != len(var_val_dict):
                # This record lacks some known keys; pad them so every column
                # keeps the same length.
                for values in var_val_dict.values():
                    if len(values) <= rows_so_far:
                        values.append(None)

    output_df = pd.DataFrame(var_val_dict)
    # Assigned last, so it replaces any 'date' key coming from the records.
    output_df['date'] = date_list
    gc.collect()

    return output_df

## Training

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
from mlb.competition import make_env
from datetime import timedelta
import os
import gc
import pickle
import json

# Show up to 150 columns and 50 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 50)

In [None]:
# Competition data folder; teams.csv and seasons.csv are read from here.
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
# NOTE(review): TRAIN_DIR is not referenced in the visible part of the notebook — confirm it is still needed.
TRAIN_DIR = Path('../input/train-update')

In [None]:
# Raw training table: the nested columns hold JSON strings that unpack_json
# flattens into one frame per column.
train_tmp = pd.read_csv('../input/mlb-player-digital-engagement-forecasting/train_updated.csv')
targets, rosters, scores_tmp, games, teamBoxScores = (
    unpack_json(train_tmp, _nested_column)
    for _nested_column in (
        'nextDayPlayerEngagement',
        'rosters',
        'playerBoxScores',
        'games',
        'teamBoxScores',
    )
)

# Collapse the box scores to a single row per player and date.
scores = scores_tmp.groupby(['playerId', 'date']).sum().reset_index()

# Static lookup tables.
players = pd.read_csv("../input/playerscsv/NEWplayers.csv")
teams = pd.read_csv(BASE_DIR / 'teams.csv')
seasons = pd.read_csv(BASE_DIR / 'seasons.csv')

In [None]:
# The raw table has been unpacked into separate frames; drop the reference.
del train_tmp

In [None]:
# Columns taken from each source frame when the training table is assembled.
# targets: player key, the four engagement targets and the date.
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
# players: position plus the debut and birth dates.
players_cols = ['playerId', 'primaryPositionName', "mlbDebutDate","DOB"]
# rosters: team and roster status of the player on that date.
rosters_cols = ['playerId', 'teamId', 'status', 'date']
# scores: game key, player key, batting / pitching / fielding box-score
# statistics and the date.
scores_cols = ["gamePk", 'playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances', 'date']

# Model inputs, assembled family by family. The order of the final list is
# what the models see, so the families are concatenated in a fixed order.
_target_names = ('target1', 'target2', 'target3', 'target4')

# Label-encoded identifiers followed by same-day box-score statistics.
_id_and_boxscore_cols = [
    'label_playerId', 'label_primaryPositionName', 'label_teamId', 'label_status',
    'battingOrder', 'gamesPlayedBatting', 'runsScored', 'homeRuns', 'baseOnBalls',
    'hits', 'plateAppearances', 'totalBases', 'rbi', 'sacFlies',
    'gamesPlayedPitching', 'gamesStartedPitching', 'winsPitching', 'lossesPitching',
    'runsPitching', 'strikeOutsPitching', 'inningsPitched', 'saveOpportunities',
    'earnedRuns', 'battersFaced', 'pitchesThrown', 'balls', 'strikes',
    'rbiPitching', 'gamesFinishedPitching', 'saves', 'holds',
    'assists', 'putOuts', 'chances',
]

# Summary statistics of each target (all statistics of target1, then target2, ...).
_target_stat_cols = [
    f'{_t}_{_stat}'
    for _t in _target_names
    for _stat in ('mean', 'median', 'std', 'min', 'max', 'prob')
]

# Lags 1..15 of gamesStartedPitching; the suffixes are the names used when
# the lag columns are created.
_started_lag_cols = [
    f'gamesStartedPitching_shift_{_word}'
    for _word in ('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
                  'nine', 'ten', 'ele', 'twe', 'thirteen', 'fourteen', 'fifteen')
]

# Lags 3..7 of every target (all four targets of one lag, then the next lag).
_target_lag_cols = [
    f'{_t}_shift_{_word}'
    for _word in ('three', 'four', 'five', 'six', 'seven')
    for _t in _target_names
]

# Batter contribution and its rolling mean / std.
_batter_cols = ['batter_contribution'] + [
    f'batter_contribution_rolling_{_stat}_t{_window}'
    for _stat in ('mean', 'std')
    for _window in (3, 5, 7, 10, 15)
]

# Pitching game score and its rolling mean / std.
_pitcher_cols = ['pitchingGameScore'] + [
    f'pitchingGameScore_rolling_{_stat}_t{_window}'
    for _stat in ('mean', 'std')
    for _window in (7, 14, 28)
]

# Schedule, starter and birthday flags.
# Disabled features: 'numberOfFollowers', 'player_age', 'mlbDebutDate_passed'.
_calendar_cols = [
    'isgame_schedule_tom',
    'isgame_schedule',
    'starter_or_not',
    'HappyBirthDay',
    'stater_or_not_mean_t7',
    'stater_or_not_mean_t14',
    'stater_or_not_mean_t28',
]

feature_cols = (
    _id_and_boxscore_cols
    + _target_stat_cols
    + _started_lag_cols
    + _target_lag_cols
    + _batter_cols
    + _pitcher_cols
    + _calendar_cols
)

# Second feature set: the same columns plus target1 itself.
feature_cols2 = feature_cols + ["target1"]

In [None]:
# Per-player target statistics read from a separate input dataset; joined to
# the training table on playerId.
player_target_stats = pd.read_csv("../input/player-target-stats/player_target_stats.csv")
data_names = list(player_target_stats.columns)
# data_names

# trainのデータ作成

In [None]:
# Create the dataset: start from the targets (one row per player and date)
# and left-join player, roster and box-score information onto it.
train = targets[targets_cols].merge(players[players_cols], on=['playerId'], how='left')
print("0",train.shape)
for _daily_frame in (rosters[rosters_cols], scores[scores_cols]):
    train = train.merge(_daily_frame, on=['playerId', 'date'], how='left')

# Inner join: players without a row in player_target_stats are dropped.
train = train.merge(
    player_target_stats,
    how='inner',
    left_on=["playerId"],
    right_on=["playerId"],
)
print("1",train.shape)

# Team attributes, matched through the roster's teamId.
train = train.merge(teams, how='left', left_on=["teamId"], right_on=["id"])

# Integer YYYYMM derived from the YYYYMMDD date.
_train_stamp = pd.to_datetime(train['date'], format="%Y%m%d")
train["year_month"] = _train_stamp.map(lambda ts: f"{ts.year}{ts.month:02d}").astype(int)
print("2",train.shape)

# Has no effect unless one of the merges produced a suffixed 'date_x' column.
train = train.rename(columns={"date_x": "date"})

In [None]:
#https://blog.amedama.jp/entry/2017/10/10/135331
#https://ohke.hateblo.jp/entry/2020/08/15/150000
def _yyyymmdd_int(stamps):
    """Render a Series of timestamps as YYYYMMDD integers."""
    return stamps.map(lambda ts: f"{ts.year}{ts.month:02d}{ts.day:02d}").astype(int)


games_schdule = pd.read_parquet('../input/mlb-schedule/schedule.parquet')

# Game date as YYYYMMDD, and the day before it (used to flag "game tomorrow").
games_schdule["date"] = _yyyymmdd_int(
    pd.to_datetime(games_schdule['date'], format="%Y-%m-%d")
)
games_schdule["before_date"] = _yyyymmdd_int(
    pd.to_datetime(games_schdule['date'], format="%Y%m%d") + timedelta(days=-1)
)
games_schdule["isgame_schedule"] = 1

# Schedule team code -> code it is matched against in the 'abbreviation'
# column of the training table.
temas_dict = {
    'CIN': 'CIN', 'BOS': 'BOS', 'TEX': 'TEX', 'SEA': 'SEA', 'CHC': 'CHC',
    'ARI': 'ARI', 'STL': 'STL', 'PHI': 'PHI', 'COL': 'COL', 'TOR': 'TOR',
    'CLE': 'CLE', 'LAA': 'LAA', 'SFG': 'SF', 'ATL': 'ATL', 'KCR': 'KC',
    'BAL': 'BAL', 'MIN': 'MIN', 'HOU': 'HOU', 'SDP': 'SD', 'MIA': 'MIA',
    'MIL': 'MIL', 'TBR': 'TB', 'CHW': 'CWS', 'WSN': 'WSH', 'NYM': 'NYM',
    'NYY': 'NYY', 'LAD': 'LAD', 'OAK': 'OAK', 'DET': 'DET', 'PIT': 'PIT',
}
games_schdule["Tm"] = games_schdule["Tm"].map(temas_dict)

# Keep one row per team and date.
games_schdule = games_schdule.drop_duplicates(subset=["date", "Tm"])

In [None]:
# Flag "team plays tomorrow": match the row's date against the day before
# each scheduled game.
_games_tomorrow = games_schdule[['Tm', "before_date", "isgame_schedule"]]
train = train.merge(
    _games_tomorrow,
    how='left',
    left_on=['date', 'abbreviation'],
    right_on=['before_date', 'Tm'],
)
print("4",train.shape)
train = train.rename(columns={"isgame_schedule": "isgame_schedule_tom"})
train = train.drop(columns=["Tm", "before_date"])

# Flag "team plays today": match on the game date itself.
_games_today = games_schdule[['Tm', "date", "isgame_schedule"]]
train = train.merge(
    _games_today,
    how='left',
    left_on=['date', 'abbreviation'],
    right_on=['date', 'Tm'],
)
print("5",train.shape)

# Rows without a schedule match get 0 instead of NaN.
for _flag in ("isgame_schedule", "isgame_schedule_tom"):
    train[_flag] = train[_flag].fillna(0)

In [None]:
# Index labels of the rows whose date lies strictly inside one of the
# per-year date windows below (both window ends are excluded).
_season_windows = (
    ("2017-04-02", "2017-10-01"),
    ("2018-03-29", "2018-10-01"),
    ("2019-03-29", "2019-09-29"),
    ("2020-07-23", "2020-09-27"),
    ("2021-04-01", "2021-10-03"),
)
_row_dates = pd.to_datetime(train['date'], format='%Y%m%d')
_in_any_window = np.logical_or.reduce([
    ((_row_dates > _first) & (_row_dates < _last)).to_numpy()
    for _first, _last in _season_windows
])
regualar_season_index = train.index[_in_any_window]

In [None]:
# True when the row's month/day equals the day before the player's birthday.
_row_mmdd = train['date'].map(lambda d: str(d)[-4:])
_birthday_eve = pd.to_datetime(train["DOB"], format='%Y-%m-%d') + timedelta(days=-1)
train['HappyBirthDay'] = _row_mmdd == _birthday_eve.map(lambda ts: ts.strftime('%m%d'))

# Sum of the "pitched" and "started" counts for the day.
train["starter_or_not"] = train["gamesPlayedPitching"].add(train["gamesStartedPitching"])

In [None]:
def _codes_by_first_appearance(column):
    """Map each distinct value of ``column`` to its order of first appearance."""
    distinct = column.unique()
    return dict(zip(distinct, range(len(distinct))))


# Integer codes for the categorical columns (birthCountry encoding is disabled).
# country2num = {c: i for i, c in enumerate(train['birthCountry'].unique())}
player2num = _codes_by_first_appearance(train['playerId'])
position2num = _codes_by_first_appearance(train['primaryPositionName'])
teamid2num = _codes_by_first_appearance(train['teamId'])
status2num = _codes_by_first_appearance(train['status'])

# train['label_country_id'] = train['birthCountry'].map(country2num)
for _source, _codes in (
    ('playerId', player2num),
    ('primaryPositionName', position2num),
    ('teamId', teamid2num),
    ('status', status2num),
):
    train[f'label_{_source}'] = train[_source].map(_codes)

# trainの特徴量エンジニアリング

In [None]:
# Per-player lags 1..15 of gamesStartedPitching (rows are lagged in their
# current order within each player).
_started_by_player = train.groupby(["playerId"])["gamesStartedPitching"]
_lag_suffixes = (
    "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "ele", "twe", "thirteen", "fourteen", "fifteen",
)
for _lag, _suffix in enumerate(_lag_suffixes, start=1):
    train[f"gamesStartedPitching_shift_{_suffix}"] = _started_by_player.transform(
        lambda s, k=_lag: s.shift(k)
    )

In [None]:
# Pitching game score: 40 plus weighted outs and strikeouts, minus weighted
# walks, hits, runs and home runs allowed.
train['pitchingGameScore'] = (
    40
    + 2 * train['outsPitching']
    + 1 * train['strikeOutsPitching']
    - 2 * train['baseOnBallsPitching']
    - 2 * train['hitsPitching']
    - 3 * train['runsPitching']
    - 6 * train['homeRunsPitching']
)

# https://maddog31.xyz/baseball-web/contribution_degree/contribution_degree2/
train["batter_contribution"] = (
    train["totalBases"]
    + train["baseOnBalls"]
    + train["intentionalWalks"]
    + train["hitByPitch"]
    + train["stolenBases"]
    + train["sacBunts"] * 0.5
    + train["sacFlies"] * 0.5
    + train["runsScored"] * 0.25
    + train["rbi"] * 0.25
    - train["groundIntoDoublePlay"]
    - train["caughtStealing"]
)

# Rolling mean / std per player. shift(1) excludes the current row from its
# own window. Each entry is (window length, min_periods).
_rolling_specs = {
    "pitchingGameScore": ((7, 1), (14, 2), (28, 3)),
    "batter_contribution": ((3, 2), (5, 3), (7, 5), (10, 6), (15, 7)),
}
for _base, _windows in _rolling_specs.items():
    _by_player = train.groupby(["playerId"])[_base]
    for _stat in ("mean", "std"):
        for _length, _min_obs in _windows:
            train[f"{_base}_rolling_{_stat}_t{_length}"] = _by_player.transform(
                lambda s, n=_length, m=_min_obs, f=_stat: getattr(
                    s.shift(1).rolling(n, min_periods=m), f
                )()
            )

In [None]:
# Per-player rolling mean of starter_or_not over the previous rows; each
# entry is (window length, min_periods).
_starter_by_player = train.groupby(["playerId"])["starter_or_not"]
for _length, _min_obs in ((7, 1), (14, 2), (28, 4)):
    train[f"stater_or_not_mean_t{_length}"] = _starter_by_player.transform(
        lambda s, n=_length, m=_min_obs: s.shift(1).rolling(n, min_periods=m).mean()
    )

In [None]:
# Run a garbage-collection pass after the feature-engineering cells.
gc.collect()

In [None]:
# Per-player lags 3..7 of each of the four targets.
for _lag, _suffix in ((3, "three"), (4, "four"), (5, "five"), (6, "six"), (7, "seven")):
    for _target in ("target1", "target2", "target3", "target4"):
        train[f"{_target}_shift_{_suffix}"] = train.groupby(["playerId"])[_target].transform(
            lambda s, k=_lag: s.shift(k)
        )

# Remove at inference time (the next two cells)

In [None]:
# Downcast numeric columns to cut memory; reduce_mem_usage modifies train in place and returns it.
train=reduce_mem_usage(train)

In [None]:
# Keep only the rows collected in regualar_season_index. That object holds
# index *labels*, so select with .loc; positional .iloc gives the same rows
# only while train still has its default 0..n-1 index.
train = train.loc[regualar_season_index].reset_index(drop=True)

In [None]:
# Time-based hold-out on feature_cols: dates before 2021-07-01 are used for
# training, the remaining dates for validation.
_index = train['date'].lt(20210701)
train_X, train_y = train[feature_cols], train[['target1', 'target2', 'target3', 'target4']]

x_train1, y_train1 = (
    _part.loc[_index].reset_index(drop=True) for _part in (train_X, train_y)
)
x_valid1, y_valid1 = (
    _part.loc[~_index].reset_index(drop=True) for _part in (train_X, train_y)
)

In [None]:
# Same time-based hold-out, on feature_cols2 (feature_cols plus target1).
_index = train['date'].lt(20210701)
train_X, train_y = train[feature_cols2], train[['target1', 'target2', 'target3', 'target4']]

x_train2, y_train2 = (
    _part.loc[_index].reset_index(drop=True) for _part in (train_X, train_y)
)
x_valid2, y_valid2 = (
    _part.loc[~_index].reset_index(drop=True) for _part in (train_X, train_y)
)

In [None]:
# Enable at inference time: keep the rows dated after 2021-01-01 as a
# separate frame (boolean selection returns new data, independent of train).
test_first = train[train.date > 20210101].reset_index(drop=True)

In [None]:
# The split frames and test_first have been built; drop the large
# intermediates before training.
del train_X
del train_y
del train

gc.collect()

In [None]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    """Fit one LightGBM regressor and score it on the validation split.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features; also used as the early-stopping eval set.
        y_valid: Validation target.
        params: Keyword arguments for ``lgbm.LGBMRegressor``. ``None`` means
            library defaults.
        verbose: Used both as the logging period and as the early-stopping
            patience passed to ``fit``.

    Returns:
        Tuple ``(oof_pred, model, score)``: the validation predictions, the
        fitted model and the validation mean absolute error.
    """
    # `params or {}` keeps the documented default (None) from raising on `**`.
    model = lgbm.LGBMRegressor(**(params or {}))
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
    # older LightGBM releases; newer releases expect callbacks instead —
    # confirm against the installed version before upgrading.
    model.fit(x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=verbose,
        verbose=verbose)
    oof_pred = model.predict(x_valid)
    # (y_true, y_pred) order per the scikit-learn signature; MAE is symmetric,
    # so the value is unchanged.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score


# Stage-1 LightGBM training: one parameter set and one model per target.

# target1
params1 = dict(
    objective='mae',
    reg_alpha=0.14947461820098767,
    reg_lambda=0.10185644384043743,
    n_estimators=3633,
    learning_rate=0.08046301304430488,
    num_leaves=674,
    feature_fraction=0.9101240539122566,
    bagging_fraction=0.9884451442950513,
    bagging_freq=8,
    min_child_samples=51,
)

# target2
params2 = dict(
    objective='mae',
    reg_alpha=0.1,
    reg_lambda=0.1,
    n_estimators=80,
    learning_rate=0.1,
    random_state=42,
    num_leaves=22,
)

# target4
params4 = dict(
    objective='mae',
    reg_alpha=0.016468100279441976,
    reg_lambda=0.09128335764019105,
    n_estimators=9868,
    learning_rate=0.10528150510326864,
    num_leaves=157,
    feature_fraction=0.5419185713426886,
    bagging_fraction=0.2637405128936662,
    bagging_freq=19,
    min_child_samples=71,
)

# target3
params = dict(
    objective='mae',
    reg_alpha=0.1,
    reg_lambda=0.1,
    n_estimators=10000,
    learning_rate=0.1,
    random_state=42,
    num_leaves=100,
)

# target1 is fitted on feature_cols; targets 2-4 are fitted on feature_cols2,
# which additionally contains target1.
oof1, model1, score1 = fit_lgbm(
    x_train1, y_train1['target1'], x_valid1, y_valid1['target1'], params1
)
oof2, model2, score2 = fit_lgbm(
    x_train2, y_train2['target2'], x_valid2, y_valid2['target2'], params2
)
oof3, model3, score3 = fit_lgbm(
    x_train2, y_train2['target3'], x_valid2, y_valid2['target3'], params
)
oof4, model4, score4 = fit_lgbm(
    x_train2, y_train2['target4'], x_valid2, y_valid2['target4'], params4
)

# Mean of the four validation MAEs.
score = (score1+score2+score3+score4) / 4
print(f'score: {score}')

In [None]:
# stage="stage1"
# with open(f'model_lgb_{stage}_{version}_1.pkl', 'wb') as handle:
#     pickle.dump(model1, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open(f'model_lgb_{stage}_{version}_2.pkl', 'wb') as handle:
#     pickle.dump(model2, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open(f'model_lgb_{stage}_{version}_3.pkl', 'wb') as handle:
#     pickle.dump(model3, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open(f'model_lgb_{stage}_{version}_4.pkl', 'wb') as handle:
#     pickle.dump(model4, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Remove at inference time (the next cell)

In [None]:
# The feature_cols2 split is not used by the later cells; drop it.
del x_train2
del y_train2
del x_valid2
del y_valid2
gc.collect()

In [None]:
# stage="stage1"
# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_1.pkl', 'rb') as handle:
#     model1=pickle.load(handle)
# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_2.pkl', 'rb') as handle:
#     model2=pickle.load(handle)
# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_3.pkl', 'rb') as handle:
#     model3=pickle.load(handle)    
# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_4.pkl', 'rb') as handle:
#     model4=pickle.load(handle)

In [None]:
# Stage tag embedded in the pickle file names written by the fit_lgbm defined below.
stage="stage2"

In [None]:
import pickle
from catboost import CatBoostRegressor

def fit_lgbm(x_train, y_train, x_valid, y_valid, target, params: dict=None, verbose=100):
    """Fit a LightGBM and a CatBoost regressor for one target and pickle both.

    The models are written to ``model_lgb_{stage}_{version}_{target}.pkl`` and
    ``model_cb_{stage}_{version}_{target}.pkl`` in the working directory;
    ``stage`` and ``version`` are read from module globals.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features; used as the eval set of both models.
        y_valid: Validation target.
        target: Identifier of the target, used only in the file names.
        params: Keyword arguments for ``lgbm.LGBMRegressor``. ``None`` means
            library defaults. The CatBoost settings are fixed below.
        verbose: Used both as the LightGBM logging period and as its
            early-stopping patience.

    Returns:
        Tuple ``(oof_pred_lgb, model, oof_pred_cat, model_cb, score_lgb,
        score_cat)``: validation predictions, fitted model and validation MAE
        for LightGBM and for CatBoost.
    """
    # To reuse previously trained models instead of refitting, load the
    # pickles with the file names above rather than calling this function.

    # `params or {}` keeps the documented default (None) from raising on `**`.
    model = lgbm.LGBMRegressor(**(params or {}))
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
    # older LightGBM releases; newer releases expect callbacks instead —
    # confirm against the installed version before upgrading.
    model.fit(x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=verbose,
        verbose=verbose)

    with open(f'model_lgb_{stage}_{version}_{target}.pkl', 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

    oof_pred_lgb = model.predict(x_valid)
    # (y_true, y_pred) order per the scikit-learn signature; MAE is symmetric.
    score_lgb = mean_absolute_error(y_valid, oof_pred_lgb)
    print('mae:', score_lgb)

    model_cb = CatBoostRegressor(
                n_estimators=2000,
                learning_rate=0.05,
                loss_function='MAE',
                eval_metric='MAE',
                max_bin=50,
                subsample=0.9,
                colsample_bylevel=0.5,
                verbose=100)

    model_cb.fit(x_train, y_train, use_best_model=True,
                     eval_set=(x_valid, y_valid),
                     early_stopping_rounds=25)

    with open(f'model_cb_{stage}_{version}_{target}.pkl', 'wb') as handle:
        pickle.dump(model_cb, handle, protocol=pickle.HIGHEST_PROTOCOL)

    oof_pred_cat = model_cb.predict(x_valid)
    score_cat = mean_absolute_error(y_valid, oof_pred_cat)
    print('mae:', score_cat)

    return oof_pred_lgb, model, oof_pred_cat, model_cb, score_lgb, score_cat


# Stage-2 training: one LightGBM + CatBoost pair per target, all fitted on
# feature_cols with a single shared LightGBM parameter set.
params = dict(
    boosting_type='gbdt',
    objective='mae',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2**11-1,
    min_data_in_leaf=2**12-1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=2500,
    boost_from_average=False,
    random_seed=42,
)

# Targets are fitted in the order 2, 1, 3, 4.
_stage2_fits = {
    _t: fit_lgbm(
        x_train1, y_train1[f'target{_t}'],
        x_valid1, y_valid1[f'target{_t}'],
        _t, params,
    )
    for _t in (2, 1, 3, 4)
}
oof_pred_lgb1, model_lgb1, oof_pred_cat1, model_cb1, score_lgb1, score_cat1 = _stage2_fits[1]
oof_pred_lgb2, model_lgb2, oof_pred_cat2, model_cb2, score_lgb2, score_cat2 = _stage2_fits[2]
oof_pred_lgb3, model_lgb3, oof_pred_cat3, model_cb3, score_lgb3, score_cat3 = _stage2_fits[3]
oof_pred_lgb4, model_lgb4, oof_pred_cat4, model_cb4, score_lgb4, score_cat4 = _stage2_fits[4]

# Mean validation MAE over the four targets, per model family.
score = (score_lgb1+score_lgb2+score_lgb3+score_lgb4) / 4
print(f'LightGBM score: {score}')

score = (score_cat1+score_cat2+score_cat3+score_cat4) / 4
print(f'Catboost score: {score}')

In [None]:
# stage="stage2"

# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_1.pkl', 'rb') as handle:
#     model_lgb1=pickle.load(handle)
# with open(f'../input/{modelsver}/model_cb_{stage}_{version}_1.pkl', 'rb') as handle:
#     model_cb1=pickle.load(handle)
# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_2.pkl', 'rb') as handle:
#     model_lgb2=pickle.load(handle)
# with open(f'../input/{modelsver}/model_cb_{stage}_{version}_2.pkl', 'rb') as handle:
#     model_cb2=pickle.load(handle)
    
# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_3.pkl', 'rb') as handle:
#     model_lgb3=pickle.load(handle)
# with open(f'../input/{modelsver}/model_cb_{stage}_{version}_3.pkl', 'rb') as handle:
#     model_cb3=pickle.load(handle)
    
# with open(f'../input/{modelsver}/model_lgb_{stage}_{version}_4.pkl', 'rb') as handle:
#     model_lgb4=pickle.load(handle)
# with open(f'../input/{modelsver}/model_cb_{stage}_{version}_4.pkl', 'rb') as handle:
#     model_cb4=pickle.load(handle)

## Inference

In [None]:
# Training is finished; drop the feature_cols split before inference.
del x_train1
del y_train1
del x_valid1
del y_valid1


In [None]:
# Column selections for inference. Compared with the training lists, rosters
# has no 'date' and scores has neither 'gamePk' nor 'date'.
players_cols = ['playerId', 'primaryPositionName', "mlbDebutDate","DOB"]
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances']

# JSON literal names bound to Python values so that the eval(...) calls in the
# inference loop can evaluate JSON text.
# NOTE(review): json.loads would parse the same payloads without eval — consider switching.
null = np.nan
true = True
false = False

# 推論

In [None]:
# del train

# Run a garbage-collection pass before inference starts.
gc.collect()

In [None]:
import mlb

In [None]:
import copy

# Set up the competition environment and the iterator consumed by the
# inference loop; each item it yields is a (test_df, sample_prediction_df) pair.
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set
# Collect garbage once more before the loop starts.
gc.collect()

In [None]:
# Spelled-out column-name suffix for each lag.  The irregular "ele"/"twe"
# spellings are kept on purpose so the generated column names stay exactly
# the same as before.
_LAG_SUFFIXES = {
    1: "one", 2: "two", 3: "three", 4: "four", 5: "five",
    6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten",
    11: "ele", 12: "twe", 13: "thirteen", 14: "fourteen", 15: "fifteen",
}
_TARGET_COLS = ("target1", "target2", "target3", "target4")


def _add_rolling_features(frame, source, prefix, stat, windows):
    """Add per-player rolling statistics of ``source`` to ``frame`` in place.

    For every ``(window, min_periods)`` pair in ``windows`` a column named
    ``f"{prefix}_t{window}"`` is written.  Values are shifted by one row
    before rolling, so a row never sees its own value.

    Parameters:
        frame: DataFrame with a ``playerId`` column, already sorted in the
            order the history should be read.
        source: name of the column to aggregate.
        prefix: column-name prefix of the generated features.
        stat: name of the rolling aggregation method, ``"mean"`` or ``"std"``.
        windows: iterable of ``(window, min_periods)`` pairs.
    """
    per_player = frame.groupby(["playerId"])[source]  # grouped once per source column
    for window, min_periods in windows:
        frame[f"{prefix}_t{window}"] = per_player.transform(
            lambda values: getattr(
                values.shift(1).rolling(window, min_periods=min_periods), stat
            )()
        )


def _add_history_features(frame):
    """Add the lag and rolling features to ``frame`` in place and return it.

    ``frame`` is expected to hold the stored history rows plus the current
    day's rows, sorted by date, with the box-score columns, ``starter_or_not``
    and ``target1``..``target4`` present.
    """
    frame['pitchingGameScore'] = (40 + 2 * frame['outsPitching']+ 1 * frame['strikeOutsPitching']- 2 * frame['baseOnBallsPitching']- 2 * frame['hitsPitching']- 3 * frame['runsPitching']- 6 * frame['homeRunsPitching'])

    # gamesStartedPitching of each of the player's previous 1..15 rows.
    starts = frame.groupby(["playerId"])["gamesStartedPitching"]
    for lag, suffix in _LAG_SUFFIXES.items():
        frame[f"gamesStartedPitching_shift_{suffix}"] = starts.shift(lag)

    # Targets of the player's previous 3..7 rows (same column order as before:
    # all four targets for lag 3, then lag 4, ...).
    targets = {name: frame.groupby(["playerId"])[name] for name in _TARGET_COLS}
    for lag in range(3, 8):
        for name in _TARGET_COLS:
            frame[f"{name}_shift_{_LAG_SUFFIXES[lag]}"] = targets[name].shift(lag)

    frame["batter_contribution"]=frame["totalBases"]+frame["baseOnBalls"]+frame["intentionalWalks"]+frame["hitByPitch"]+frame["stolenBases"]+\
    frame["sacBunts"]*0.5+frame["sacFlies"]*0.5+frame["runsScored"]*0.25+frame["rbi"]*0.25-\
    frame["groundIntoDoublePlay"]-frame["caughtStealing"]

    pitching_windows = [(7, 1), (14, 2), (28, 3)]
    _add_rolling_features(frame, "pitchingGameScore", "pitchingGameScore_rolling_mean", "mean", pitching_windows)
    _add_rolling_features(frame, "pitchingGameScore", "pitchingGameScore_rolling_std", "std", pitching_windows)

    batting_windows = [(3, 2), (5, 3), (7, 5), (10, 6), (15, 7)]
    _add_rolling_features(frame, "batter_contribution", "batter_contribution_rolling_mean", "mean", batting_windows)
    _add_rolling_features(frame, "batter_contribution", "batter_contribution_rolling_std", "std", batting_windows)

    # The "stater" spelling is the existing column name; do not correct it here.
    _add_rolling_features(frame, "starter_or_not", "stater_or_not_mean", "mean", [(7, 1), (14, 2), (28, 4)])
    return frame


# Inference loop: for every test date build the feature rows, append them to
# the running history (test_first), recompute the lag/rolling features,
# predict the four targets, submit them, and store the submitted values in the
# history so that later dates can use them as lag features.
for (test_df, sample_prediction_df) in iter_test: # make predictions here
    print("0",test_first.shape)

    # reset_index() already returns a new frame, so no extra deepcopy is needed.
    sub = sample_prediction_df.reset_index()
    sample_prediction_df = sample_prediction_df.reset_index(drop=True)

    # LGBM submission
    # Create the dataset: split "<date>_<playerId>" into its two parts.
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))
    sample_prediction_df['date'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[0]))
    # Move the date back by one day and store it again as a yyyymmdd integer.
    sample_prediction_df['date']=(pd.to_datetime(sample_prediction_df['date'], format='%Y%m%d') + timedelta(days=-1)).dt.strftime('%Y%m%d').astype(int)
    print("日付:", sample_prediction_df['date'].iloc[0])

    # Dealing with missing values: when a nested table is missing for the day,
    # fall back to an all-NaN frame with one row per requested player.
    # NOTE(review): eval() executes whatever expression the payload contains and
    # relies on the module-level null/true/false aliases; json.loads would be
    # the safer parser if the input is not fully trusted.
    if pd.notna(test_df['rosters'].iloc[0]):
        test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))
    else:
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan

    if pd.notna(test_df['playerBoxScores'].iloc[0]):
        test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan
    # One row per player: box-score rows of the same player are summed.
    test_scores = test_scores.groupby('playerId').sum().reset_index()

    test = sample_prediction_df[['playerId', "date"]].copy()
    test = test.merge(players[players_cols], on='playerId', how='left')
    test = test.merge(test_rosters[rosters_cols], on='playerId', how='left')
    test = test.merge(test_scores[scores_cols], on='playerId', how='left')
    # NOTE(review): this is an inner join, so a player absent from
    # player_target_stats is dropped and the prediction arrays below would no
    # longer match sample_prediction_df in length - confirm full coverage.
    test = test.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])
    test = test.merge(teams, how='left', left_on=["teamId"],right_on=["id"])
    test["year_month"] = pd.to_datetime(test['date'], format="%Y%m%d").dt.strftime('%Y%m').astype(int)
#     test = test.merge(twitter[["date", "playerId","numberOfFollowers"]], how='left', left_on=["playerId", "year_month"],right_on=["playerId", "date"])
    test=test.rename(columns={"date_x": "date"})

    # Schedule flags: first joined on before_date (stored as
    # isgame_schedule_tom), then joined on date (stored as isgame_schedule).
    test = test.merge(games_schdule[['Tm', "before_date", "isgame_schedule"]],left_on = ['date','abbreviation'],right_on = ['before_date','Tm'], how = 'left')
    test=test.rename(columns={"isgame_schedule": "isgame_schedule_tom"})
    test = test.drop(["Tm", "before_date"], axis=1)
    test = test.merge(games_schdule[['Tm', "date", "isgame_schedule"]],left_on = ['date','abbreviation'],right_on = ['date','Tm'], how = 'left')

    # Label-encode the categorical columns with the existing lookup tables.
    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)

    # MMDD of the (shifted) date against MMDD of the day before the birthday.
    # .dt.strftime yields NaN for a missing DOB, so such rows compare False
    # instead of raising.
    test['HappyBirthDay'] = test['date'].map(lambda x: str(x)[-4:]) == (pd.to_datetime(test["DOB"], format='%Y-%m-%d')+ timedelta(days=-1)).dt.strftime('%m%d')
    test["starter_or_not"] = test["gamesPlayedPitching"] + test["gamesStartedPitching"]
    test=reduce_mem_usage(test)

    # Tag the rows so the current day can be separated from the history again,
    # and remember the original row order of the current day.
    test["test_train_label"] = "test"
    test_first["test_train_label"]="train"
    test['index'] = test.reset_index().index

    print("1",test_first.shape)
#     test_first = test_first.drop(["pitced_yeasterday", "pitched_yes"], axis=1)
    test_first=pd.concat([test, test_first], axis=0, ignore_index=True).sort_values("date")
    test_first=test_first.rename(columns={"date_x": "date"})
    test_first = _add_history_features(test_first)

    # Split the current day (restored to its original order) from the history.
    test = test_first[test_first.test_train_label =="test"].sort_values("index")
    test_first = test_first[test_first.test_train_label =="train"]
    test_X = test[feature_cols]
    # predict
    pred1 = model1.predict(test_X)

    # predict
    pred_lgd1 = model_lgb1.predict(test_X)
    pred_lgd2 = model_lgb2.predict(test_X)
    pred_lgd3 = model_lgb3.predict(test_X)
    pred_lgd4 = model_lgb4.predict(test_X)

    pred_cat1 = model_cb1.predict(test_X)
    pred_cat2 = model_cb2.predict(test_X)
    pred_cat3 = model_cb3.predict(test_X)
    pred_cat4 = model_cb4.predict(test_X)

    # The clipped target1 prediction is written back before the second feature
    # set is selected for model2..model4.
    test['target1'] = np.clip(pred1,0,100)
    test_X = test[feature_cols2]

    pred2 = model2.predict(test_X)
    pred3 = model3.predict(test_X)
    pred4 = model4.predict(test_X)

    # merge submission: fixed-weight blend of the three model families.
    sample_prediction_df['target1'] = 1.00*np.clip(pred1, 0, 100)+0.00*np.clip(pred_lgd1, 0, 100)+0.00*np.clip(pred_cat1, 0, 100)
    sample_prediction_df['target2'] = 0.10*np.clip(pred2, 0, 100)+0.65*np.clip(pred_lgd2, 0, 100)+0.25*np.clip(pred_cat2, 0, 100)
    sample_prediction_df['target3'] = 0.65*np.clip(pred3, 0, 100)+0.25*np.clip(pred_lgd3, 0, 100)+0.10*np.clip(pred_cat3, 0, 100)
    sample_prediction_df['target4'] = 0.65*np.clip(pred4, 0, 100)+0.25*np.clip(pred_lgd4, 0, 100)+0.10*np.clip(pred_cat4, 0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0.)


    del sample_prediction_df['playerId']
    del sample_prediction_df['date']

    # TF submission (disabled)
    # Features computation at Evaluation Date
#     sub_fe, eval_dt = test_lag(sub)
#     sub_fe = sub_fe.merge(LAST_MED_DF, on="playerId", how="left")
#     sub_fe = sub_fe.fillna(0.)

#     _preds = 0.
#     for reg in nets:
#         _preds += reg.predict(sub_fe[FECOLS + MEDCOLS]) / NFOLDS
#     sub_fe[TGTCOLS] = np.clip(_preds, 0, 100)
#     sub.drop(["date"]+TGTCOLS, axis=1, inplace=True)
#     sub = sub.merge(sub_fe[["playerId"]+TGTCOLS], on="playerId", how="left")
#     sub.drop("playerId", axis=1, inplace=True)
#     sub = sub.fillna(0.)
#     # Blending
    # `sub` still holds the untouched sample values and enters with weight 0,
    # so the submitted numbers come from sample_prediction_df only.
    blend = pd.concat(
        [sub[['date_playerId']],
        (0*sub.drop(['date_playerId',"date"], axis=1) + 1*sample_prediction_df.drop('date_playerId', axis=1))],
        axis=1
    )
    env.predict(blend)
#     print(env.maes)
#     print(env.get_pb_score())

    # Store the submitted values as this day's targets and put the rows back
    # into the history used by the next iteration.
    for target_col in _TARGET_COLS:
        test[target_col] = blend[target_col]
    test_first=pd.concat([test, test_first], axis=0, ignore_index=True).sort_values("date")

#     # Update Available information
#     sub_fe["EvalDate"] = eval_dt
#     #sub_fe.drop(MEDCOLS, axis=1, inplace=True)
#     LAST = LAST.append(sub_fe)
#     LAST = LAST.drop_duplicates(subset=["EvalDate","playerId"], keep="last")

In [None]:
# Display the submission frame left over from the final loop iteration.
sample_prediction_df

<div class="alert alert-success">  
</div>