## Training

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
from catboost import CatBoostRegressor
import mlb
import os
import pickle
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader,Dataset
from torch.autograd import Variable
import torch.optim as optim

import random
import warnings
warnings.filterwarnings('ignore')

In [None]:
def set_random_seed(random_seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and configures
    cuDNN for repeatable kernel selection.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can select different kernels from run to run, so it
    # has to be disabled as well for results to be repeatable.
    torch.backends.cudnn.benchmark = False

In [None]:
# Root of the competition dataset (players.csv is read from here).
BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
# Directory holding the pre-pickled training tables.
TRAIN_DIR = Path('../input/mlb-df-files')

In [None]:
# Static player metadata.
players = pd.read_csv(BASE_DIR / 'players.csv')

# Pre-pickled daily tables.
rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')
scores = pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')
# Collapse box scores to one row per (playerId, date) by summing the stats.
# numeric_only=True makes the aggregation explicit: pandas >= 2.0 no longer
# drops non-numeric columns by default and would try to "sum" text columns.
scores = scores.groupby(['playerId', 'date']).sum(numeric_only=True).reset_index()

In [None]:
# Columns taken from each raw table when the training frame is assembled.
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status', 'date']

# Per-game box-score statistics; shared by the box-score selection and the
# model feature list so the two cannot drift apart.
box_score_stats = ['battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances']

scores_cols = ['playerId'] + box_score_stats + ['date']

# Model inputs: label-encoded categoricals, box-score stats, then the five
# per-player summary statistics of each of the four targets.
label_features = ['label_playerId', 'label_primaryPositionName',
                  'label_teamId', 'label_status']
target_stat_features = [f'target{number}_{stat}'
                        for number in range(1, 5)
                        for stat in ('mean', 'median', 'min', 'max', 'prob')]

feature_cols = label_features + box_score_stats + target_stat_features

In [None]:
# Pre-computed target statistics; merged into the training frame on
# (playerId, year) further down.
player_target_stats  = pd.read_pickle(TRAIN_DIR / 'player_target_stats.pkl')
print(player_target_stats.columns)

In [None]:
# Calendar year of the engagement date; used as a merge key with player_target_stats.
targets['year'] = pd.DatetimeIndex(targets['engagementMetricsDate']).year

In [None]:
# Cast the merge key to int so it is comparable with the integer year column added to targets.
player_target_stats['year'] = player_target_stats['year'].astype('int')

In [None]:
# Quick look at the converted year column.
player_target_stats['year'].head()

In [None]:
# create dataset: left-join players, rosters, box scores and the per-year
# target statistics onto the target rows
train = (
    targets[targets_cols + ['year']]
    .merge(players[players_cols], on=['playerId'], how='left')
    .merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
    .merge(scores[scores_cols], on=['playerId', 'date'], how='left')
    .merge(player_target_stats, on=['playerId', 'year'], how='left')
)


# label encoding: each distinct value gets an integer code in order of first
# appearance; the same dictionaries are reused at inference time
player2num = {value: code for code, value in enumerate(train['playerId'].unique())}
position2num = {value: code for code, value in enumerate(train['primaryPositionName'].unique())}
teamid2num = {value: code for code, value in enumerate(train['teamId'].unique())}
status2num = {value: code for code, value in enumerate(train['status'].unique())}

train['label_playerId'] = train['playerId'].map(player2num)
train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
train['label_teamId'] = train['teamId'].map(teamid2num)
train['label_status'] = train['status'].map(status2num)

In [None]:
def create_lag_features(dt, target, lags_list):
    """Add per-player lag and rolling-window features of `target` to `dt`.

    For every ``(lag, window)`` pair in `lags_list`:

    * ``window == 1`` adds ``{target}_lag_{lag}``: the target shifted by `lag`
      rows within each ``playerId`` group.
    * ``window > 1`` adds the rolling mean, std, min and max over `window`
      rows of that lagged series, named ``{target}_r{stat}_{lag}_{window}``.
      A lag column created only to compute these is dropped at the end.
    * Any other `window` value is ignored.

    `dt` is modified in place. Shifts and rolling windows follow the row order
    of `dt` (presumably chronological within each player; verify at the caller).

    Args:
        dt: DataFrame with a ``playerId`` column and the `target` column.
        target: Name of the column to derive features from.
        lags_list: Iterable of ``(lag, window)`` pairs.

    Returns:
        Tuple ``(dt, lag_columns)``: the same frame and the names of the
        feature columns that were added.
    """
    lag_columns = []
    tmp_columns = []
    rolling_stats = ('mean', 'std', 'min', 'max')

    def _lagged(lag):
        return dt[["playerId", target]].groupby("playerId")[target].shift(lag).astype('float32')

    for lag, window in tqdm(lags_list, leave=False, desc='Lag features'):
        lag_col = f"{target}_lag_{lag}"

        if window == 1:
            print(lag_col, end=', ')
            dt[lag_col] = _lagged(lag)
            lag_columns.append(lag_col)
            # The column is now a requested feature: make sure an earlier
            # window > 1 entry with the same lag does not get it dropped.
            if lag_col in tmp_columns:
                tmp_columns.remove(lag_col)

        elif window > 1:
            if lag_col not in dt.columns:
                dt[lag_col] = _lagged(lag)
                tmp_columns.append(lag_col)

            for stat in rolling_stats:
                stat_col = f"{target}_r{stat}_{lag}_{window}"
                lag_columns.append(stat_col)
                print(stat_col, end=', ')
                dt[stat_col] = dt[["playerId", lag_col]].groupby("playerId")[lag_col]. \
                    transform(lambda x: getattr(x.rolling(window), stat)()).astype('float32')

    print('dropping tmp cols:', tmp_columns)
    dt.drop(tmp_columns, axis=1, inplace=True)
    return dt, lag_columns

In [None]:
# lags_list = [[1, 30], [7, 30], [14, 30]]
# lag_cols = []
# target_cols = ['target1', 'target2', 'target3', 'target4']
# for target in target_cols:
#     train, lag_columns = create_lag_features(train, target, lags_list)
#     lag_cols = lag_columns+lag_cols

In [None]:
# Row/column count of the assembled training frame.
train.shape

In [None]:
# Hold-out split by date: rows with date < 20210401 are used for training,
# all remaining rows for validation.
_index = (train['date'] < 20210401)

train_X = train[feature_cols]
train_y = train[['target1', 'target2', 'target3', 'target4']]

x_train1, x_valid1 = (train_X.loc[rows].reset_index(drop=True) for rows in (_index, ~_index))
y_train1, y_valid1 = (train_y.loc[rows].reset_index(drop=True) for rows in (_index, ~_index))

In [None]:
# The unsplit feature/target frames are no longer needed once the split exists.
del train_X, train_y
gc.collect()

In [None]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, target, model_path, params: dict=None, verbose=100):
    """Load a cached LightGBM regressor for `target`, or train and pickle one.

    If ``{model_path}/model_lgb_{target}.pkl`` exists it is unpickled and used
    as is. Otherwise an ``LGBMRegressor`` is fitted on the training data with
    early stopping on the validation data and pickled to
    ``model_lgb_{target}.pkl`` in the current working directory.

    Args:
        x_train, y_train: Training features and target.
        x_valid, y_valid: Validation features and target.
        target: Tag used in the model file name.
        model_path: Directory searched for a cached model.
        params: Keyword arguments for ``LGBMRegressor``; ``None`` means defaults.
        verbose: Used both as the logging period and as the early-stopping
            patience (this mirrors the original call).

    Returns:
        Tuple ``(oof_pred, model, score)``: validation predictions, the model,
        and the validation mean absolute error.
    """
    cached_file = f'{model_path}/model_lgb_{target}.pkl'

    if os.path.isfile(cached_file):
        # pickle.load executes arbitrary code: only use trusted model files.
        with open(cached_file, 'rb') as fin:
            model = pickle.load(fin)
    else:
        # `params` defaults to None, which cannot be unpacked with **.
        model = lgbm.LGBMRegressor(**(params or {}))
        # NOTE(review): LightGBM 4.x removed `early_stopping_rounds` and
        # `verbose` from fit() in favour of callbacks - confirm against the
        # installed version before retraining.
        model.fit(x_train, y_train, 
            eval_set=[(x_valid, y_valid)],  
            early_stopping_rounds=verbose, 
            verbose=verbose)

        # NOTE(review): the model is written to the working directory while the
        # cache is read from `model_path` - presumably because `model_path` is
        # a read-only input location; confirm.
        with open(f'model_lgb_{target}.pkl', 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

    oof_pred = model.predict(x_valid)
    # sklearn's signature is (y_true, y_pred); MAE is symmetric, so the value
    # is the same as before.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score


# training lightgbm

params = {
 'objective':'mae',
 'reg_alpha': 0.1,
 'reg_lambda': 0.1, 
 'n_estimators': 2000,
 'learning_rate': 0.02,
 'random_state': 42,
 "num_leaves": 100
}

model_path = '../input/mlb-final-models'

lgb_cut_dates = [20210301, 20210401, 20210501, 20210601, 20210701]

# cut date -> [model for target1, target2, target3, target4]
lgb_models = {cut_date: [] for cut_date in lgb_cut_dates}

# NOTE(review): every cut_date reuses x_train1 / x_valid1, which were split
# once at 20210401 in an earlier cell, so cut_date only selects the cached
# model file. If the cached files are absent, all five models are trained on
# identical data - confirm whether a per-cut_date split was intended.
for cut_date in lgb_cut_dates:
    cut_oofs = []
    cut_scores = []
    for target_name in ['target1', 'target2', 'target3', 'target4']:
        oof, model, target_score = fit_lgbm(
            x_train1, y_train1[target_name],
            x_valid1, y_valid1[target_name],
            f'{target_name}_{cut_date}', model_path, params
        )
        lgb_models[cut_date].append(model)
        cut_oofs.append(oof)
        cut_scores.append(target_score)

    # Kept under their original names because a later cell deletes them.
    oof1, oof2, oof3, oof4 = cut_oofs

    score = sum(cut_scores) / 4
    print(f'score: {score}')

In [None]:
# Validation predictions of the last cut date are not used again.
del oof1, oof2, oof3, oof4
gc.collect()

# Neural Net

In [None]:
def make_2Dinput(dt, cont_cols, cat_cols):
    """Split a DataFrame into the dict of numpy arrays consumed by MLBLoader.

    Args:
        dt: Source DataFrame.
        cont_cols: Names of the continuous feature columns.
        cat_cols: Names of the categorical feature columns.

    Returns:
        Dict whose ``"rnn"`` entry is the 2-D array of continuous columns,
        followed by one ``(n_rows, 1)`` array per categorical column, keyed by
        column name.
    """
    features = {"rnn": dt[cont_cols].to_numpy()}
    features.update({name: dt[[name]].to_numpy() for name in cat_cols})
    return features

class MLBLoader:
    """Minimal mini-batch iterator over in-memory numpy feature arrays.

    Iterating yields ``(xcont, xcat, y)`` tuples: a ``FloatTensor`` of
    continuous features, a ``LongTensor`` of categorical codes and a
    ``FloatTensor`` of targets (``None`` when no targets were given). The last
    batch may be smaller than `batch_size`.

    Args:
        X: Dict with the continuous features under ``"rnn"`` (2-D array) and
            one array per categorical column, either 1-D or ``(n_rows, 1)``.
        y: Target array aligned with the rows of `X`, or ``None``.
        shuffle: Reshuffle the rows at the start of every iteration.
        batch_size: Number of rows per batch.
        cat_cols: Names of the categorical entries of `X`, in output order.
    """

    def __init__(self, X, y, shuffle=True, batch_size=1000, cat_cols=None):
        cat_cols = list(cat_cols) if cat_cols is not None else []

        self.X_cont = X["rnn"]
        self.y = y

        self.shuffle = shuffle
        self.batch_size = batch_size
        self.n_conts = self.X_cont.shape[1]
        self.len = self.X_cont.shape[0]

        if cat_cols:
            # Accept both 1-D and (n_rows, 1) arrays: promote 1-D to a column.
            columns = []
            for name in cat_cols:
                column = np.asarray(X[name])
                columns.append(column[:, None] if column.ndim == 1 else column)
            self.X_cat = np.concatenate(columns, axis=1)
        else:
            self.X_cat = np.zeros((self.len, 0), dtype=np.int64)

        n_batches, remainder = divmod(self.len, self.batch_size)
        if remainder > 0:
            n_batches += 1
        self.n_batches = n_batches
        self.remainder = remainder  # for debugging

        self.idxes = np.arange(self.len)

    def __iter__(self):
        self.i = 0
        if self.shuffle:
            # Index with the permutation array itself: wrapping it in a list
            # (arr[[perm]]) is read by current NumPy as a 2-D index and adds a
            # leading axis, which breaks batching.
            order = np.random.permutation(self.len)
            self.X_cat = self.X_cat[order]
            self.X_cont = self.X_cont[order]
            if self.y is not None:
                self.y = self.y[order]

        return self

    def __next__(self):
        if self.i >= self.len:
            raise StopIteration

        stop = self.i + self.batch_size

        if self.y is not None:
            y = torch.FloatTensor(self.y[self.i:stop].astype(np.float32))
        else:
            y = None

        xcont = torch.FloatTensor(self.X_cont[self.i:stop])
        xcat = torch.LongTensor(self.X_cat[self.i:stop])

        batch = (xcont, xcat, y)
        self.i = stop
        return batch

    def __len__(self):
        return self.n_batches
    
    
# Device string used for models and batches: 'cuda' when a GPU is available, otherwise 'cpu'.
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

# Xavier initialization
def init_weights(m):
    """Initialise an ``nn.Linear`` module in place; other modules are ignored.

    The weight gets Xavier-uniform values and the bias, when the layer has
    one, is filled with 0.01. Intended to be passed to ``Module.apply``.

    Args:
        m: Module to initialise.
    """
    if isinstance(m, nn.Linear):
        # xavier_uniform_ is the in-place replacement of the deprecated xavier_uniform.
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0.01)

####### Simple MLP model for 2D input ############################

class MLB_NN(nn.Module):
    """MLP over concatenated categorical embeddings and continuous features.

    Each categorical column gets its own embedding; the embeddings are
    concatenated with the continuous features and passed through three
    leaky-ReLU hidden layers of width `hidden_dim`, followed by a linear layer
    with 4 outputs.

    Args:
        emb_dims: Sequence of ``(num_embeddings, embedding_dim)`` pairs, one
            per categorical column, in column order.
        n_cont: Number of continuous features.
        hidden_dim: Width of the hidden layers.
        device: Stored as ``self.device`` for backward compatibility; the
            forward pass uses the device the parameters actually live on.
    """

    def __init__(self, emb_dims, n_cont, hidden_dim, device=DEVICE):
        super().__init__()
        self.device = device

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        n_embs = sum(y for _, y in emb_dims)
        inp_dim = n_cont + n_embs

        # HIDDEN LAYERS
        self.fc0 = nn.Linear(inp_dim, hidden_dim)
        #self.drop0 = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_dim, int(hidden_dim))
        #self.drop1 = nn.Dropout(0.1)
        self.fc2 = nn.Linear(int(hidden_dim), int(hidden_dim))
        self.fc3 = nn.Linear(int(hidden_dim), 4)

        # apply initializations (same order as the layers were created)
        for layer in (self.fc0, self.fc1, self.fc2, self.fc3):
            layer.apply(init_weights)

    # look up the embeddings and concat categorical and continuous variables
    def encode_and_combine_data(self, cont_data, cat_data):
        """Embed column k of `cat_data` with layer k and prepend the result to `cont_data`."""
        xcat = [el(cat_data[:, k]) for k, el in enumerate(self.emb_layers)]
        xcat = torch.cat(xcat, 1)
        x = torch.cat([xcat, cont_data], 1)
        return x

    def forward(self, cont_data, cat_data):
        """Return the ``(batch, 4)`` output for a batch of continuous and categorical inputs."""
        # Use the device of the parameters rather than the stored self.device:
        # a model pickled on one device and loaded with map_location onto
        # another keeps the stale attribute value.
        device = next(self.parameters()).device
        cont_data = cont_data.to(device)
        cat_data = cat_data.to(device)
        x = self.encode_and_combine_data(cont_data, cat_data)

        hz = F.leaky_relu(self.fc0(x))
        #hz = self.drop0(hz)
        hz = F.leaky_relu(self.fc1(hz))
        #hz = self.drop1(hz)
        hz = F.leaky_relu(self.fc2(hz))
        out = self.fc3(hz)

        return out
    
def _collect_predictions(model, loader, device):
    """Run `model` over every batch of `loader` in eval mode without gradients.

    Returns a ``(y_true, y_pred)`` pair of numpy arrays stacked over all batches.
    """
    y_true, y_pred = [], []
    with torch.no_grad():
        model.eval()
        for X_cont, X_cat, y in tqdm(loader):
            y_pred.append(model(X_cont.to(device), X_cat.to(device)))
            y_true.append(y)

    y_pred = torch.cat(y_pred, dim=0).detach().cpu().numpy()
    y_true = torch.cat(y_true, dim=0).detach().cpu().numpy()
    return y_true, y_pred


def training_nn(x_train, y_train, x_valid, y_valid, cont_cols, cat_cols, 
                model_nn, model_path, epoch=50, patience=5, hidden_dim=128, device='cpu'):
    """Load a saved network from `model_path`, or train one and save it there.

    When `model_path` exists the pickled model is loaded and only evaluated on
    the validation data. Otherwise `model_nn` is instantiated and trained with
    a summed per-target L1 loss, Adam and a one-cycle learning-rate schedule.
    After each epoch the validation MAE (mean over the 4 targets) is computed;
    the model is saved to `model_path` whenever it improves, and training
    stops after `patience` consecutive epochs without improvement.

    Args:
        x_train, x_valid: Feature DataFrames containing `cont_cols` and `cat_cols`.
        y_train, y_valid: Target DataFrames with 4 columns.
        cont_cols: Names of the continuous feature columns.
        cat_cols: Names of the categorical (integer-coded) feature columns.
            Each gets a 32-dimensional embedding.
        model_nn: Model class, called as
            ``model_nn(emb_dims=..., n_cont=..., hidden_dim=...)``.
        model_path: File used to load / save the whole pickled model.
        epoch: Maximum number of epochs.
        patience: Early-stopping patience in epochs.
        hidden_dim: Hidden width passed to `model_nn`.
        device: Device on which the model is trained and evaluated.

    Returns:
        Tuple ``(best_model, best_mae)``: the model with the weights of the
        best validation epoch and that epoch's validation MAE.
    """
    import copy  # local import: `copy` is not part of the notebook's import cell

    # Embedding table size per categorical column: the number of distinct
    # values seen in training and validation data together.
    uniques = {}
    for v in cat_cols:
        uniques[v] = len(np.unique(list(x_train[v].unique())+list(x_valid[v].unique())))

    # Make input for the loader because we have categorical and continuous features
    x_train = make_2Dinput(x_train, cont_cols=cont_cols, cat_cols=cat_cols)
    x_valid = make_2Dinput(x_valid, cont_cols=cont_cols, cat_cols=cat_cols)

    # Make loaders for pytorch
    train_loader = MLBLoader(x_train, y_train.values, cat_cols=cat_cols, batch_size=1024, shuffle=True)
    val_loader = MLBLoader(x_valid, y_valid.values, cat_cols=cat_cols, batch_size=1024, shuffle=False)

    # One embedding per categorical column (the former zip against a 7-long
    # list silently dropped any column beyond the seventh).
    emb_size = 32
    emb_dims = [(uniques[col], emb_size) for col in cat_cols]

    # number of continuous variables
    n_cont = train_loader.n_conts

    if os.path.isfile(model_path):
        # torch.load unpickles a whole module here: only load trusted files.
        # NOTE(review): recent PyTorch releases default to weights_only=True,
        # which rejects whole-module pickles - confirm against the installed version.
        best_model = torch.load(model_path, map_location=device).to(device)
        print('done load model')

        y_true, y_pred = _collect_predictions(best_model, val_loader, device)
        print('done predict')

        best_rmse = 0
        for c in range(4):
            target_mae = mean_absolute_error(y_true[:, c], y_pred[:, c])
            best_rmse += target_mae/4
            print(f'target{c}', target_mae)

        print(f" Val MAE: {best_rmse:.4f} ")
        return best_model, best_rmse

    # neural network model
    model = model_nn(emb_dims=emb_dims, n_cont=n_cont, hidden_dim=hidden_dim).to(device)

    # loss function
    criterion = nn.L1Loss()

    # adam optimizer has been used for training
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    # learning rate scheduler
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=epoch, steps_per_epoch=len(train_loader))

    best_rmse = np.inf   # despite the name this tracks the validation MAE
    best_state = None    # snapshot of the weights of the best epoch
    counter = 0
    for ep in range(epoch):
        train_loss = 0

        # training phase for single epoch
        model.train()
        for X_cont, X_cat, y in tqdm(train_loader):

            optimizer.zero_grad()
            out = model(X_cont.to(device), X_cat.to(device))

            loss = 0
            for c in range(4):
                loss = loss + criterion(out[:, c], y[:, c].to(device))

            loss.backward()

            optimizer.step()
            scheduler.step()

            train_loss += loss.item() / len(train_loader)

        # Validation phase for single epoch
        phase = 'Valid'
        y_true, y_pred = _collect_predictions(model, val_loader, device)

        rmse = 0
        for c in range(4):
            rmse += mean_absolute_error(y_true[:, c], y_pred[:, c])/4

        print(f"[{phase}] Epoch: {ep} | Tain loss: {train_loss:.4f} | Val MAE: {rmse:.4f} ")

        if best_rmse > rmse:
            best_rmse = rmse
            # Snapshot the weights: keeping a reference to `model` would alias
            # the object that later epochs continue to update.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model, model_path)
            counter = 0
        else:
            counter = counter + 1

        # early stopping
        if counter >= patience:
            print("Early stopping")
            break

    # Restore the best epoch so the returned model matches the saved file.
    # If no epoch improved (e.g. NaN metrics) the last weights are returned.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_rmse


In [None]:
# Categorical inputs are embedded by the network; every other feature is
# treated as continuous.
cat_cols = ['label_playerId', 'label_primaryPositionName', 
            'label_teamId', 'label_status']

cont_cols = [name for name in feature_cols if name not in cat_cols]

# The network cannot take NaN inputs: zero-fill the continuous features.
for frame in (x_train1, x_valid1):
    frame[cont_cols] = frame[cont_cols].fillna(0)


# NOTE(review): cut_date only selects the checkpoint file; the data passed to
# training_nn is the same single split for every cut date - confirm whether a
# per-cut_date split was intended.
model_nets = []
for cut_date in [20210301, 20210401, 20210501, 20210601, 20210701]:
    checkpoint = f'../input/mlb-final-models/model_nn_{cut_date}.pth'
    model_net, score_nn = training_nn(
        x_train1, y_train1, x_valid1, y_valid1,
        cont_cols, cat_cols, MLB_NN,
        model_path=checkpoint,
        epoch=50, patience=5, hidden_dim=256, device=DEVICE,
    )

    model_nets.append(model_net)

In [None]:
# Reclaim memory released by the training cells.
gc.collect()

In [None]:
# The split frames are not needed for inference.
del x_train1, y_train1, x_valid1, y_valid1
gc.collect()

In [None]:
# Keep only the rows whose date is greater than 20210101 to shrink the frame.
train = train.loc[train['date']>20210101]

In [None]:
# Reclaim the memory of the rows dropped above.
gc.collect()

# Inference

In [None]:
# Column selections for inference: the daily test tables are joined on
# playerId only, so the 'date' key used during training is dropped.
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = [name for name in scores_cols if name != 'date']

In [None]:
# At inference the statistics are merged on playerId alone, so keep only the 2021 rows.
player_target_stats=player_target_stats.loc[player_target_stats['year']==2021]

In [None]:
# Aliases for the JSON literals so that the eval() calls in the inference
# loop can evaluate the JSON text of the daily test tables as Python.
null = np.nan
true = True
false = False

# Ensemble weight of each cut-date model (the weights sum to 1.0).
weights = {20210301: 0.15, 20210401: 0.15, 20210501:0.2, 20210601:0.25, 20210701:0.25}
cut_dates = [20210301, 20210401, 20210501, 20210601, 20210701]

In [None]:
import copy

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

target_names = ['target1', 'target2', 'target3', 'target4']

for (test_df, sample_prediction_df) in iter_test: # make predictions here

    sample_prediction_df = copy.deepcopy(sample_prediction_df.reset_index(drop=True))

    # create dataset
    # playerId is the second '_'-separated field of date_playerId
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))

    # Dealing with missing values: a table missing for the day arrives as NaN
    # and is replaced by an all-NaN frame with one row per requested player.
    # NOTE(review): eval() executes the JSON text as Python (with the
    # null/true/false aliases); this is only acceptable because the text comes
    # from the competition environment - json.loads would be the safe parser.
    raw_rosters = test_df['rosters'].iloc[0]
    if pd.notna(raw_rosters):
        test_rosters = pd.DataFrame(eval(raw_rosters))
    else:
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan

    raw_scores = test_df['playerBoxScores'].iloc[0]
    if pd.notna(raw_scores):
        test_scores = pd.DataFrame(eval(raw_scores))
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan

    # One row per player; numeric_only keeps text columns out of the sum
    # (pandas >= 2.0 no longer drops them by default).
    test_scores = test_scores.groupby('playerId').sum(numeric_only=True).reset_index()

    test = sample_prediction_df[['playerId'] + target_names].copy()
    test = test.merge(players[players_cols], on='playerId', how='left')
    test = test.merge(test_rosters[rosters_cols], on='playerId', how='left')
    test = test.merge(test_scores[scores_cols], on='playerId', how='left')
    test = test.merge(player_target_stats, on='playerId', how='left')

    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)

    test_X = test[feature_cols]

    # LightGBM: weighted average over the cut-date models, one column per target.
    lgb_pred = np.zeros((len(test_X), len(target_names)))
    for cut_date in cut_dates:
        for k, lgb_model in enumerate(lgb_models[cut_date]):
            lgb_pred[:, k] += weights[cut_date] * lgb_model.predict(test_X)

    # Neural nets: NaNs are zero-filled first.
    test_X = test_X.fillna(0)

    xcont = torch.FloatTensor(test_X[cont_cols].values)
    xcat = torch.LongTensor(test_X[cat_cols].values)

    nn_pred = np.zeros((len(test_X), len(target_names)))
    with torch.no_grad():
        # The loop variable used to be misspelled `cute_date`, so every network
        # was weighted with the stale `cut_date` left over from the LightGBM
        # loop (0.25 each, i.e. a total weight of 1.25).
        for cut_date, model_net in zip(cut_dates, model_nets):
            out = model_net(xcont.to(DEVICE), xcat.to(DEVICE))
            nn_pred += weights[cut_date] * out.detach().cpu().numpy()

    # merge submission: equal blend of both model families, clipped to [0, 100]
    blended = np.clip(0.50*lgb_pred + 0.50*nn_pred, 0, 100)
    for k, target_name in enumerate(target_names):
        sample_prediction_df[target_name] = blended[:, k]

    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId']

    env.predict(sample_prediction_df)


In [None]:
# Show the submission frame of the last processed date as a sanity check.
sample_prediction_df