In [None]:
import gc
import sys
import warnings
from joblib import Parallel, delayed
from pathlib import Path

import ipywidgets as widgets
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import numpy as np
import pandas as pd
import seaborn as sns
import time

# Wall-clock reference used for the notebook's run-time budget.
global_time = time.time()

# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and newer
# releases no longer accept the old names, so use whichever spelling is installed.
_whitegrid_style = "seaborn-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-v0_8-whitegrid"
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
# Shared plotting keyword arguments (not referenced elsewhere in the visible notebook).
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)

In [None]:
# Helper function to unpack json found in daily data
def unpack_json(json_str):
    """Decode one nested JSON cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text stored in one cell of the daily data, or NaN/None.

    Returns
    -------
    pandas.DataFrame
        An empty frame when ``json_str`` is missing, otherwise the result of
        ``pd.read_json`` on the text.
    """
    from io import StringIO  # local import keeps the helper self-contained

    if pd.isna(json_str):
        return pd.DataFrame()
    # Recent pandas releases deprecate passing literal JSON text to read_json;
    # a file-like wrapper is accepted by old and new versions alike.
    return pd.read_json(StringIO(json_str))


def unpack_data(data, dfs=None, n_jobs=-1):
    """Expand every nested-JSON column of ``data`` into one flat DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells hold JSON text (or missing values), one row per day.
    dfs : list of str, optional
        Columns to unpack; all columns are unpacked when omitted.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` to control how many workers decode cells.

    Returns
    -------
    dict
        Maps each column name to the concatenation of its decoded daily frames.
        A column without any rows maps to an empty DataFrame.
    """
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    # `items()` replaces `iteritems()`, which pandas 2.0 removed.
    for name, column in data.items():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for item in column)
        # pd.concat raises on an empty list, so fall back to an empty frame.
        unnested_dfs[name] = pd.concat(daily_dfs) if daily_dfs else pd.DataFrame()
    return unnested_dfs

In [None]:
# Folder that holds the competition files.
data_dir = Path('../input/mlb-player-digital-engagement-forecasting/')

# Lookup tables that ship as plain CSV files.
df_names = ['seasons', 'teams', 'players', 'awards']

# Bind each table to a module-level variable named after its file stem.
for name in df_names:
    globals()[name] = pd.read_csv(data_dir.joinpath(f"{name}.csv"))

# Tab container with one Output widget per table.
kaggle_data_tabs = widgets.Tab()
kaggle_data_tabs.children = [widgets.Output() for _ in df_names]



# display(kaggle_data_tabs)

In [None]:
import time
# Restart the wall-clock reference here, replacing the value set in the first cell.
global_time = time.time()
max_sec = 5*3600  # run-time budget in seconds (5 hours); the training loop stops once it is exceeded

In [None]:
%%time
# Define dataframes to load from training set
# `dfs` names the nested-JSON columns that are read from the training CSV; the
# same list is reused by the unpacking step and by the Evaluator further down.
dfs = [
    'nextDayPlayerEngagement',  # targets
    'playerBoxScores',  # features
    # Other dataframes available for features:
    'games',
    'rosters',
    # 'teamBoxScores',
    # 'transactions',
    # 'standings',
    # 'awards',
    # 'events',
    'playerTwitterFollowers',
    # 'teamTwitterFollowers',
]

# Read training data, keeping the date column plus the selected nested columns
training_path = data_dir / 'train_updated.csv'
training_columns = ['date', *dfs]
training = pd.read_csv(training_path, usecols=training_columns)

# Parse the YYYYMMDD date field, then index the frame by daily period
parsed_dates = pd.to_datetime(training['date'], format="%Y%m%d")
training['date'] = parsed_dates
training = training.set_index('date').to_period('D')
# print(training.info())

In [None]:
%time
# Unpack nested dataframes and store in dictionary `training_dfs`
training_dfs = unpack_data(training, dfs=dfs)
print('\n', training_dfs.keys())

In [None]:
def add_agg_feats(df,infer=True):
    """Attach the per-date ``num_games`` aggregate to ``df`` and return it.

    For every ``gameDate`` group the share of rows whose ``home`` value is not
    NaN is computed and broadcast back to the rows of that group; rows without
    a result get 0. The column is multiplied by 0.78 when ``infer`` is true and
    left unscaled otherwise. ``df`` is modified in place.
    """
    scale = 0.78 if infer else 1

    def _share_present(values):
        # `values != values` is true exactly where the entry is NaN
        return 1 - (values != values).mean()

    per_date = df.groupby('gameDate')['home'].transform(_share_present)
    df['num_games'] = scale * per_date.fillna(0)
    # df['num_runs']  = f2*df.groupby('gameDate')['homeRuns'].transform('mean')
    return df

In [None]:
# Key columns identifying one row (game date + player); used as merge keys in merge_df.
info_feats = ['gameDate','playerId']
# Categorical columns that merge_df keeps from the merged game data.
cat_feats_base = ['positionCode','teamId','statusCode','home']
# Categorical columns that cat_transform integer-encodes and the datasets one-hot
# encode; 'month' is derived from gameDate inside merge_df.
cat_feats = ['positionCode','teamId','statusCode','month']
# Numeric model inputs.
feats = ['num_games','numberOfFollowers','diff', 'flyOuts',
        'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
        'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
        'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
        'plateAppearances', 'totalBases', 'rbi',
        'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
        'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
        'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
        'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
        'groundOutsPitching', 'runsPitching', 'doublesPitching',
        'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
        'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
        'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
        'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
        'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
        'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
        'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
        'inheritedRunnersScored', 'catchersInterferencePitching',
        'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
        'assists', 'putOuts', 'errors', 'chances']

# Columns holding the four prediction targets.
tar_cols = ['target1','target2','target3','target4']

# Per-feature divisors: merge_df divides each listed column by its entry.
# Presumably standard deviations measured on the training data with a floor of
# 0.2 — TODO confirm where these numbers were computed.
feat_std = {'num_games': 0.2,
     'numberOfFollowers': 2.0203003106791173,
     'diff': 4.5429028238121605,
     'flyOuts': 0.5994564164812675,
     'groundOuts': 0.880031028873754,
     'runsScored': 0.6354249280932919,
     'doubles': 0.3742551786113168,
     'triples': 0.2,
     'homeRuns': 0.32624555685689094,
     'strikeOuts': 0.8406402289572776,
     'baseOnBalls': 0.5371370684035869,
     'intentionalWalks': 0.2,
     'hits': 0.8583428342850036,
     'hitByPitch': 0.2,
     'atBats': 1.6478300549887628,
     'caughtStealing': 0.2,
     'stolenBases': 0.21019790941512004,
     'groundIntoDoublePlay': 0.23986575244480057,
     'plateAppearances': 1.7689265657741993,
     'totalBases': 1.7100020604796016,
     'rbi': 0.7721294644017079,
     'leftOnBase': 1.3728075259808412,
     'sacBunts': 0.2,
     'sacFlies': 0.2,
     'catchersInterference': 0.2,
     'pickoffs': 0.2,
     'gamesPlayedPitching': 0.2,
     'gamesStartedPitching': 0.4190423436473443,
     'completeGamesPitching': 0.2,
     'shutoutsPitching': 0.2,
     'winsPitching': 0.33595393562809456,
     'lossesPitching': 0.3351618729886152,
     'flyOutsPitching': 1.3295187236130297,
     'airOutsPitching': 2.3056747952654946,
     'groundOutsPitching': 2.3446545443246705,
     'runsPitching': 1.6076997348681283,
     'doublesPitching': 0.7169228004951924,
     'triplesPitching': 0.2,
     'homeRunsPitching': 0.6054885071757616,
     'strikeOutsPitching': 2.2404138070954094,
     'baseOnBallsPitching': 1.0345850662281675,
     'intentionalWalksPitching': 0.2,
     'hitsPitching': 2.2465570005332784,
     'hitByPitchPitching': 0.31987897336605925,
     'atBatsPitching': 7.387801783173313,
     'caughtStealingPitching': 0.203205178873725,
     'stolenBasesPitching': 0.38069606988055466,
     'inningsPitched': 1.9883795796409067,
     'saveOpportunities': 0.28274591448900555,
     'earnedRuns': 1.5271776986732541,
     'battersFaced': 8.091299548597068,
     'outsPitching': 5.895419109629862,
     'pitchesThrown': 31.36018795374351,
     'balls': 11.643084300453825,
     'strikes': 20.21716920309324,
     'hitBatsmen': 0.31987897336605925,
     'balks': 0.2,
     'wildPitches': 0.3076736135218784,
     'pickoffsPitching': 0.2,
     'rbiPitching': 1.5291092688313006,
     'gamesFinishedPitching': 0.41729463178456044,
     'inheritedRunners': 0.7408924572465002,
     'inheritedRunnersScored': 0.4191755714981905,
     'catchersInterferencePitching': 0.2,
     'sacBuntsPitching': 0.2,
     'sacFliesPitching': 0.23749239476534356,
     'saves': 0.2515259923185149,
     'holds': 0.35140201229516216,
     'blownSaves': 0.2,
     'assists': 1.2426148963769565,
     'putOuts': 3.090758413548205,
     'errors': 0.2244738592201668,
     'chances': 3.2862113104708066}


In [None]:
from datetime import timedelta

def merge_df(test_df,dfs,sort=False,infer=True):
    """Build the feature table for the rows of ``test_df``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows to predict or train on; needs ``engagementMetricsDate`` and
        ``playerId``. A ``gameDate`` column is added to it in place.
    dfs : dict
        Frames keyed by ``'playerBoxScores'``, ``'games'``, ``'rosters'`` and
        ``'playerTwitterFollowers'``. Their ``gameDate`` columns are converted
        in place.
    sort : bool, default False
        Sort the result by ``gameDate``.
    infer : bool, default True
        When false, ``gameDate`` is the engagement date minus one day; when
        true the engagement date is used unchanged. Also forwarded to
        ``add_agg_feats``.

    Returns
    -------
    pandas.DataFrame
        ``test_df`` left-joined with one row of game data per (date, player),
        plus a ``month`` column, with every ``feat_std`` column rescaled.
    """
    stamp = 'datetime64[ns]'
    row_key = ['gameDate', 'playerId']

    # Engagement rows are matched to game data through a shared `gameDate` key.
    test_df['gameDate'] = test_df['engagementMetricsDate'].astype(stamp)
    if not infer:
        test_df['gameDate'] = test_df['gameDate'] - timedelta(days=1)

    # Follower snapshots carry `date` instead of `gameDate`.
    followers = dfs['playerTwitterFollowers']
    followers['gameDate'] = followers['date'].astype(stamp)
    for frame in dfs.values():
        frame['gameDate'] = frame['gameDate'].astype(stamp)

    merged = dfs['playerBoxScores'].merge(dfs['games'], how='outer', on=['gameDate', 'gamePk'])
    score_gap = merged['homeScore'] - merged['awayScore']
    merged['diff'] = score_gap.fillna(0).values
    merged = merged.merge(dfs['rosters'], how='outer', on=['gameDate', 'teamId', 'playerId'])
    merged = merged.merge(followers, how='outer', on=row_key)

    merged = add_agg_feats(merged, infer)
    merged = merged[info_feats + cat_feats_base + feats]
    # Keep the first row for each (date, player) pair.
    merged = merged.groupby(row_key).head(1)
    merged = test_df.merge(merged, how='left', on=row_key)
    if sort:
        merged = merged.sort_values('gameDate')

    merged['numberOfFollowers'] = merged['numberOfFollowers'].values/100000
    merged['month'] = merged['gameDate'].apply(lambda x: int(x.month)).values.astype('int')

    # Rescale every numeric feature by its fixed divisor.
    for column, divisor in feat_std.items():
        merged[column] = merged[column].values/divisor

    return merged


# Training table: engagement rows joined with every other unpacked frame, in date order.
df_train = merge_df(
    training_dfs['nextDayPlayerEngagement'],
    {
        name: frame
        for name, frame in training_dfs.items()
        if name != 'nextDayPlayerEngagement'
    },
    sort=True,
    infer=False,
)

In [None]:
# Mean of each target over all rows sharing a gameDate, broadcast back to those rows.
for i in range(1, 5):
    target_name = tar_cols[i - 1]
    date_mean = df_train.groupby('gameDate')[target_name].transform('mean')
    df_train[f'agg_tars_{i}'] = date_mean.fillna(0)

In [None]:
import gc

# Keep only the first rows of every unpacked frame; later code only needs
# their column names.
for key in tuple(training_dfs.keys()):
    training_dfs[key] = training_dfs[key].head(5)

gc.collect()

In [None]:
# Number the distinct values of every categorical column in order of first appearance.
mapper = {}
for cat in cat_feats:
    distinct_values = df_train[cat].unique()
    mapper[cat] = {value: code for code, value in enumerate(distinct_values)}
print(mapper)

In [None]:
# Fixed value -> integer code tables, one per categorical column. This literal
# replaces the mapper built in the previous cell. NaN maps to code 0 wherever it
# is listed. The 'home' table is not used by cat_transform, which only walks
# `cat_feats`.
mapper = {'home':{np.nan:0,0:1,1:1},
        'positionCode': {np.nan: 0, 11.0: 1, 1.0: 2, 8.0: 3, 9.0: 4, 3.0: 5, 7.0: 6, 2.0: 7, 10.0: 8, 4.0: 9, 6.0: 10, 5.0: 11, 12.0: 12}, 
        'teamId': { np.nan: 0,119.0: 1, 115.0: 2, 120.0: 3, 145.0: 4, 118.0: 5, 134.0: 6, 139.0: 7, 135.0: 8, 111.0: 9, 140.0: 10, 121.0: 11, 143.0: 12, 109.0: 13, 147.0: 14, 117.0: 15, 146.0: 16, 158.0: 17, 112.0: 18, 133.0: 19, 141.0: 20, 142.0: 21, 114.0: 22, 108.0: 23, 136.0: 24, 144.0: 25, 110.0: 26, 138.0: 27, 113.0: 28, 137.0: 29, 116.0: 30, 159.0: 31, 160.0: 32}, 
        'statusCode': {np.nan: 0,'A': 1, 'D60': 2, 'RM': 3, 'D10': 4, 'D7': 5, 'PL': 6, 'SU': 7, 'FME': 8, 'BRV': 9, 'RES': 10, 'DEC': 11}, 
        'month': {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11}}

def cat_transform(df):
    """Replace every column listed in ``cat_feats`` by its integer code.

    Values are looked up in ``mapper``; anything without an entry becomes 0.
    ``df`` is modified in place and also returned. Calling this twice on the
    same frame re-maps the codes themselves, so apply it once per frame.
    """
    for column in cat_feats:
        codes = df[column].map(mapper[column])
        df[column] = codes.fillna(0).astype(int)
    return df

# Integer-encode the categorical columns of the training table.
df_train = cat_transform(df_train)

# One-hot width per categorical column: largest code present in the training table, plus one.
NUM_CATS = [df_train[column].max() + 1 for column in cat_feats]

In [None]:
# Renumber the rows 0..n-1: MLBdata uses the index labels returned by
# groupby(...).groups as positions into its tensors.
df_train.reset_index(drop=True, inplace=True)
df_train.shape

In [None]:
import math
import torch
from torch.utils.data import Dataset, DataLoader
import random

NUM = 32  # output window: the model emits predictions for the last NUM time steps
NUM_1 = 3  # history multiplier: each sample spans at most NUM_1*NUM time steps

BATCH_SIZE  = 64  # DataLoader batch size; also used by MLBdata.__len__ in training mode
UNITS = 128  # hidden width passed to MLB_Model

class MLBdata(Dataset):
    """Per-player window dataset over the merged engagement table.

    Every item is a ``(feats, cat_feats, tars)`` triple built from up to
    ``NUM_1*NUM`` consecutive rows of one player. The table's index labels are
    used as tensor positions, so ``data`` must be indexed 0..n-1.
    """
    def __init__(self,data,ind,infer=False):
        """Pre-compute the tensors that windows are sliced from.

        data  -- merged table containing `feats`, `cat_feats`, `tar_cols`,
                 `agg_tars_1..4`, `playerId` and `gameDate`.
        ind   -- boolean mask over the rows of `data` selecting this split.
        infer -- False: random training windows; True: one deterministic
                 window per selected row.
        """
        self.infer = infer
        # Positions of the rows selected by the boolean mask.
        self.ind = np.arange(len(ind))[ind]
        self.data = data
        self.date = data['gameDate'].values
        # Missing numeric features are encoded as -1, missing targets as 0.
        self.feats = torch.from_numpy(data[feats].fillna(-1).values).float()
        self.cat_feats = data[cat_feats].values.astype('int')
        self.num_cats = NUM_CATS
        self.tars = torch.from_numpy(data[tar_cols].fillna(0).values).float()
        # playerId -> list of that player's row labels, in table order.
        self.player_idx = data.groupby('playerId').groups
        self.player_idx = {g:list(l) for g,l in self.player_idx.items()}
        self.agg_tars = torch.from_numpy(data[[f'agg_tars_{k}' for k in range(1,5)]].values).float()
                
    def __len__(self):
        """Epoch length: 8*len(ind)/BATCH_SIZE random draws when training, one item per selected row otherwise."""
        if not self.infer:
            return int(8*len(self.ind)/BATCH_SIZE)
        else:
            return len(self.ind)

        
    def __getitem__(self,idx):
        """Return ``(feats, cat_feats, tars)`` for one window.

        In training mode `idx` is ignored and the window is drawn at random.
        `tars` always has NUM rows; rows equal to -1 are padding or masked
        steps.
        """
        if not self.infer:
            # Training: the random row only decides which player is sampled.
            idx_0 = random.choice(self.ind)
        else:
            idx_0 = self.ind[idx]
        ind = self.player_idx[self.data.loc[idx_0,'playerId']] # all ind of player
        # This loop body always runs exactly once (time_Ok is set unconditionally).
        time_Ok = False
        while not time_Ok:
            if self.infer:
                # Position of the requested row inside the player's history.
                idx = ind.index(idx_0)
            else:
                # NOTE(review): raises ValueError when the player has no more than
                # NUM*NUM_1 rows, and draws from all of the player's rows rather
                # than only those selected by `self.ind` — confirm this is intended.
                idx = random.randint(NUM*NUM_1,len(ind)-1)  
            # Number of trailing steps whose targets are hidden from the inputs.
            idx_1 = random.randint(1,min(len(ind),NUM))
            time_Ok = True
            
        num = NUM_1*NUM

        # ind_1: rows of the whole window; ind_2: the leading rows whose targets are visible.
        ind_1 = np.array(ind)[max(0,idx+1-num):idx+1] # all
        ind_2 = ind[max(0,idx+1-num):idx+1-idx_1] # tar range        
        ###
        # date = np.concatenate([np.zeros(1),np.diff(self.date[ind_1]).astype('float')/(3600*24*1e9)]).astype('int')

        ###
        feats = self.feats[ind_1]
        ###
        tar_feats = self.tars[ind_2]
        ###
        # Mean and median of the visible targets, repeated for every step of the window.
        tar_feats_1 = torch.cat([tar_feats.mean(0),tar_feats.median(0)[0]],-1)
        tar_feats_1 = tar_feats_1[None,...] + 0*torch.zeros(len(feats),1)
        ###
        # 9 columns per step: 4 targets, a flag column (0 here), 4 per-date target means.
        tar_feats = torch.cat([self.tars[ind_2],0*tar_feats[:,:1],self.agg_tars[ind_2]],-1)
        # Steps with hidden targets get all-zero rows with the flag column set to 1.
        tar_pad = torch.zeros((len(feats)-len(tar_feats),9))
        tar_pad[:,4] = 1
        tar_feats = torch.cat([torch.log(1+tar_feats),tar_pad],0)
        feats = torch.cat([tar_feats,feats,tar_feats_1],-1)
        ###
        
        # One-hot encode each categorical column by picking rows of an identity matrix.
        cat_feats = []
        for k,num in enumerate(self.num_cats):
            cat_feats.append(torch.eye(num)[self.cat_feats[ind_1,k]]) 

        cat_feats = torch.cat(cat_feats,-1)
        ###
        tars = self.tars[ind_1[-NUM:]]
        
        # Windows shorter than NUM are left-padded: zeros for inputs, -1 for targets.
        if len(feats) < NUM:
            d_num = NUM-len(feats)
            feats = torch.cat([torch.zeros(d_num,feats.size(1)),feats])
            cat_feats = torch.cat([torch.zeros(d_num,cat_feats.size(1)),cat_feats])
            tars = torch.cat([-torch.ones(d_num,tars.size(1)),tars])

        # Only the last idx_1 steps keep their targets; everything before is set to -1.
        tars[:-idx_1,:] = -1

        return feats,cat_feats,tars
                        
# Draw one sample to read off the numeric and one-hot feature widths, then show the dataset length.
every_row = np.ones(len(df_train)) == 1
x, x1, t = MLBdata(df_train, every_row)[10000]
NUM_F, NUM_C = x.shape[1], x1.shape[1]
len(MLBdata(df_train, every_row))

In [None]:
from torch import nn
import torch.nn.functional as F


class ResBlock(nn.Module):
    """Residual two-layer perceptron block that preserves the feature width.

    ``in_units`` is the input/output width; ``units`` is the hidden width and
    defaults to twice ``in_units``.
    """

    def __init__(self,in_units,units=None):
        super().__init__()
        hidden_width = 2*in_units if units is None else units

        # self.norm = nn.LayerNorm(normalized_shape=in_units, eps=1e-12) # nn.BatchNorm1d(in_units)
        self.hidden = nn.Linear(in_units, hidden_width)
        self.out = nn.Linear(hidden_width, in_units)

    def forward(self,x0):
        """Return ``leaky_relu(x0 + out(leaky_relu(hidden(x0))))``."""
        expanded = F.leaky_relu(self.hidden(x0))    # self.norm(x0)))
        return F.leaky_relu(x0 + self.out(expanded))
    
    
class Wave_Block(nn.Module):
    """Stack of gated, dilated 1-D convolutions with a running residual sum.

    The input is ``(batch, steps, channels)``. Convolutions are unpadded, so
    every level shortens the sequence and the output has fewer steps than the
    input.
    """

    def __init__(self, in_channels=UNITS, out_channels=UNITS, dilation_rates=6, kernel_size=2):
        super().__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()

        # Entry 1x1 convolution, followed by one (filter, gate, 1x1) triple per dilation level.
        self.convs.append(nn.Conv1d(in_channels, out_channels, kernel_size=1))
        for level in range(dilation_rates):
            dilated = dict(kernel_size=kernel_size, padding=0, dilation=2 ** level)
            self.filter_convs.append(nn.Conv1d(out_channels, out_channels, **dilated))
            self.gate_convs.append(nn.Conv1d(out_channels, out_channels, **dilated))
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1))

    def forward(self, x):
        """Apply all levels and return the residual sum as ``(batch, steps', channels)``."""
        _batch, _steps, _width = x.size()  # unpacking fails for inputs that are not 3-D
        x = self.convs[0](x.transpose(1,2))
        res = x
        for level in range(self.num_rates):
            filtered = F.leaky_relu(self.filter_convs[level](x))
            gate = torch.sigmoid(self.gate_convs[level](x))
            x = F.leaky_relu(self.convs[level + 1](filtered * gate))
            # Align the residual with the shortened sequence before adding.
            res = res[...,-x.size(-1):] + x
        return res.transpose(1,2)
    
    
class MLB_Model(nn.Module):
    """Sequence model producing four non-negative outputs for the last NUM steps.

    Numeric and one-hot inputs are projected separately, merged, passed through
    residual blocks and a GRU; the pre-GRU and GRU features of the last NUM
    steps are concatenated and fed to a three-layer head.
    """

    def __init__(self,units):
        super().__init__()

        cat_width = 24
        n_res_blocks = 2
        p_drop_merged = .12
        p_drop_head = .16

        self.in_lay = nn.Linear(NUM_F, NUM_F)
        self.in_lay_c = nn.Linear(NUM_C, cat_width)

        self.merge = nn.Linear(NUM_F + cat_width, units)
        self.process = nn.Sequential(*[ResBlock(units) for _ in range(n_res_blocks)])

        self.drop_0 = nn.Dropout(p_drop_merged)
        self.drop_1 = nn.Dropout(p_drop_head)

        # self.wave = Wave_Block()
        self.rnn = nn.GRU(units, units, batch_first=True) # LayerNormGRU(units,units)

        self.out_1 = nn.Linear(2*units, 2*units)
        self.out_2 = nn.Linear(2*units, 2*units)
        self.out_3 = nn.Linear(2*units, 4)

    def forward(self,x,x_c):
        """Map numeric inputs ``x`` and one-hot inputs ``x_c`` to per-step predictions."""
        numeric = F.leaky_relu(self.in_lay(x))
        categorical = F.leaky_relu(self.in_lay_c(x_c))
        hidden = F.leaky_relu(self.merge(torch.cat([numeric, categorical], -1)))

        hidden = self.process(self.drop_0(hidden))
        # x_0 = self.wave(x)
        recurrent, _ = self.rnn(hidden)
        # Keep only the last NUM steps of both feature streams.
        head = torch.cat([hidden[:, -NUM:, :], recurrent[:, -NUM:, :]], -1)

        head = self.drop_1(head)
        head = F.leaky_relu(self.out_1(head))
        head = F.leaky_relu(self.out_2(head))
        return F.relu(self.out_3(head))
        
        
# MLB_Model(UNITS)(x[None,],x1[None,]).shape

In [None]:

class Trainer():
    """Training and validation driver for ``MLB_Model``.

    Rows of ``data`` dated May 2021 form the validation split; all other rows
    form the training split. A checkpoint is written to the working directory
    after every epoch.
    """

    def __init__(self,data,pretrained=None):
        """Split ``data``, build both datasets and create the model.

        Parameters
        ----------
        data : pandas.DataFrame
            Merged table accepted by ``MLBdata``; ``gameDate`` must be a
            datetime column.
        pretrained : str or path-like, optional
            File written by ``torch.save(model.state_dict(), ...)`` to start from.
        """
        # Build the hold-out mask from the frame that was passed in (not from the
        # notebook-level `df_train`) so the mask always lines up with `data`.
        game_dates = data['gameDate']
        val_ind = (game_dates.dt.year == 2021) & (game_dates.dt.month == 5)

        train_ind = ~val_ind
        self.data_t = MLBdata(data,train_ind)
        self.data_v = MLBdata(data,val_ind,True)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = MLB_Model(UNITS).to(self.device)
        if pretrained is not None:
            # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
            state = torch.load(pretrained, map_location=self.device)
            self.model.load_state_dict(state)

    def train_step(self,batch,val=False):
        """Process one batch of ``(feats, cat_feats, tars)``.

        With ``val=True`` no gradient is computed and a tuple
        ``(L1 loss of the last step, predictions, targets)`` for the last time
        step is returned. Otherwise one optimizer step is taken and the mean
        absolute error over all unmasked targets is returned as a float.
        """
        t = batch[-1].to(self.device)
        batch = [x.to(self.device) for x in batch[:-1]]

        if val:
            with torch.no_grad():
                p = self.model(*batch)
            return nn.L1Loss()(p[:,-1,:],t[:,-1,:]).item(),p[:,-1,:],t[:,-1,:]

        self.opt.zero_grad()
        p = self.model(*batch)
        # Targets equal to -1 mark padded or masked steps and are left out of the loss.
        mask = t!=-1
        loss = torch.abs(p[mask]-t[mask]).mean()
        loss.backward()
        # if random.randint(0,100)==0:
        #     plot_grad_flow(self.model.named_parameters())

        l = loss.item()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.,norm_type='inf')
        self.opt.step()
        return l

    def train_one_epoch(self,loader,val=False):
        """Run ``train_step`` over ``loader`` and return the list of batch losses.

        In validation mode the last-step predictions and targets of all batches
        are collected into ``self.preds`` and ``self.tars``. The loop stops
        early, setting ``self.end``, once the notebook's time budget
        (``max_sec`` seconds since ``global_time``) is used up.
        """
        if val:
            self.infer = True
            self.preds = []
            self.tars = []
        else:
            self.infer = False

        losses = []
        self.iter = 0
        for b in loader:
            loss = self.train_step(b,val)
            if self.infer:
                self.preds.append(loss[1].detach().cpu().numpy())
                self.tars.append(loss[2].detach().cpu().numpy())
                loss = loss[0]
            losses.append(loss)
            if time.time() - global_time > max_sec:
                self.end = True
                break
        if self.infer:
            self.preds = np.concatenate(self.preds)
            self.tars = np.concatenate(self.tars)

        return losses

    def validate(self):
        """Evaluate on the validation split, print and return the mean batch loss."""
        self.model.eval()
        loader = DataLoader(self.data_v,batch_size=BATCH_SIZE)
        losses = self.train_one_epoch(loader,True)
        print('Validation loss:',np.array(losses).mean())
        self.model.train()
        return np.array(losses).mean()

    def train(self,epochs,lr=2e-3,end_lr=5e-4):
        """Train for up to ``epochs`` epochs with a geometrically decaying learning rate.

        After every epoch the loss curve is plotted, the model is validated and
        saved as ``weights_<epoch>.path``, and the learning rate is multiplied
        by ``(end_lr/lr)**(1/(1+epochs))``. On return ``self.model_paths``
        lists the (up to) six checkpoints with the lowest validation loss,
        best first.
        """
        self.num_epochs = epochs
        self.best_loss = 100
        self.model_paths = {}
        self.end = False

        fact = (end_lr/lr)**(1/(1+epochs))
        print(fact)
        self.opt = torch.optim.Adam(self.model.parameters(),lr=lr,betas=(0.9,0.95),weight_decay=6e-6)
        loader = DataLoader(self.data_t,batch_size=BATCH_SIZE,shuffle=True)

        for ep in range(epochs):
            losses = self.train_one_epoch(loader)
            plt.plot(losses)
            plt.show()
            print(f'loss for epoch {ep}:',np.array(losses).mean())
            val_loss = self.validate()
            # if (ep >= self.num_epochs-10):
            path = f'weights_{ep}.path'
            torch.save(self.model.state_dict(),path)
            self.model_paths[path] = val_loss
            if val_loss < self.best_loss:
                self.best_loss = val_loss
            lr *= fact
            for g in self.opt.param_groups:
                g['lr'] = lr
            # Set by train_one_epoch when the time budget has run out.
            if self.end:
                break
        paths = sorted(self.model_paths.items(),key=lambda x: x[1],reverse=False)
        self.model_paths = [p for p,_ in paths[:6]]
        print(self.model_paths)
                
# Train from scratch; pass a checkpoint path (e.g. './weights_2.path') as `pretrained` to start from saved weights.
tr = Trainer(df_train, pretrained=None)

tr.train(epochs=30, lr=1.6e-3, end_lr=8e-4)

In [None]:
class Evaluator():
    """Day-by-day inference helper for the competition's test iterator.

    Keeps the historical table, appends the rows of every new test day to it
    and builds for each new row the same kind of window that ``MLBdata``
    produces, using only targets from the historical table.
    """

    def __init__(self,models,data):
        """Store the ensemble and pre-compute tensors from the historical table.

        models -- iterable of trained models whose outputs are averaged.
        data   -- historical merged table (index 0..n-1) with targets and
                  ``agg_tars_*`` columns.
        """
        self.data = data
        self.models = models
        self.dfs = [d for d in dfs if d!='nextDayPlayerEngagement']
        self.num_cats = NUM_CATS
        # Missing targets become 0, matching how MLBdata prepares targets for training.
        self.tars = torch.from_numpy(data[tar_cols].fillna(0).values).float()
        self.update()
        # Snapshot of the per-player row lists of the historical table; unlike
        # `player_idx` it is not refreshed when test rows are appended.
        self.player_idx_0 = {g:list(l) for g,l in self.player_idx.items()}
        self.agg_tars = torch.from_numpy(data[[f'agg_tars_{k}' for k in range(1,5)]].values).float()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def read_json(self,samp_df,test_df):
        """Decode the nested frames of one test day.

        Returns a dict keyed like ``self.dfs``. A key whose cell is missing gets
        a placeholder frame with one row per player in ``samp_df``: date-like
        columns hold the engagement date, every other column is NaN.
        """
        from io import StringIO  # local import keeps the method self-contained

        dfs_test = {}
        for key in self.dfs:
            raw = test_df[key].iloc[0]
            if raw == raw:  # false only for NaN, i.e. a missing cell
                # Recent pandas releases deprecate literal JSON text in read_json;
                # a file-like wrapper works on old and new versions.
                df_temp = pd.read_json(StringIO(raw))
            else:
                df_temp = pd.DataFrame({'playerId': samp_df['playerId']})
                for col in training_dfs[key].keys():
                    if col == 'playerId':
                        continue
                    if 'date' in col.lower():
                        df_temp[col] = samp_df['engagementMetricsDate'].values
                    else:
                        df_temp[col] = np.nan
            dfs_test[key] = df_temp
        return dfs_test

    def unpack(self,samp_sub,test_dfs):
        """Turn one test day into feature rows and append them to ``self.data``.

        Sets ``self.ind`` to the positions of the appended rows.
        """
        test_dfs = self.read_json(samp_sub,test_dfs)
        pred_df = merge_df(samp_sub,test_dfs).reset_index(drop=True)
        pred_df = cat_transform(pred_df)
        self.start_index = self.data.index.max()
        pred_df.index += self.start_index
        self.data = pd.concat([self.data,pred_df]).reset_index(drop=True)
        # After the reset the new rows start right behind the previous last row.
        self.start_index += 1
        self.ind = self.start_index + np.arange(len(pred_df))

    def update(self):
        """Rebuild feature arrays and per-player row lists from ``self.data``."""
        self.date = self.data['gameDate'].values
        self.feats = torch.from_numpy(self.data[feats].fillna(-1).values).float()
        self.cat_feats = self.data[cat_feats].values.astype('int')
        self.player_idx = self.data.groupby('playerId').groups
        self.player_idx = {g:list(l) for g,l in self.player_idx.items()}

    def __getitem__(self,idx):
        """Return ``(feats, cat_feats)`` for the ``idx``-th row of the current test day."""
        idx_0 = self.ind[idx]
        id_ = self.data.loc[idx_0,'playerId']
        ind = self.player_idx[id_] # ind updated data
        ind_0 = self.player_idx_0[id_] # ind old data

        # Number of rows appended for this player since construction.
        delta = len(ind) - len(ind_0)

        num = NUM_1*NUM
        ##
        ind_1 = ind[-num:]
        # NOTE(review): this slice relies on `delta-num` being negative; once
        # `delta >= num` it would start from the front of the history — confirm
        # the evaluation horizon stays below NUM_1*NUM rows per player.
        ind_2 = ind_0[delta-num:]
        ##########
        feats = self.feats[ind_1]
        ###
        tar_feats = self.tars[ind_2]
        # Mean and median of the known targets, repeated for every step of the window.
        tar_feats_1 = torch.cat([tar_feats.mean(0),tar_feats.median(0)[0]],-1)
        tar_feats_1 = tar_feats_1[None,...] + 0*torch.zeros(len(feats),1)
        ###
        # 9 columns per step: 4 targets, a flag column (0 here), 4 per-date target means.
        tar_feats = torch.cat([self.tars[ind_2],0*tar_feats[:,:1],self.agg_tars[ind_2]],-1)
        # Steps without known targets get all-zero rows with the flag column set to 1.
        tar_pad = torch.zeros((len(feats)-len(tar_feats),9))
        tar_pad[:,4] = 1
        tar_feats = torch.cat([torch.log(1+tar_feats),tar_pad],0)
        feats = torch.cat([tar_feats,feats,tar_feats_1],-1)
        ###

        # One-hot encode each categorical column by picking rows of an identity matrix.
        cat_feats = []
        for k,num in enumerate(self.num_cats):
            cat_feats.append(torch.eye(num)[self.cat_feats[ind_1,k]])
        cat_feats = torch.cat(cat_feats,-1)

        return feats,cat_feats

    def make_preds(self):
        """Average the models' last-step outputs for every row of the current test day."""
        preds = []
        # print(self.ind)
        # print(self.start_index)
        for idx in range(len(self.ind)):
            inp = self.__getitem__(idx)
            inp = [x[None,...].to(self.device) for x in inp]
            p = 0
            for m in self.models:
                with torch.no_grad():
                    p += (m(*inp).cpu().numpy()/len(self.models))[:,-1,:]
            preds.append(p)
        preds = np.concatenate(preds,0)

        return preds

    def infer(self,sub_df,test_inp):
        """Predict one test day and return a frame with columns ``target1``..``target4``."""
        self.unpack(sub_df,test_inp)
        self.update()
        pred = self.make_preds()
        # preds /= np.arange(len(deltas)+1).sum()
        self.pred_df = pd.DataFrame(pred,columns=['target1','target2','target3','target4'])
        return self.pred_df
        
        
# E = Evaluator([tr.model],df_train.head(100))

# Create Submission #

In [None]:
%%time
import mlb
import gc

# Rebuild one model per selected checkpoint; the Evaluator averages their outputs.
models = []
for p in tr.model_paths:
    model = MLB_Model(UNITS).to(tr.device).eval()
    model.load_state_dict(torch.load(p))
    models.append(model)
    
# The trainer (with its datasets) is no longer needed once the checkpoints are loaded.
del tr
gc.collect()

# Keep only the most recent rows as history for inference.
df_train = df_train.tail(1000000).reset_index(drop=True)
E = Evaluator(models,df_train)
env = mlb.make_env()
iter_test = env.iter_test()
step = 0
for (test_df, sample_prediction_df) in iter_test:
    # Unpack features from test_df
    print(step)
    gc.collect()
    step += 1
    # print(sample_prediction_df.head())
    # The index of the sample frame supplies the date; it is parsed into a datetime column.
    samp_df = sample_prediction_df.reset_index(drop=True).copy()
    samp_df['engagementMetricsDate'] = sample_prediction_df.index.astype(str)
    samp_df['engagementMetricsDate']  = samp_df['engagementMetricsDate'] .astype('datetime64[ns]')
    
    # print(samp_df.head())
    # `date_playerId` is split on '_' and its second part is used as the player id.
    samp_df['playerId'] = samp_df['date_playerId'].map(lambda x: int(x.split('_')[1]))
    
    pred_df = E.infer(samp_df,test_df)

    # Predictions are clipped to the range [0, 100] before being written back.
    sample_prediction_df.loc[:,tar_cols] =  np.clip(pred_df.loc[:,tar_cols].values, 0, 100)
    
    print(sample_prediction_df[['target1','target2','target3','target4']].mean())

    # Submit predictions
    env.predict(sample_prediction_df)  # constructs submissions.csv