### CREDITS
* [baseline average 1.47](https://www.kaggle.com/mlconsult/baseline-average-1-47)
* [BaseLine Model: Player Mean or Median ?](https://www.kaggle.com/ulrich07/baseline-model-player-mean-or-median)
* [Fork - MLB baseline average 1.47](https://www.kaggle.com/junichih/mlb-baseline-median-1-45) 


In [None]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm
import gc
from functools import reduce
from sklearn.model_selection import StratifiedKFold

In [None]:
def make_df(df, col, bool_in=False):
    """Expand a column of serialized record lists into one flat DataFrame.

    Every non-null entry of ``df[col]`` is parsed as a Python literal and the
    parsed sequences are concatenated in row order.
    (Assumes each entry is the text of a list of dict records - TODO confirm
    against the input files.)

    Args:
        df: Source DataFrame. ``col`` is dropped from it in place.
        col: Name of the column holding the serialized text.
        bool_in: When true, the bare words ``false``/``true`` are rewritten to
            the strings ``"False"``/``"True"`` before parsing.

    Returns:
        A new DataFrame built from all parsed records; an empty DataFrame
        when ``col`` has no non-null entries.
    """
    # Local imports keep this cell self-contained.
    from ast import literal_eval
    from itertools import chain

    tp = df.loc[~df[col].isnull(), [col]].copy()
    df.drop(col, axis=1, inplace=True)
    if tp.empty:
        # Nothing to parse; summing an empty column would yield the int 0.
        gc.collect()
        return pd.DataFrame()

    # Literal (non-regex) replacements turn JSON keywords into Python literals.
    tp[col] = tp[col].str.replace("null", '""', regex=False)
    if bool_in:
        tp[col] = tp[col].str.replace("false", '"False"', regex=False)
        tp[col] = tp[col].str.replace("true", '"True"', regex=False)

    # literal_eval only accepts literals, so text from the data files can
    # never execute code the way eval() could.
    parsed = tp[col].apply(literal_eval)
    # Linear-time concatenation instead of repeated list addition.
    records = list(chain.from_iterable(parsed))
    gc.collect()
    return pd.DataFrame(records)
#===============

In [None]:
# Competition input directory; in this notebook it is only referenced by the
# commented-out train.csv read.
ROOT_DIR = "../input/mlb-player-digital-engagement-forecasting"

## UTILITY FUNCTIONS

In [None]:
#=======================#
def flatten(df, col):
    """Pivot ``col`` to wide form: one row per player, one column per EvalDate.

    Args:
        df: Long-format frame with ``playerId``, ``EvalDate`` and ``col``.
        col: Name of the value column to spread.

    Returns:
        A frame with ``playerId`` plus one ``{col}_{EvalDate}`` column for
        every distinct ``EvalDate`` value.
    """
    wide = df.pivot(index="playerId", columns="EvalDate", values=col)
    wide = wide.add_prefix(f"{col}_")
    # Drop the "EvalDate" label from the column axis before un-indexing.
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()
#============================#
def reducer(left, right):
    """Join two per-player frames on ``playerId`` (default inner join)."""
    return pd.merge(left, right, on="playerId")
#========================

In [None]:
TGTCOLS = ["target1", "target2", "target3", "target4"]


def train_lag(df, lag=1):
    """Attach the target values observed ``lag`` days earlier to every row.

    Args:
        df: Frame with ``playerId``, ``EvalDate`` and the ``TGTCOLS`` columns.
        lag: Number of days to look back.

    Returns:
        ``df`` left-joined with its own targets shifted forward by ``lag``
        days; the new columns are suffixed ``_{lag}`` and are NaN where no
        earlier row exists.
    """
    # Moving the dates forward lets an equality join pick up the older values.
    shifted = df[["playerId", "EvalDate", *TGTCOLS]].assign(
        EvalDate=lambda frame: frame["EvalDate"] + timedelta(days=lag)
    )
    return df.merge(
        shifted,
        how="left",
        on=["playerId", "EvalDate"],
        suffixes=["", f"_{lag}"],
    )
#=================================
def test_lag(sub):
    """Build the lagged-target feature frame for one submission date.

    Adds a ``playerId`` column to ``sub`` in place (parsed from
    ``date_playerId``), then collects from the global ``LAST`` frame the
    target values found 1..MAX_LAG days before the evaluation date.

    Args:
        sub: Submission frame holding a single ``date`` value (``%Y%m%d``)
            and a ``date_playerId`` column whose second ``_``-separated
            field is the player id.

    Returns:
        Tuple ``(features, eval_dt)``: a frame with ``playerId`` plus one
        ``{target}_{lag}`` column per target and per lag present in ``LAST``,
        and the parsed evaluation date.
    """
    sub["playerId"] = [int(key.split("_")[1]) for key in sub["date_playerId"]]
    assert sub["date"].nunique() == 1
    eval_dt = pd.to_datetime(sub["date"].unique()[0], format="%Y%m%d")

    # Map each historical date to its lag offset in days.
    lag_of_date = {eval_dt - timedelta(days=k): k for k in LAGS}
    oldest = eval_dt - timedelta(days=LAGS[-1])
    newest = eval_dt - timedelta(days=LAGS[0])

    in_window = LAST.EvalDate.between(oldest, newest)
    history = LAST.loc[in_window, ["EvalDate", "playerId"] + TGTCOLS].copy()
    # Replace dates by lag numbers so pivoted columns are named {target}_{lag}.
    history["EvalDate"] = history["EvalDate"].map(lag_of_date)

    per_target = [flatten(history, col) for col in TGTCOLS]
    return reduce(reducer, per_target), eval_dt
    #
#===============

In [None]:
%%time
#tr = pd.read_csv(f"{ROOT_DIR}/train.csv")
# Load the target table from the "mlb-data" input dataset instead of the
# competition train.csv (later cells read playerId, EvalDate and target1..4
# from it).
tr = pd.read_csv("../input/mlb-data/target.csv")
print(tr.shape)
gc.collect()

In [None]:
# Parse EvalDate, move every row one day earlier, then derive the calendar
# year that the per-player medians are grouped by.
tr["EvalDate"] = pd.to_datetime(tr["EvalDate"]) - timedelta(days=1)
tr["EvalYear"] = tr["EvalDate"].dt.year

In [None]:
# Median of each target per (player, year), with columns renamed tgtN_med.
MEDCOLS = ["tgt1_med", "tgt2_med", "tgt3_med", "tgt4_med"]
MED_DF = (
    tr.groupby(["playerId", "EvalYear"])[TGTCOLS]
    .median()
    .reset_index()
    .rename(columns=dict(zip(TGTCOLS, MEDCOLS)))
)

In [None]:
# Preview the first rows of the per-player, per-year median table.
MED_DF.head()

In [None]:
# Look-back window in days and the matching lagged feature names, ordered
# from the oldest lag to the most recent one.
MAX_LAG = 17
LAGS = [*range(1, MAX_LAG + 1)]
FECOLS = [f"{col}_{lag}" for lag in LAGS[::-1] for col in TGTCOLS]

In [None]:
# Display the lag offsets (1..MAX_LAG days).
LAGS

In [None]:
%%time
# Add one set of lagged target columns per lag (uses the train_lag defined
# earlier in the notebook, which reads the global TGTCOLS).
for lag in tqdm(LAGS):
    tr = train_lag(tr, lag=lag)
    gc.collect()
#===========
tr = tr.sort_values(by=["playerId", "EvalDate"])
print(tr.shape)
# Rows with any missing value (e.g. a lag with no earlier observation) are dropped.
tr = tr.dropna()
print(tr.shape)
# Attach the per-player yearly medians (merge default: inner join).
tr = tr.merge(MED_DF, on=["playerId","EvalYear"])
gc.collect()

In [None]:
# Show the first row of the lag-augmented training frame.
tr.head(1)

In [None]:
# Feature matrix (lagged targets + yearly medians), target matrix and the
# player id of every row, as plain NumPy arrays.
X = tr[FECOLS + MEDCOLS].to_numpy()
y = tr[TGTCOLS].to_numpy()
cl = tr["playerId"].to_numpy()

In [None]:
# Display (n_rows, n_features) of the feature matrix.
X.shape

## Neural Net Inference

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import copy
import seaborn as sns
import gc

from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from datetime import timedelta
from tqdm import tqdm
import gc
from functools import reduce
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import pickle

import warnings
warnings.filterwarnings('ignore')


# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random``,
            ``torch`` and ``torch.cuda``; also written to the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)


############## Models ######################################################################

class simple_MLP_block(nn.Module):
    """BatchNorm1d -> optional Dropout -> Linear.

    Args:
        input_dim: Number of input features.
        keep_prob: Value handed directly to ``nn.Dropout`` (i.e. the
            probability of zeroing an element); ``0`` disables dropout and
            no ``dropout`` sub-module is created.
        out_dim: Number of output features.
    """

    def __init__(self, input_dim, keep_prob, out_dim):
        super().__init__()
        self.keep_prob = keep_prob
        self.batch_norm = nn.BatchNorm1d(input_dim)
        if keep_prob != 0:
            self.dropout = nn.Dropout(keep_prob)
        self.dense = nn.Linear(input_dim, out_dim)

    def forward(self, x):
        """Normalize ``x``, apply dropout when configured, then project."""
        hidden = self.batch_norm(x)
        if self.keep_prob != 0:
            hidden = self.dropout(hidden)
        return self.dense(hidden)

class Simple_MLP_Model(nn.Module):  # <-- Update
    """Three ``simple_MLP_block`` layers with leaky-ReLU between them.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the two hidden layers.
        out_dim: Number of outputs (default 4).
    """

    def __init__(self, input_dim, hidden_dim, out_dim=4):
        super().__init__()
        width = int(hidden_dim)
        # First block has no dropout; the other two use 0.2.
        self.block1 = simple_MLP_block(input_dim, 0, hidden_dim)
        self.block2 = simple_MLP_block(hidden_dim, 0.2, width)
        self.block3 = simple_MLP_block(width, 0.2, out_dim)

    def forward(self, x):
        """Return the raw (un-activated) output of the last block."""
        hidden = F.leaky_relu(self.block1(x))
        hidden = F.leaky_relu(self.block2(hidden))
        return self.block3(hidden)


class ResidualBlock(nn.Module):
    """Two-layer fully connected block with a skip connection.

    Main path: BatchNorm -> Linear -> LeakyReLU -> BatchNorm -> Dropout(0.1)
    -> Linear. The skip path is the input itself, or ``downsample(input)``
    when a ``downsample`` module is supplied.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the intermediate layer.
        output_dim: Number of output features.
        downsample: Optional module applied to the input on the skip path;
            needed whenever ``input_dim != output_dim`` so the two paths can
            be added.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, downsample=None):
        super().__init__()
        self.bn1 = nn.BatchNorm1d(input_dim)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.LeakyReLU()

        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.dropout2 = nn.Dropout(0.1)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.downsample = downsample

    def forward(self, x):
        """Return ``(block_output, hidden_activation)``."""
        residual = x
        out = self.relu1(self.fc1(self.bn1(x)))
        out1 = self.fc2(self.dropout2(self.bn2(out)))
        # Explicit None check: module truthiness goes through __len__, so a
        # module reporting length 0 would otherwise be silently skipped.
        if self.downsample is not None:
            residual = self.downsample(x)
        out1 += residual

        return out1, out

class ResNet(nn.Module):
    """Two residual layers joined by a leaky-ReLU.

    Args:
        input: Number of input features.
        hidden_dim: Hidden width used inside both layers.
        block: Class building one layer; called as
            ``block(input_dim, hidden_dim, output_dim, downsample)`` and
            expected to return a 2-tuple from ``forward``.
        num_classes: Number of outputs of the second layer.
    """

    def __init__(self, input, hidden_dim, block, num_classes=4):
        super().__init__()
        width = int(hidden_dim)
        self.layer1 = self.make_layer(block, input, hidden_dim, width)
        self.layer2 = self.make_layer(block, width, width, num_classes)

    def make_layer(self, block, input_dim, hidden_dim, output_dim):
        """Build one block, adding a projection shortcut when sizes differ."""
        if input_dim == output_dim:
            shortcut = None
        else:
            # Project the skip path to the block's output width.
            shortcut = nn.Sequential(
                nn.BatchNorm1d(input_dim),
                nn.Dropout(0.1),
                nn.Linear(input_dim, output_dim),
            )
        return block(input_dim, hidden_dim, output_dim, shortcut)

    def forward(self, x):
        """Return the first element of the second layer's output tuple."""
        hidden, _ = self.layer1(x)
        result, _ = self.layer2(F.leaky_relu(hidden))
        return result


####################### Final Ensemble Model ######################################

class Ensemble_MoaModel(nn.Module):
    """Ensemble of sub-models, each fed its own subset of input columns.

    Args:
        input_dims: One column-index collection per sub-model; sub-model
            ``i`` receives ``x[:, input_dims[i]]``.
        number_of_dims: Input width of each sub-model.
        hidden_dims: Hidden width of each sub-model; its length sets how
            many sub-models are built.
        model_name: ``'Simple_MLP'`` or ``'ResNet'``.
        out_dim: Number of outputs per sub-model.

    Raises:
        ValueError: If ``model_name`` is not a supported architecture.
    """

    def __init__(self, input_dims, number_of_dims, hidden_dims, model_name, out_dim=206):
        super().__init__()

        self.models = torch.nn.ModuleList()
        self.input_dims = input_dims
        self.model_name = model_name

        for i, hidden_dim in enumerate(hidden_dims):
            if model_name == 'Simple_MLP':
                member = Simple_MLP_Model(input_dim=number_of_dims[i],
                                          hidden_dim=hidden_dim, out_dim=out_dim)
            elif model_name == "ResNet":
                member = ResNet(input=number_of_dims[i],
                                hidden_dim=hidden_dim,
                                block=ResidualBlock,
                                num_classes=out_dim)
            else:
                # Fail fast: merely printing here left an ensemble without
                # members that only crashed later inside forward().
                raise ValueError(
                    f"Unknown model_name {model_name!r}; "
                    "expected 'Simple_MLP' or 'ResNet'."
                )
            self.models.append(member)

    def forward(self, x):
        """Return per-member outputs plus their mean along dim 1.

        The result stacks every sub-model's output along dim 1 and appends
        one extra slice holding the mean over the sub-models.
        """
        member_outputs = [
            self.models[i](x[:, columns])
            for i, columns in enumerate(self.input_dims)
        ]
        stacked = torch.stack(member_outputs, dim=1)
        return torch.cat([stacked, stacked.mean(dim=1, keepdim=True)], dim=1)

############################### Create Pytorch Dataset ########################################

class MoADataset:
    """Map-style dataset pairing feature rows with target rows.

    Args:
        features: 2-D array indexed as ``features[idx, :]``.
        targets: 2-D array indexed as ``targets[idx, :]``.
    """

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``{'x': features_row, 'y': targets_row}`` as float tensors."""
        return {
            'x': torch.tensor(self.features[idx, :], dtype=torch.float),
            'y': torch.tensor(self.targets[idx, :], dtype=torch.float),
        }

class TestDataset:
    """Map-style dataset yielding feature rows only.

    Args:
        features: 2-D array indexed as ``features[idx, :]``.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``{'x': features_row}`` as a float tensor."""
        row = torch.tensor(self.features[idx, :], dtype=torch.float)
        return {'x': row}

################################# training inference #####################################
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training epoch and return the mean per-batch loss.

    The loss of a batch is the sum of ``loss_fn`` over every slice of the
    model output along dim 1. The scheduler is stepped after every batch.

    Args:
        model: Module whose output is indexed as ``outputs[:, i, :]``.
        optimizer: Optimizer updating ``model``.
        scheduler: LR scheduler stepped once per batch.
        loss_fn: Callable ``loss_fn(prediction, target)``.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors.
        device: Device the batches are moved to.

    Returns:
        Summed batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        inputs = batch['x'].to(device)
        targets = batch['y'].to(device)

        outputs = model(inputs)
        batch_loss = sum(
            loss_fn(outputs[:, head, :], targets)
            for head in range(outputs.shape[1])
        )

        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

################################# validation inference #####################################
def valid_fn(model, dataloader, device, scheduler=None, loss_fn=None):
    """Evaluate ``model`` on a labelled dataloader.

    Predictions are the mean of the model output along dim 1. The whole
    pass runs under ``torch.no_grad()`` so no autograd graph is kept.

    Args:
        model: Module returning a tensor that is averaged over dim 1.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors.
        device: Device the batches are moved to.
        scheduler: Optional scheduler; when given it is stepped with the
            validation MAE.
        loss_fn: Optional callable ``loss_fn(prediction, target)``; when
            omitted the returned loss is 0.

    Returns:
        Tuple ``(mean_batch_loss, mae, predictions)`` where ``predictions``
        is the concatenation of all batch predictions as a NumPy array.
    """
    model.eval()
    final_loss = 0
    valid_preds = []
    val_y = []

    # Evaluation never back-propagates, so skip autograd bookkeeping.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            val_y.append(targets.cpu().numpy())

            outputs = model(inputs)

            if loss_fn is not None:
                final_loss += loss_fn(torch.mean(outputs, dim=1), targets).item()
            valid_preds.append(np.mean(outputs.cpu().numpy(), axis=1))

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    val_y = np.concatenate(val_y)

    score = mean_absolute_error(val_y, valid_preds)

    if scheduler is not None:
        scheduler.step(score)

    return final_loss, score, valid_preds

################################# prediction inference #####################################
def inference_fn(model, dataloader, device):
    """Predict for every batch and return the dim-1 mean of the outputs.

    Args:
        model: Module returning a tensor that is averaged over axis 1.
        dataloader: Iterable of dicts with an ``'x'`` tensor.
        device: Device the batches are moved to.

    Returns:
        NumPy array concatenating the predictions of all batches.
    """
    model.eval()
    batch_preds = []

    with torch.no_grad():
        for batch in dataloader:
            member_out = model(batch['x'].to(device))
            batch_preds.append(
                np.mean(member_out.detach().cpu().numpy(), axis=1)
            )

    return np.concatenate(batch_preds)

################################# Data preprocess #####################################
def make_df(df, col, bool_in=False):
    """Expand a column of serialized record lists into one flat DataFrame.

    Every non-null entry of ``df[col]`` is parsed as a Python literal and the
    parsed sequences are concatenated in row order.
    (Assumes each entry is the text of a list of dict records - TODO confirm
    against the input files.)

    Args:
        df: Source DataFrame. ``col`` is dropped from it in place.
        col: Name of the column holding the serialized text.
        bool_in: When true, the bare words ``false``/``true`` are rewritten to
            the strings ``"False"``/``"True"`` before parsing.

    Returns:
        A new DataFrame built from all parsed records; an empty DataFrame
        when ``col`` has no non-null entries.
    """
    # Local imports keep this function self-contained.
    from ast import literal_eval
    from itertools import chain

    tp = df.loc[~df[col].isnull(), [col]].copy()
    df.drop(col, axis=1, inplace=True)
    if tp.empty:
        # Nothing to parse; summing an empty column would yield the int 0.
        gc.collect()
        return pd.DataFrame()

    # Literal (non-regex) replacements turn JSON keywords into Python literals.
    tp[col] = tp[col].str.replace("null", '""', regex=False)
    if bool_in:
        tp[col] = tp[col].str.replace("false", '"False"', regex=False)
        tp[col] = tp[col].str.replace("true", '"True"', regex=False)

    # literal_eval only accepts literals, so text from the data files can
    # never execute code the way eval() could.
    parsed = tp[col].apply(literal_eval)
    # Linear-time concatenation instead of repeated list addition.
    records = list(chain.from_iterable(parsed))
    gc.collect()
    return pd.DataFrame(records)

def flatten(df, col):
    """Spread ``col`` into one ``{col}_{EvalDate}`` column per EvalDate value.

    Returns a frame with one row per ``playerId`` and ``playerId`` as a
    regular column.
    """
    pivoted = df.pivot(index="playerId", columns="EvalDate", values=col)
    renamed = pivoted.add_prefix(f"{col}_").rename_axis(None, axis=1)
    return renamed.reset_index()

def reducer(left, right):
    """Merge two frames on their shared ``playerId`` column (inner join)."""
    merged = pd.merge(left, right, on="playerId")
    return merged

def train_lag(df, target_cols, lag=1):
    """Attach the ``target_cols`` values observed ``lag`` days earlier.

    Args:
        df: Frame with ``playerId``, ``EvalDate`` and ``target_cols``.
        target_cols: List of column names to lag.
        lag: Number of days to look back.

    Returns:
        ``df`` left-joined with its own targets shifted forward by ``lag``
        days; new columns carry the suffix ``_{lag}``.
    """
    # Moving the dates forward lets an equality join pick up the older values.
    shifted = df[["playerId", "EvalDate", *target_cols]].assign(
        EvalDate=lambda frame: frame["EvalDate"] + timedelta(days=lag)
    )
    return df.merge(
        shifted,
        how="left",
        on=["playerId", "EvalDate"],
        suffixes=["", f"_{lag}"],
    )


################################# Model Training Phase #####################################
def run_training(folds, target, feature_cols, target_cols,
                 input_dims, number_of_dims, hidden_dims,
                 model_name, BATCH_SIZE, LEARNING_RATE, WEIGHT_DECAY,
                 EPOCHS, EARLY_STOP, EARLY_STOPPING_STEPS, fold, seed, MODEL_ROOT):
    """Train (or reload) the model of one fold and return its OOF predictions.

    If ``{MODEL_ROOT}/FOLD_{fold}_{seed}.pth`` already exists, the saved model
    is loaded and only used to predict the validation rows. Otherwise a new
    ``Ensemble_MoaModel`` (``out_dim=4``) is trained with Adam, a OneCycleLR
    schedule and L1 loss, and the whole model object is saved every time the
    validation MAE improves.

    Args:
        folds: DataFrame with a ``kfold`` column plus the feature and target
            columns; rows with ``kfold == fold`` form the validation set.
        target: Only ``target.shape[1]`` is read (width of the OOF array).
        feature_cols: Names of the input columns.
        target_cols: Names of the target columns.
        input_dims, number_of_dims, hidden_dims, model_name: Forwarded to
            ``Ensemble_MoaModel``.
        BATCH_SIZE: Batch size of both dataloaders.
        LEARNING_RATE, WEIGHT_DECAY: Passed to ``torch.optim.Adam``.
        EPOCHS: Maximum number of epochs.
        EARLY_STOP: When true, training stops once the number of
            non-improving epochs reaches ``EARLY_STOPPING_STEPS``.
        EARLY_STOPPING_STEPS: Early-stopping budget (see above).
        fold: Index of the validation fold.
        seed: Seed passed to ``seed_everything``; also part of the checkpoint
            file name.
        MODEL_ROOT: Directory holding the checkpoint files.

    Returns:
        Array of shape ``(len(folds), target.shape[1])``, zero everywhere
        except the validation rows, which hold the best-epoch predictions.
    """

    seed_everything(seed)
    folds = folds.reset_index(drop=True)
    # After the reset these labels are also the row positions used to fill oof.
    val_idx = folds[folds['kfold'] == fold].index

    train_df = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_df = folds[folds['kfold'] == fold].reset_index(drop=True)

    x_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Checkpoint already on disk: skip training and only predict.
    if os.path.isfile(MODEL_ROOT + "/"+ f"FOLD_{fold}_{seed}.pth"):

        model_new = torch.load(MODEL_ROOT + "/"+ f"FOLD_{fold}_{seed}.pth")
        model_new.to(DEVICE)
        model_new.eval()
        oof = np.zeros((len(folds), target.shape[1]))
        valid_loss, valid_score, valid_preds = valid_fn(model_new, validloader, DEVICE, scheduler=None, loss_fn=None)
        oof[val_idx] = valid_preds

    else:

        model_new = Ensemble_MoaModel(input_dims, number_of_dims, hidden_dims, model_name, out_dim=4)
        model_new.to(DEVICE)

        optimizer = torch.optim.Adam(model_new.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        # NOTE(review): OneCycleLR is given a fixed max_lr=1e-2, so the
        # schedule presumably overrides LEARNING_RATE - confirm intent.
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                                  max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))

        # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=4,
        #                               factor=0.25, verbose=True)


        loss_fn = nn.L1Loss()

        early_stopping_steps = EARLY_STOPPING_STEPS
        early_step = 0

        oof = np.zeros((len(folds), target.shape[1]))
        # Holds the best validation score (MAE) seen so far.
        best_loss = np.inf

        for epoch in range(EPOCHS):

            # The scheduler is stepped per batch inside train_fn, hence scheduler=None below.
            train_loss = train_fn(model_new, optimizer, scheduler, loss_fn, trainloader, DEVICE)

            valid_loss, valid_score, valid_preds = valid_fn(model_new, validloader, DEVICE, scheduler=None, loss_fn=loss_fn)
            print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}, valid_loss: {valid_loss},  valid_score: {valid_score}")

            if valid_score < best_loss:

                best_loss = valid_score
                oof[val_idx] = valid_preds
                # Saves the whole pickled module, not just a state_dict.
                torch.save(model_new, MODEL_ROOT + "/"+ f"FOLD_{fold}_{seed}.pth")

            elif (EARLY_STOP == True):

                # NOTE(review): early_step is never reset on improvement, so it
                # counts all non-improving epochs, not consecutive ones - confirm.
                early_step += 1
                if (early_step >= early_stopping_steps):
                    break

            del train_loss, valid_loss, valid_score
            gc.collect()

        # Reloads the best checkpoint; the loaded model is not used afterwards
        # because only oof is returned.
        model_new = torch.load(MODEL_ROOT + "/"+ f"FOLD_{fold}_{seed}.pth")
        model_new.to(DEVICE)

    return oof


################################# FOLD SELECTION #####################################

def fold_selection(folds, target_cols, NFOLDS=5):
    """Add an integer ``kfold`` column to ``folds`` using StratifiedKFold.

    Args:
        folds: DataFrame to split; modified in place and also returned.
        target_cols: Column name(s) handed to the splitter as ``y``.
        NFOLDS: Number of folds.

    Returns:
        ``folds`` with a ``kfold`` column holding, for every row, the index
        of the fold in which that row is a validation sample.
    """
    # NOTE(review): StratifiedKFold is documented for binary/multiclass
    # targets; several or continuous target columns will presumably be
    # rejected - confirm what target_cols is meant to contain.
    skf = StratifiedKFold(n_splits=NFOLDS)

    # split() yields *positional* indices, so fold ids are collected in a
    # positional array instead of being written through label-based .loc,
    # which mis-assigns rows whenever the index is not 0..n-1.
    fold_ids = np.full(len(folds), -1, dtype=int)
    for f, (_, v_idx) in enumerate(skf.split(X=folds, y=folds[target_cols])):
        fold_ids[v_idx] = f

    folds['kfold'] = fold_ids

    return folds

################################# ENSEMBLE MODELS STRUCTURE GENERATION #####################################

def ENSEMBLE_MODEL_STRUCTURE(number_models, MODEL_ROOT, num_features=None):
    """Return (or create and persist) the layout of the ensemble members.

    * ``number_models == 1``: a single member with hidden width 256 that uses
      every feature.
    * ``{MODEL_ROOT}/hidden_dims.pkl`` exists: the three pickled layout files
      are loaded and returned unchanged.
    * otherwise: a random layout is drawn with ``np.random`` and written to
      the three pickle files in ``MODEL_ROOT``.

    Args:
        number_models: Number of ensemble members.
        MODEL_ROOT: Directory holding ``hidden_dims.pkl``,
            ``number_of_dims.pkl`` and ``input_dims.pkl``.
        num_features: Total number of input features. When omitted, the
            module-level ``num_features`` variable is used, as before.

    Returns:
        Tuple ``(hidden_dims, number_of_dims, input_dims)``; ``input_dims``
        holds one array of feature indices per member (random layouts draw
        the indices with replacement, so an index may repeat).

    Raises:
        NameError: If the feature count is needed but neither the argument
            nor a module-level ``num_features`` is available.
    """

    def feature_count():
        # Resolved lazily: the "load from disk" branch never needs it.
        if num_features is not None:
            return num_features
        try:
            return globals()["num_features"]
        except KeyError:
            raise NameError(
                "name 'num_features' is not defined; "
                "pass num_features=... explicitly"
            ) from None

    if number_models == 1:
        n_feat = feature_count()
        return [256], [n_feat], [np.arange(n_feat)]

    names = ('hidden_dims', 'number_of_dims', 'input_dims')
    paths = [MODEL_ROOT + "/" + f"{name}.pkl" for name in names]

    if os.path.isfile(paths[0]):
        # NOTE: pickle.load executes arbitrary code from the file; only point
        # MODEL_ROOT at trusted directories.
        layout = []
        for path in paths:
            with open(path, 'rb') as handle:
                layout.append(pickle.load(handle))
        return tuple(layout)

    n_feat = feature_count()
    hidden_dims = np.random.randint(128, 384, number_models)
    number_of_dims = np.random.randint(int(n_feat * 0.75), n_feat, number_models)
    input_dims = [
        np.random.randint(0, n_feat, number_of_dims[i])
        for i in range(number_models)
    ]

    # Persist the layout so later runs rebuild exactly the same ensemble.
    for path, value in zip(paths, (hidden_dims, number_of_dims, input_dims)):
        with open(path, 'wb') as handle:
            pickle.dump(value, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return hidden_dims, number_of_dims, input_dims

##################### Model training phase for each seed ######################

def run_k_fold(folds, target, feature_cols, target_cols, input_dims, number_of_dims, hidden_dims,
                 model_name, BATCH_SIZE, LEARNING_RATE, WEIGHT_DECAY,
                 EPOCHS, EARLY_STOP, EARLY_STOPPING_STEPS, NFOLDS, seed, MODEL_ROOT):
    """Run ``run_training`` for folds ``0..NFOLDS-1`` and sum their OOF arrays.

    All arguments except ``NFOLDS`` are forwarded unchanged to
    ``run_training``; see that function for their meaning.

    Returns:
        Array of shape ``(len(folds), len(target_cols))`` holding the sum of
        the per-fold out-of-fold prediction arrays.
    """
    combined = np.zeros((len(folds), len(target_cols)))

    for fold in range(NFOLDS):
        # Each fold's array is zero outside its own validation rows.
        combined += run_training(
            folds, target, feature_cols, target_cols,
            input_dims, number_of_dims, hidden_dims,
            model_name, BATCH_SIZE, LEARNING_RATE, WEIGHT_DECAY,
            EPOCHS, EARLY_STOP, EARLY_STOPPING_STEPS, fold, seed, MODEL_ROOT,
        )

    return combined



In [None]:
# Load the five pre-trained fold models (seed 20) used for inference.
nets = []

for kf in range(5):
    # map_location keeps loading working when the checkpoint was saved on a
    # different device (e.g. a GPU checkpoint on a CPU-only session).
    model = torch.load(f'../input/mlb-resnet-ensemble-architecture/FOLD_{kf}_20.pth',
                       map_location=DEVICE)
    # Force inference mode so dropout/batch-norm never depend on the state
    # the module happened to be pickled in.
    model.eval()
    nets.append(model)

In [None]:
# Historical information to use at prediction time: every training row dated
# after the bound date.
bound_dt = pd.to_datetime("2021-01-01")
LAST = tr[tr["EvalDate"] > bound_dt].copy()

In [None]:
# Keep only the 2021 medians (one row per player), without the year column,
# then release the full training frame.
LAST_MED_DF = MED_DF[MED_DF["EvalYear"] == 2021].drop(columns="EvalYear")
del tr

In [None]:
# Display the shapes of the history frame and of both median tables.
LAST.shape, LAST_MED_DF.shape, MED_DF.shape

In [None]:
#nets[0].summary()

In [None]:
#"""
import mlb
FE = []; SUB = [];
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sub) in iter_test:
    # Features computation at Evaluation Date
    sub = sub.reset_index()
    sub_fe, eval_dt = test_lag(sub)
    sub_fe = sub_fe.merge(LAST_MED_DF, on="playerId", how="left")
    sub_fe = sub_fe.fillna(0.)

    # Build the input tensor once; it is identical for every fold model.
    features = torch.tensor(sub_fe[FECOLS + MEDCOLS].values, dtype=torch.float).to(DEVICE)

    # Average the fold models without tracking gradients; the weight follows
    # the number of loaded models instead of a hard-coded 5.
    _preds = 0.
    with torch.no_grad():
        for reg in nets:
            _preds += reg(features) / len(nets)

    # Each model returns one slice per ensemble member (plus their mean)
    # along axis 1; collapse that axis.
    _preds = np.mean(_preds.cpu().numpy(), axis=1)
    sub_fe[TGTCOLS] = np.clip(_preds, 0, 100)
    sub.drop(["date"]+TGTCOLS, axis=1, inplace=True)
    sub = sub.merge(sub_fe[["playerId"]+TGTCOLS], on="playerId", how="left")
    sub.drop("playerId", axis=1, inplace=True)
    # Players without features get a prediction of 0.
    sub = sub.fillna(0.)
    # Submit
    env.predict(sub)
    # Update Available information
    sub_fe["EvalDate"] = eval_dt
    #sub_fe.drop(MEDCOLS, axis=1, inplace=True)
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    LAST = pd.concat([LAST, sub_fe])
    LAST = LAST.drop_duplicates(subset=["EvalDate","playerId"], keep="last")
#"""

In [None]:
# Preview the last submission frame produced by the inference loop.
sub.head()

In [None]:
# Display the shapes of the updated history frame and of the last feature frame.
LAST.shape, sub_fe.shape

In [None]:
#df_tr["dte"] = pd.to_datetime(df_tr["date"], format='%Y%m%d')