In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_absolute_error
from scipy.stats import norm, mode
from tqdm import tqdm
from functools import reduce, partial

import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib.pyplot as plt
import optuna
import datetime as dt
import gc
import joblib

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F

import math

In [None]:
# Directory holding the competition input files; the CSV reads below are
# all resolved relative to this path.
data_dir = '../input/mlb-player-digital-engagement-forecasting'

In [None]:
# Load the flat competition tables from `data_dir`.
# NOTE(review): `awards` is rebound later when the output of `load_inner_dfs`
# is unpacked, and `seasons` / `teams` are not referenced again in the part of
# the notebook reviewed here -- confirm whether these reads are still needed.
awards = pd.read_csv(f'{data_dir}/awards.csv')
players = pd.read_csv(f'{data_dir}/players.csv')
seasons = pd.read_csv(f'{data_dir}/seasons.csv')
teams = pd.read_csv(f'{data_dir}/teams.csv')
# `train` holds one row per date; its nested JSON columns are expanded by
# `load_inner_dfs` further down.
train = pd.read_csv(f'{data_dir}/train_updated.csv')

In [None]:
# ---------------------------------------------------------------------------
# Column / feature configuration shared by the whole notebook.
# ---------------------------------------------------------------------------

# Day offsets (1..14) for which lagged copies of the per-day features are built.
LAGS = list(range(1, 15))

# Regression targets.
targets = ['target1', 'target2', 'target3', 'target4']
id_cols = ['playerId']

# Team standings columns merged onto each player/day row.
standings_cols = [
    'wins', 'losses', 'pct', 'xWinLossPct',
    'divisionRank', 'lastTenWins', 'lastTenLosses'
]

# Columns kept from the players table (previously defined twice with the same
# value; the second, redundant definition has been removed).
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status', 'gameDate']

# Columns read from the raw (home/away oriented) games table.
games_cols = [
    'gameDate', 'gameTimeUTC', 'homeId', 'awayId', 'gameType',
    'homeWins', 'homeLosses', 'homeScore', 'awayWins', 'awayLosses', 'awayScore'
]
# Columns of the team-oriented ("long") games table built by `process_games`.
long_games_cols = [
    'gameDate', 'gameTimeUTC', 'teamId', 'oppId', 'gameType',
    'teamWins', 'teamLosses', 'teamScore', 'oppWins', 'oppLosses',
    'oppScore', 'isHome', 'teamWon', 'scoreDiff'
]
# Game-level model features. 'wasSigned' / 'wasTraded' come from the
# transactions table, which is why the games code filters them out.
long_games_features = [
    'gameTimeUTC', 'wasSigned', 'wasTraded',
    'teamWins', 'teamLosses', 'teamScore', 'isHome', 'teamWon', 'scoreDiff'
]
awards_cols = [
    'awardName'
]

# Per-game box score columns, split into batting-side and pitching-side groups.
hitter_cols = [
    'gamesPlayedBatting', 'hits', 'doubles', 'triples', 'runsScored',
    'homeRuns', 'hitByPitch', 'totalBases', 'rbi', 'stolenBases', 'assists'
]
pitcher_cols = [
    'gamesPlayedPitching', 'completeGamesPitching', 'shutoutsPitching', 'earnedRuns', 'winsPitching',
    'strikeOutsPitching', 'hitsPitching', 'saveOpportunities', 'saves', 'holds', 'inningsPitched'
]
scores_cols = hitter_cols + pitcher_cols
day_of_cols = ['month', 'day']

# Label-encoded categorical inputs; the order matters because the models index
# the categorical tensor positionally (player, team, status, position).
categorical_cols = ['label_playerId', 'label_teamId', 'label_status', 'label_primaryPositionName']

def preprocess_data(raw_data):
    """Parse the `date` column and keep only rows after 2020-12-01.

    Args:
        raw_data: DataFrame with a `date` column formatted as YYYYMMDD.

    Returns:
        A new DataFrame (the input is not modified) in which `date` is a
        datetime column, with added `year`, `month`, `day` and `day_of_week`
        columns, restricted to rows strictly later than 2020-12-01.
    """
    stamps = pd.to_datetime(raw_data['date'], format='%Y%m%d')
    enriched = raw_data.assign(
        date=stamps,
        year=stamps.dt.year,
        month=stamps.dt.month,
        day=stamps.dt.day,
        day_of_week=stamps.dt.day_name(),
    )

    # Strict inequality: 2020-12-01 itself is excluded.
    cutoff = dt.datetime(2020, 12, 1)
    return enriched[enriched['date'] > cutoff]

def preprocess_sub_dfs(
    rosters, player_box_scores, team_box_scores, transactions, games,
    standings, awards, events, player_followers, team_followers, 
    next_day_player_engagement=None
):
    """Normalise dtypes and add helper columns on the nested sub-tables.

    Every argument may be ``None`` (a table can be missing for a given day);
    ``None`` inputs are passed through untouched. Non-``None`` frames are
    modified in place where columns are added or converted.

    Changes applied:
      * rosters / player_box_scores: `gameDate` -> datetime.
      * transactions: `gameDate` (from `date`), `wasTraded`, `wasSigned`
        indicator columns; rows without a `playerId` are dropped.
      * standings: `leagueGamesBack` '-' -> 0.0 and cast to numeric.
      * player_followers / team_followers: `year` and `month` from `date`.
      * awards: `awardDate` -> datetime and renamed to `gameDate`.
      * next_day_player_engagement: `engagementMetricsDate` -> datetime.

    Returns:
        A 10-tuple of the tables in argument order, or an 11-tuple that also
        contains `next_day_player_engagement` when it was supplied.
    """
    if rosters is not None:
        rosters['gameDate'] = pd.to_datetime(rosters['gameDate'])

    if player_box_scores is not None:
        player_box_scores['gameDate'] = pd.to_datetime(player_box_scores['gameDate'])

    if transactions is not None:
        transactions['gameDate'] = pd.to_datetime(transactions['date'])
        transactions['wasTraded'] = np.where(transactions['typeDesc'] == 'Trade', 1, 0)
        transactions['wasSigned'] = np.where(transactions['typeDesc'].isin(['Signed', 'Signed as Free Agent']), 1, 0)
        # Transactions that are not tied to a player cannot be joined later.
        transactions = transactions[transactions['playerId'].notnull()]

    if standings is not None:
        standings['leagueGamesBack'] = np.where(standings['leagueGamesBack'] == '-', '0.0', standings['leagueGamesBack'])
        standings['leagueGamesBack'] = pd.to_numeric(standings['leagueGamesBack'])

    # The follower tables previously relied on `date` already being a datetime
    # column; convert explicitly so string dates do not break the `.dt` access.
    # (`pd.to_datetime` leaves datetime input unchanged.)
    for followers in (player_followers, team_followers):
        if followers is not None:
            follower_dates = pd.to_datetime(followers['date'])
            followers['year'] = follower_dates.dt.year
            followers['month'] = follower_dates.dt.month

    # Guarded so that a frame which was already processed (and therefore has
    # `gameDate` instead of `awardDate`) can be passed through again.
    if awards is not None and 'awardDate' in awards.columns:
        awards['awardDate'] = pd.to_datetime(awards['awardDate'])
        awards.rename(columns={'awardDate': 'gameDate'}, inplace=True)

    processed = (
        rosters, player_box_scores, team_box_scores, transactions, games,
        standings, awards, events, player_followers, team_followers
    )
    if next_day_player_engagement is None:
        return processed

    next_day_player_engagement['engagementMetricsDate'] = pd.to_datetime(next_day_player_engagement['engagementMetricsDate'])
    return processed + (next_day_player_engagement,)

def load_inner_dfs(data, is_test=False):
    """Expand the nested JSON columns of the daily table into DataFrames.

    Args:
        data: DataFrame with one row per date whose cells hold JSON strings
            (or a non-string placeholder such as NaN when nothing is
            available for that day).
        is_test: When False (training), the frames of all rows are
            concatenated per table and `nextDayPlayerEngagement` is included.
            When True, only the first parsed frame of each table is returned
            and `nextDayPlayerEngagement` is not read.

    Returns:
        A tuple of DataFrames in the order rosters, player box scores, team
        box scores, transactions, games, standings, awards, events, player
        followers, team followers (plus next-day engagement when training).
        A table for which no row contained a JSON string is returned as None
        (previously the training path raised from `pd.concat` on an empty
        list in that situation).
    """
    # Local import: wrapping the payload in StringIO avoids the deprecated
    # "literal JSON string" input form of `pd.read_json`.
    from io import StringIO

    source_columns = [
        'rosters', 'playerBoxScores', 'teamBoxScores', 'transactions', 'games',
        'standings', 'awards', 'events', 'playerTwitterFollowers', 'teamTwitterFollowers',
    ]
    if not is_test:
        source_columns.append('nextDayPlayerEngagement')

    parsed = {name: [] for name in source_columns}
    for row in data.itertuples():
        for name in source_columns:
            payload = getattr(row, name)
            # Missing entries are non-string (e.g. NaN) and are skipped.
            if isinstance(payload, str):
                parsed[name].append(pd.read_json(StringIO(payload)))

    if is_test:
        return tuple(frames[0] if frames else None for frames in parsed.values())
    return tuple(
        pd.concat(frames, ignore_index=True) if frames else None
        for frames in parsed.values()
    )

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of `df` to the smallest dtype that fits.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float16 or float32 when their range allows it, otherwise
    float64. Columns of any other dtype are left untouched.

    NOTE(review): the float16/float32 choice is based on value *range* only,
    so precision can be lost -- confirm this is acceptable before applying it
    to feature columns.

    Args:
        df: DataFrame to shrink; it is modified in place and also returned.
        verbose: Print the resulting memory usage and the relative reduction.

    Returns:
        The same DataFrame object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def process_games(games):
    """Turn the home/away games table into one row per (teamId, gameDate).

    Each game is emitted twice -- once from the home team's point of view and
    once from the away team's -- with columns renamed to team*/opp*. The two
    views are stacked and then aggregated per team and date: 'gameTimeUTC'
    (reduced to the hour) keeps the last value, every other retained feature
    is summed.

    Args:
        games: DataFrame containing at least the columns in `games_cols`.

    Returns:
        DataFrame with `teamId`, `gameDate` and the game features from
        `long_games_features` (excluding 'wasSigned' / 'wasTraded').
    """
    def oriented_view(rename_map, is_home):
        # Re-label the raw columns from one side's perspective and derive
        # the outcome columns for that side.
        view = games.loc[:, games_cols].rename(columns=rename_map)
        view['isHome'] = is_home
        view['teamWon'] = np.where(view['teamScore'] > view['oppScore'], 1, 0)
        view['scoreDiff'] = view['teamScore'] - view['oppScore']
        return view.loc[:, long_games_cols]

    home_perspective = {
        'homeId': 'teamId',
        'awayId': 'oppId',
        'homeWins': 'teamWins',
        'homeLosses': 'teamLosses',
        'homeScore': 'teamScore',
        'awayWins': 'oppWins',
        'awayLosses': 'oppLosses',
        'awayScore': 'oppScore',
    }
    away_perspective = {
        'awayId': 'teamId',
        'homeId': 'oppId',
        'awayWins': 'teamWins',
        'awayLosses': 'teamLosses',
        'awayScore': 'teamScore',
        'homeWins': 'oppWins',
        'homeLosses': 'oppLosses',
        'homeScore': 'oppScore',
    }

    stacked = pd.concat(
        [oriented_view(home_perspective, 1), oriented_view(away_perspective, 0)],
        ignore_index=True,
    )
    stacked['gameDate'] = pd.to_datetime(stacked['gameDate'])
    stacked['gameTimeUTC'] = pd.to_datetime(stacked['gameTimeUTC']).dt.hour

    # Build the per-column aggregation: transaction flags are not part of the
    # games table; a few columns keep their last value, the rest are summed.
    keep_last = ['gameTimeUTC', 'gameType', 'oppId']
    not_in_games = ['wasSigned', 'wasTraded']
    how = {}
    for c in long_games_features:
        if c in not_in_games:
            continue
        how[c] = 'last' if c in keep_last else 'sum'

    per_team_day = stacked.groupby(['teamId', 'gameDate']).agg(how)
    return per_team_day.reset_index()

def feature_engineering(next_day_player_engagement, player_box_scores, rosters, awards, games, standings, player_followers, team_followers, transactions):
    """Join all per-day sub-tables onto the engagement rows.

    Starting from `next_day_player_engagement` (one row per player and
    `gameDate`), left-joins rosters, per-player box score totals, monthly
    follower counts, team game results, standings and transaction flags.
    The join against the players table is an inner join, so only players
    flagged `playerForTestSetAndFuturePreds` are kept.

    Args:
        next_day_player_engagement: base rows; must carry `playerId`,
            `gameDate`, `year` and `month`.
        player_box_scores, rosters, games, standings, player_followers,
        team_followers, transactions: preprocessed sub-tables.
        awards: accepted for interface compatibility; it is not joined into
            the result (the former aggregation of it was never used and has
            been removed).

    Returns:
        The merged feature DataFrame.

    Side effects:
        `standings['gameDate']` is converted to datetime in place.
    """
    # Collapse multiple games per day into one row per player and date.
    scores = player_box_scores.drop(columns='teamId').groupby(['playerId', 'gameDate']).sum().reset_index()
    scores = scores.sort_values(by=['gameDate', 'playerId'])
    # Gap to the player's previous appearance. `diff` is a transform, so call
    # it on the grouped column directly instead of routing it through `agg`.
    scores['days_since_last_game'] = scores.groupby('playerId')['gameDate'].diff()

    in_sample_players_df = players.loc[players['playerForTestSetAndFuturePreds'] == True, players_cols]
    long_games = process_games(games)
    standings['gameDate'] = pd.to_datetime(standings['gameDate'])
    # Keep a single transaction record per player and day.
    transactions = transactions[['gameDate', 'playerId', 'wasSigned', 'wasTraded']].drop_duplicates(subset=['gameDate', 'playerId'])

    game_feature_cols = [c for c in long_games_features if c not in ['wasSigned', 'wasTraded']]

    df = next_day_player_engagement.merge(
            rosters, on=['playerId', 'gameDate'], how='left',
        ).merge(scores, on=['gameDate', 'playerId'], how='left').merge(
            player_followers[['playerId', 'year', 'month', 'numberOfFollowers']],
            on=['playerId', 'year', 'month'],
            how='left'
        ).rename(columns={'numberOfFollowers': 'player_followers'}).merge(
            in_sample_players_df, on='playerId', how='inner'
        ).merge(
            team_followers[['teamId', 'year', 'month', 'numberOfFollowers']], on=['teamId', 'year', 'month'], how='left'
        ).rename(columns={'numberOfFollowers': 'team_followers'}).merge(
            long_games[['teamId', 'gameDate'] + game_feature_cols], on=['teamId', 'gameDate'], how='left'
        ).merge(
            standings[['teamId', 'gameDate'] + standings_cols], on=['teamId', 'gameDate'], how='left'
        ).merge(
            transactions, on=['gameDate', 'playerId'], how='left'
        )

    return df

#=======================#
def flatten(df, col):
    """Pivot one column of a long table into one wide row per player.

    Args:
        df: long DataFrame with `playerId`, `gameDate` and `col`.
        col: name of the value column to spread.

    Returns:
        DataFrame with a `playerId` column plus one column per distinct
        `gameDate` value, named `<col>_<gameDate>`.
    """
    wide = df.pivot(index="playerId", columns="gameDate", values=col)
    wide = wide.add_prefix(f"{col}_")
    # Drop the leftover "gameDate" label on the column axis before
    # turning playerId back into a regular column.
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()
#============================#
def reducer(left, right):
    """Combine two per-player frames on `playerId` (binary step for `reduce`)."""
    merged = pd.merge(left, right, on="playerId")
    return merged
#========================

# Target column names; same values as `targets` defined with the other column
# lists. Only a commented-out line in `test_lag` mentions this name in the part
# of the notebook reviewed here.
TGTCOLS = ["target1","target2","target3","target4"]
def train_lag(df, lag=1):
    """Append the feature values observed `lag` days earlier to every row.

    A copy of the lag-able feature columns is shifted forward in time by
    `lag` days and left-joined back on (playerId, gameDate); the joined
    columns receive the suffix `_<lag>`.

    Args:
        df: feature DataFrame containing `playerId`, `gameDate` and the
            score / game / standings / season-stat columns.
        lag: number of days to look back.

    Returns:
        `df` with the additional `<feature>_<lag>` columns.
    """
    lagged_feature_cols = scores_cols + long_games_features + standings_cols + season_stats_cols
    shifted = df[["playerId", "gameDate"] + lagged_feature_cols].copy()
    # Moving the dates forward makes day D's values line up with day D + lag.
    shifted["gameDate"] = shifted["gameDate"] + dt.timedelta(days=lag)
    return df.merge(
        shifted,
        on=["playerId", "gameDate"],
        suffixes=["", f"_{lag}"],
        how="left",
    )
#=================================
def test_lag(sub):
    """Build the wide lag-feature table for one prediction date.

    Args:
        sub: submission frame for a single date with `date` (YYYYMMDD) and
            `date_playerId` columns. A `playerId` column is added to it in
            place.

    Returns:
        (du, eval_dt): `du` has one row per player with columns
        `<feature>_<k>` for each lag k in `LAGS` found in the module-level
        `LAST` history; `eval_dt` is the parsed prediction date.
    """
    def player_of(date_player):
        # "date_playerId" looks like "<date>_<playerId>".
        return int(date_player.split("_")[1])

    sub["playerId"] = sub["date_playerId"].apply(player_of)
    assert sub.date.nunique() == 1

    eval_dt = pd.to_datetime(sub["date"].unique()[0], format="%Y%m%d")

    # Map each historical date in the look-back window to its lag number.
    lag_of_date = {eval_dt + dt.timedelta(days=-k): k for k in LAGS}
    lag_feature_cols = scores_cols + long_games_features + standings_cols + season_stats_cols

    in_window = LAST.gameDate.isin(list(lag_of_date))
    history = LAST.loc[in_window, ["gameDate", "playerId"] + lag_feature_cols].copy()
    # Replace the date by its lag so the pivot yields "<feature>_<lag>" names.
    history["gameDate"] = history["gameDate"].map(lag_of_date)

    per_feature = [flatten(history, col) for col in lag_feature_cols]
    du = reduce(reducer, per_feature)
    return du, eval_dt

In [None]:
# Parse dates and drop rows dated 2020-12-01 or earlier (see `preprocess_data`).
train = preprocess_data(train)

# Expand every nested JSON column of `train` into its own DataFrame. With
# is_test=False the per-day frames are concatenated and the engagement targets
# are included. Note that this rebinds `awards`, replacing the CSV loaded earlier.
rosters, player_box_scores, team_box_scores, transactions, games, standings, awards, events, player_followers, team_followers, next_day_player_engagement = load_inner_dfs(train, is_test=False)

# Normalise date columns and add helper columns on each sub-table.
rosters, player_box_scores, team_box_scores, transactions, games, standings, awards, events, player_followers, team_followers, next_day_player_engagement = preprocess_sub_dfs(
    rosters, player_box_scores, team_box_scores, transactions, games,
    standings, awards, events, player_followers, team_followers, 
    next_day_player_engagement
)

In [None]:
# `gameDate` is the day before the date the engagement metrics refer to; it is
# the key used to join every other sub-table onto the target rows.
next_day_player_engagement['gameDate'] = next_day_player_engagement['engagementMetricsDate'] - dt.timedelta(days=1)
# Calendar parts of `gameDate`; `year` and `month` are also join keys for the
# follower tables inside `feature_engineering`.
next_day_player_engagement['year'] = next_day_player_engagement['gameDate'].dt.year
next_day_player_engagement['month'] = next_day_player_engagement['gameDate'].dt.month
next_day_player_engagement['day'] = next_day_player_engagement['gameDate'].dt.day
next_day_player_engagement['day_of_week'] = next_day_player_engagement['gameDate'].dt.day_name()
# Merge all sub-tables into a single frame with one row per player and gameDate.
data = feature_engineering(next_day_player_engagement, player_box_scores, rosters, awards, games, standings, player_followers, team_followers, transactions)

In [None]:
# The sub-tables and the raw daily frame are no longer needed once `data` has
# been built; drop the references and trigger a collection.
# `next_day_player_engagement` is intentionally kept: it is used again below
# to compute the per-player target statistics.
del rosters, player_box_scores, team_box_scores, transactions, games, standings, awards, events, player_followers, team_followers, train
gc.collect();

In [None]:
# Keep only rows whose gameDate is 2020-12-31 or later. (`preprocess_data`
# already removed everything up to 2020-12-01, so this narrows the window further.)
data = data.loc[data['gameDate'] >= dt.datetime(2020,12,31), :]

In [None]:
def agg_targets(df, agg_func):
    """Aggregate the target columns per player and (lag_year, lag_month).

    Args:
        df: DataFrame with `playerId`, `lag_year`, `lag_month` and the
            columns listed in `targets`.
        agg_func: aggregation name understood by pandas (e.g. 'mean').
            'std' is special-cased to use `np.std` through a lambda rather
            than pandas' own 'std' aggregation.

    Returns:
        (target_agg, agg_cols): the aggregated frame whose value columns are
        named `<target>_<agg_func>`, and the list of those column names.
    """
    group_keys = ['playerId', 'lag_year', 'lag_month']

    # One aggregation callable/name shared by all targets.
    how = (lambda x: np.std(x)) if agg_func == 'std' else agg_func
    spec = {}
    for t in targets:
        spec[t] = how

    target_agg = df.groupby(group_keys).agg(spec).reset_index()

    agg_cols = [f'{t}_{agg_func}' for t in targets]
    target_agg.columns = group_keys + agg_cols
    return target_agg, agg_cols


# %%
# (lag_year, lag_month) is the calendar month that contains gameDate + 31 days.
# Target statistics grouped by these keys are later joined onto rows whose own
# (year, month) equals them, i.e. onto rows roughly one month later.
# NOTE(review): +31 days is not exactly "next month" (e.g. Jan 31 lands in
# March) -- confirm this approximation is intended.
data['lag_month'] = (data['gameDate'] + dt.timedelta(days=31)).dt.month
data['lag_year'] = (data['gameDate'] + dt.timedelta(days=31)).dt.year
next_day_player_engagement['lag_month'] = (next_day_player_engagement['gameDate'] + dt.timedelta(days=31)).dt.month
next_day_player_engagement['lag_year'] = (next_day_player_engagement['gameDate'] + dt.timedelta(days=31)).dt.year

# Per-player, per-(lag_year, lag_month) summary statistics of the four targets.
target_means, mean_cols = agg_targets(next_day_player_engagement, 'mean')
target_medians, median_cols = agg_targets(next_day_player_engagement, 'median')
target_stds, std_cols = agg_targets(next_day_player_engagement, 'std')
target_mins, min_cols = agg_targets(next_day_player_engagement, 'min')
target_maxs, max_cols = agg_targets(next_day_player_engagement, 'max')

# The raw engagement table is not needed beyond this point.
del next_day_player_engagement
gc.collect();

# Names of all aggregated target-statistic columns (used as continuous features).
agg_cols = mean_cols + median_cols + std_cols + min_cols + max_cols

# Combine the five statistics tables into one frame keyed by
# (playerId, lag_year, lag_month).
player_target_stats = target_means.merge(
    target_medians, on=['playerId', 'lag_year', 'lag_month']
).merge(
    target_stds, on=['playerId', 'lag_year', 'lag_month']
).merge(
    target_mins, on=['playerId', 'lag_year', 'lag_month']
).merge(
    target_maxs, on=['playerId', 'lag_year', 'lag_month']
)

# Running box-score totals within each (playerId, year) group: a rolling sum
# over a window of up to 365 rows, with missing box-score values treated as 0.
# The results are stored under `<col>_season` names.
cum_stats = data[['playerId', 'year', 'gameDate'] + scores_cols].fillna(0).groupby(['playerId', 'year']).rolling(
    365, min_periods=0, on='gameDate'
)[scores_cols].sum().reset_index().rename(columns={
    c: f'{c}_season' for c in scores_cols
}).drop(columns='year')
season_stats_cols = [f'{c}_season' for c in scores_cols]

data = data.merge(cum_stats, on=['playerId', 'gameDate'], how='left')
print(data.shape)
# No `how` is passed here, so pandas' default (inner) join applies: rows
# without matching target statistics are dropped. The two printed shapes show
# the row count before and after this join.
data = data.merge(player_target_stats, left_on=['playerId', 'year', 'month'], right_on=['playerId', 'lag_year', 'lag_month'])
print(data.shape)

# Add `<feature>_<lag>` columns for every lag in LAGS (see `train_lag`).
# `season_stats_cols` must already be defined because `train_lag` reads it.
for lag in tqdm(LAGS):
    data = train_lag(data, lag=lag)
    gc.collect()

# Column-name lists for the lagged feature groups. Each `*_lag_cols` list
# starts with the unlagged columns followed by lag 1, lag 2, ...; the models
# rely on this ordering when reshaping to (batch, len(LAGS)+1, n_features).
# `lag_cols` (lagged target names) is not part of the `features` list built
# in the following cell.
lag_cols = [f'{t}_{l}' for l in LAGS for t in targets]
scores_lag_cols = scores_cols + [f'{c}_{l}' for l in LAGS for c in scores_cols]
long_games_lag_cols = long_games_features + [f'{c}_{l}' for l in LAGS for c in long_games_features]
standings_lag_cols = standings_cols + [f'{c}_{l}' for l in LAGS for c in standings_cols]
season_stats_lag_cols = season_stats_cols + [f'{c}_{l}' for l in LAGS for c in season_stats_cols]

In [None]:
# Full list of model input columns. `Dataset` selects exactly these columns and
# then locates each feature group by name, so the order of the groups here
# does not have to match the order of the tensors handed to the model.
features = categorical_cols + agg_cols + scores_lag_cols + standings_lag_cols + long_games_lag_cols
features += season_stats_lag_cols + ['player_followers', 'team_followers']

In [None]:
# The individual statistic tables were merged into `player_target_stats`
# above; release them in a single statement.
del target_means, target_medians, target_stds, target_mins, target_maxs
gc.collect()

In [None]:
# Load the saved lookup objects for the categorical features (player, position,
# team, roster status). presumably these map raw values to integer labels for
# the `label_*` columns -- verify against the code that created them.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code; only load these files from a trusted source.
player2num = joblib.load('../input/mlbplfinalmodels/player2num.pkl')
position2num = joblib.load('../input/mlbplfinalmodels/position2num.pkl')
teamid2num = joblib.load('../input/mlbplfinalmodels/teamid2num.pkl')
status2num = joblib.load('../input/mlbplfinalmodels/status2num.pkl')

In [None]:
class Dataset:
    """Map-style dataset that splits each feature row into model input groups.

    The frame is reduced to the columns in `features` and stored as a NumPy
    array; for every feature group the positional indices of its columns are
    precomputed so that `__getitem__` only has to slice a row.
    """

    def __init__(self, X, y=None):
        """
        Args:
            X: DataFrame containing at least the columns in `features`.
            y: optional DataFrame/Series of targets aligned with `X`.
        """
        frame = X[features]

        def positions(names):
            # Column positions of `names` inside `frame`; absent names are skipped.
            return [frame.columns.get_loc(c) for c in names if c in frame]

        self.cont_feats = positions(agg_cols + ['player_followers', 'team_followers'])
        self.cat_feats = positions(categorical_cols)
        self.scores_lags = positions(scores_lag_cols)
        self.games_lags = positions(long_games_lag_cols)
        self.standings_lags = positions(standings_lag_cols)
        self.season_stats_lags = positions(season_stats_lag_cols)

        self.X = frame.values
        self.y = None if y is None else y.values

    def __len__(self):
        """Number of rows."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return a dict of tensors for row `idx` (plus 'y' when targets exist)."""
        row = self.X[idx, :]
        groups = (
            ('cont_feats', self.cont_feats, torch.float32),
            ('cat_feats', self.cat_feats, torch.long),
            ('scores_lags', self.scores_lags, torch.float32),
            ('games_lags', self.games_lags, torch.float32),
            ('standings_lags', self.standings_lags, torch.float32),
            ('season_stats_lags', self.season_stats_lags, torch.float32),
        )
        sample = {
            key: torch.as_tensor(row[cols], dtype=dtype)
            for key, cols, dtype in groups
        }
        if self.y is not None:
            sample['y'] = torch.as_tensor(self.y[idx], dtype=torch.float32)
        return sample

class MLBDataModule(pl.LightningDataModule):
    """Lightning data module wrapping train / validation / prediction frames.

    For prediction, the frame passed as `X_train` is used without targets.
    """

    def __init__(self, X_train, y_train=None, X_val=None, y_val=None, batch_size=2048):
        super().__init__()
        self.X_train, self.y_train = X_train, y_train
        self.X_val, self.y_val = X_val, y_val
        self.batch_size = batch_size

    def _loader(self, dataset, shuffle):
        # Shared DataLoader settings; only shuffling differs between stages.
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=4,
            pin_memory=True,
            drop_last=False,
        )

    def train_dataloader(self):
        """Shuffled loader over the training rows and targets."""
        return self._loader(Dataset(self.X_train, self.y_train), shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the validation rows and targets."""
        return self._loader(Dataset(self.X_val, self.y_val), shuffle=False)

    def predict_dataloader(self):
        """Ordered loader over `X_train` without targets."""
        return self._loader(Dataset(self.X_train), shuffle=False)

    def test_dataloader(self):
        """No test loader is provided."""
        return None

In [None]:
def get_emb_params(d):
    """Suggest embedding table dimensions for a category mapping.

    Args:
        d: sized collection with one entry per category.

    Returns:
        (num_embeddings, embedding_dim): the category count plus one, and
        `int((1.6 * n) ** 0.56)` capped at 32.
    """
    n_categories = len(d)
    suggested_dim = int((1.6 * n_categories) ** 0.56)
    return n_categories + 1, min(32, suggested_dim)

def conv_layer(conv_p, n_layers):
    """Stack `n_layers` Conv1d -> BatchNorm1d -> ReLU -> Dropout blocks.

    Every block uses `len(LAGS)+1` input and output channels and a kernel of
    size 4 without padding, so each block shortens the last dimension by 3.

    Args:
        conv_p: dropout probability applied at the end of each block.
        n_layers: number of blocks.

    Returns:
        nn.Sequential whose children are the individual blocks.
    """
    n_channels = len(LAGS)+1
    blocks = []
    for _ in range(n_layers):
        block = nn.Sequential(
            nn.Conv1d(in_channels=n_channels, out_channels=n_channels, kernel_size=4),
            nn.BatchNorm1d(n_channels),
            nn.ReLU(),
            nn.Dropout(conv_p),
        )
        blocks.append(block)
    return nn.Sequential(*blocks)

def create_gru_head(ff_layer_sizes, ff_drop_ps, cont_len, cat_len, hidden_dim):
    """Build one feed-forward regression head for the GRU model.

    The head input width is `cat_len + cont_len + (hidden_dim*3 - 9)`. The
    input is first passed through BatchNorm1d + ReLU, followed by one
    Linear -> BatchNorm1d -> ReLU -> Dropout group per entry of
    `ff_layer_sizes`, and a final Linear layer with a single output.

    Args:
        ff_layer_sizes: widths of the hidden layers (must be non-empty).
        ff_drop_ps: dropout probability for each hidden layer.
        cont_len: number of continuous static features.
        cat_len: total width of the categorical embeddings.
        hidden_dim: GRU hidden size.

    Returns:
        nn.Sequential implementing the head.
    """
    head_in = cat_len + cont_len + (hidden_dim*3 - 9)

    modules = []
    for i, width in enumerate(ff_layer_sizes):
        if i == 0:
            in_dim = head_in
            # Normalise and activate the concatenated head input once.
            modules.append(nn.BatchNorm1d(in_dim))
            modules.append(nn.ReLU())
        else:
            in_dim = ff_layer_sizes[i-1]
        modules.extend([
            nn.Linear(in_dim, width),
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Dropout(ff_drop_ps[i]),
        ])

    modules.append(nn.Linear(ff_layer_sizes[-1], 1))
    return nn.Sequential(*modules)

def create_transformer_head(ff_layer_sizes, ff_drop_ps, hidden_dim):
    """Build one feed-forward regression head for the transformer model.

    The head input width is `hidden_dim - 9`. It consists of one
    Linear -> BatchNorm1d -> ReLU -> Dropout group per entry of
    `ff_layer_sizes`, followed by a Linear layer with a single output.

    Args:
        ff_layer_sizes: widths of the hidden layers (must be non-empty).
        ff_drop_ps: dropout probability for each hidden layer.
        hidden_dim: transformer model width.

    Returns:
        nn.Sequential implementing the head.
    """
    modules = []
    for i, width in enumerate(ff_layer_sizes):
        in_dim = ff_layer_sizes[i-1] if i > 0 else hidden_dim-9
        modules.extend([
            nn.Linear(in_dim, width),
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Dropout(ff_drop_ps[i]),
        ])

    modules.append(nn.Linear(ff_layer_sizes[-1], 1))
    return nn.Sequential(*modules)

def future_mask(seq_length):
    """Return a boolean (seq_length, seq_length) mask of "future" positions.

    Entry [i, j] is True when j > i (strictly above the diagonal) and False
    otherwise.
    """
    ones = np.ones([seq_length, seq_length])
    strictly_upper = np.triu(ones, k=1).astype('bool')
    return torch.from_numpy(strictly_upper)

class MLBGRU(pl.LightningModule):
    """GRU model predicting the four engagement targets.

    Inputs are (a) static features -- batch-normalised continuous statistics
    plus embeddings of player, team, roster status and position -- and (b) a
    sequence of `len(LAGS)+1` daily feature vectors (box scores, game results,
    standings, season totals). The sequence is projected to `hidden_dim`, run
    through a GRU and a small Conv1d stack, pooled, concatenated with the
    static features and passed to four independent feed-forward heads, one
    per target.

    Args:
        batch_size: stored as a hyperparameter only.
        lr: AdamW learning rate.
        wd: AdamW weight decay.
        hidden_dim: width of the sequence projection and the GRU.
        n_rnn_layers: number of stacked GRU layers.
        rnn_drop: dropout between GRU layers.
        emb_drop_ps: dropout probabilities for the player, team, status and
            position embeddings (in that order).
        ff_layer_sizes: hidden widths of each head.
        ff_drop_ps: dropout probabilities of each head's hidden layers.
        conv_p: dropout probability inside the Conv1d blocks.
    """

    def __init__(self, batch_size, lr, wd, hidden_dim, n_rnn_layers, rnn_drop, emb_drop_ps, ff_layer_sizes, ff_drop_ps, conv_p):
        super().__init__()
        self.save_hyperparameters()

        # Embedding table shapes are fixed literals. The former
        # `get_emb_params(...)` calls computed sizes that were never used and
        # have been removed.
        # NOTE(review): presumably the literals must match previously saved
        # weights -- confirm before deriving them from the label maps.
        self.player_emb = nn.Sequential(
            nn.Embedding(1188, 32),
            nn.Dropout(emb_drop_ps[0])
        )
        self.team_emb = nn.Sequential(
            nn.Embedding(32, 8),
            nn.Dropout(emb_drop_ps[1])
        )
        self.status_emb = nn.Sequential(
            nn.Embedding(17, 6),
            nn.Dropout(emb_drop_ps[2])
        )
        self.position_emb = nn.Sequential(
            nn.Embedding(10, 4),
            nn.Dropout(emb_drop_ps[3])
        )

        # Continuous inputs: aggregated target stats + player/team follower counts.
        cont_len = len(agg_cols) + 2
        # Sum of the four embedding widths above.
        cat_len = 32 + 8 + 6 + 4
        self.cont_bn_in = nn.BatchNorm1d(cont_len)

        rnn_inp_len = len(scores_cols+long_games_features+standings_cols+season_stats_cols)
        self.rnn_emb = nn.Linear(rnn_inp_len, hidden_dim)
        self.rnn = nn.GRU(
            input_size=hidden_dim,
            num_layers=n_rnn_layers,
            hidden_size=hidden_dim,
            batch_first=True,
            dropout=rnn_drop,
        )

        # Three kernel-4 convolutions without padding shrink the last
        # dimension by 9, which is where the "- 9" in the head width comes from.
        self.conv1 = conv_layer(conv_p, 3)

        self.ff_layers1 = create_gru_head(ff_layer_sizes, ff_drop_ps, cont_len, cat_len, hidden_dim)
        self.ff_layers2 = create_gru_head(ff_layer_sizes, ff_drop_ps, cont_len, cat_len, hidden_dim)
        self.ff_layers3 = create_gru_head(ff_layer_sizes, ff_drop_ps, cont_len, cat_len, hidden_dim)
        self.ff_layers4 = create_gru_head(ff_layer_sizes, ff_drop_ps, cont_len, cat_len, hidden_dim)

        self.train_loss_fn = nn.L1Loss()
        self.val_loss_fn = nn.L1Loss()

    @staticmethod
    def _as_sequence(flat_lags, n_features):
        """Reshape flat lag features to (batch, len(LAGS)+1, n_features), oldest step first.

        The flat layout is [lag 0 features, lag 1 features, ...], so the
        sequence axis is flipped to put the largest lag first.
        """
        seq = flat_lags.reshape(flat_lags.size(0), len(LAGS)+1, n_features)
        return torch.flip(seq, dims=(1,))

    def forward(self, cont_feats, cat_feats, scores_lags, games_lags, standings_lags, season_stats_lags):
        """Return predictions of shape (batch, 4), one column per target."""
        x_player = self.player_emb(cat_feats[:, 0].long())
        x_team = self.team_emb(cat_feats[:, 1].long())
        x_status = self.status_emb(cat_feats[:, 2].long())
        x_position = self.position_emb(cat_feats[:, 3].long())

        x_cont = self.cont_bn_in(cont_feats)
        x_static = torch.cat(
            (x_cont, x_player, x_team, x_status, x_position),
            dim=-1
        )

        x_lags = torch.cat((
            self._as_sequence(scores_lags, len(scores_cols)),
            self._as_sequence(games_lags, len(long_games_features)),
            self._as_sequence(standings_lags, len(standings_cols)),
            self._as_sequence(season_stats_lags, len(season_stats_cols)),
        ), dim=-1)
        x_rnn = self.rnn_emb(x_lags)

        rnn_out, _ = self.rnn(x_rnn)

        # The convolution treats the time steps as channels and slides over
        # the hidden dimension.
        ar_out_cnn = self.conv1(rnn_out)

        # Sequence summary: last GRU step, mean over steps, mean of conv output.
        # (A previous intermediate `x` built from the GRU mean alone was
        # overwritten before use and has been dropped.)
        ar_out = torch.cat((
            rnn_out[:, -1, :],
            torch.mean(rnn_out, dim=1),
            torch.mean(ar_out_cnn, dim=1),
        ), dim=-1)

        x = torch.cat((x_static, ar_out), dim=-1)
        return torch.cat((
            self.ff_layers1(x),
            self.ff_layers2(x),
            self.ff_layers3(x),
            self.ff_layers4(x),
        ), dim=-1)

    def configure_optimizers(self):
        """AdamW with a ReduceLROnPlateau scheduler driven by `val_loss`."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.wd)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5),
                'monitor': 'val_loss'
            }
        }

    def _clipped_predictions(self, batch):
        """Run the model on a batch dict and clip the outputs to [0, 100]."""
        raw = self(
            batch['cont_feats'], batch['cat_feats'], batch['scores_lags'],
            batch['games_lags'], batch['standings_lags'], batch['season_stats_lags']
        )
        return torch.clip(raw, min=0., max=100.)

    def training_step(self, batch, batch_idx):
        """L1 loss between clipped predictions and the targets."""
        loss = self.train_loss_fn(self._clipped_predictions(batch), batch['y'])

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the per-target L1 losses and their mean as `val_loss`."""
        y = batch['y']
        logits = self._clipped_predictions(batch)

        loss_1 = self.val_loss_fn(logits[:, 0], y[:, 0])
        loss_2 = self.val_loss_fn(logits[:, 1], y[:, 1])
        loss_3 = self.val_loss_fn(logits[:, 2], y[:, 2])
        loss_4 = self.val_loss_fn(logits[:, 3], y[:, 3])
        loss = (loss_1 + loss_2 + loss_3 + loss_4) / 4

        self.log('target_1', loss_1, on_epoch=True, logger=True)
        self.log('target_2', loss_2, on_epoch=True, logger=True)
        self.log('target_3', loss_3, on_epoch=True, logger=True)
        self.log('target_4', loss_4, on_epoch=True, logger=True)
        self.log('val_loss', loss, on_epoch=True, prog_bar=True, logger=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return clipped predictions for a batch.

        `dataloader_idx` now has a default so the step can also be invoked
        without it; passing it explicitly works as before.
        """
        return self._clipped_predictions(batch)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    A (max_len, 1, d_model) table is precomputed with sines in the even
    feature indices and cosines in the odd ones, and registered as the
    non-trainable buffer `pe`.

    Args:
        d_model: feature width of the inputs.
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=len(LAGS)+1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd index than even index, so the
        # cosine block is trimmed to fit (previously this raised a shape
        # error). For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first `x.size(0)` positions and apply dropout.

        The table is sliced along dim 0 of `x`.
        NOTE(review): this assumes `x` is laid out as (seq_len, batch, d_model)
        -- confirm against the caller.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class MLBTransformer(pl.LightningModule):
    """Transformer-encoder LightningModule with four feed-forward output heads.

    Four categorical embeddings and the continuous features are repeated across
    the ``len(LAGS) + 1`` steps, concatenated with the lagged score, game,
    standings and season-stat features, projected to ``hidden_dim`` and passed
    through a masked ``nn.TransformerEncoder``. The encoder output goes through
    ``self.conv``, is averaged over dimension 1 and fed to four heads whose
    outputs are concatenated along the last dimension.

    Every constructor argument is stored through ``save_hyperparameters()``.

    NOTE(review): ``batch_size`` and ``conv_drop_p`` are accepted but never read
    in this class; ``conv_layer`` is called with the literal ``0.1``.
    """

    def __init__(
        self, max_epochs, batch_size, lr, wd, hidden_dim, 
        dim_feedforward, nhead, encoder_drop, emb_drop_ps,
        num_layers, conv_drop_p, ff_layer_sizes, ff_drop
    ):
        super().__init__()
        self.save_hyperparameters()

        # Embedding table sizes are hard-coded rather than derived from the label
        # maps; presumably they must match the shapes stored in the checkpoints
        # this notebook loads -- TODO confirm before changing them.
        self.player_emb = nn.Sequential(
            nn.Embedding(1188, 32),
            nn.Dropout(emb_drop_ps[0])
        )
        self.team_emb = nn.Sequential(
            nn.Embedding(32, 8),
            nn.Dropout(emb_drop_ps[1])
        )
        self.status_emb = nn.Sequential(
            nn.Embedding(17, 6),
            nn.Dropout(emb_drop_ps[2])
        )
        self.position_emb = nn.Sequential(
            nn.Embedding(10, 4),
            nn.Dropout(emb_drop_ps[3])
        )

        # 132 is the width of the per-step vector that forward() concatenates.
        self.enc_emb = nn.Linear(132, hidden_dim)
        # The mask is a plain attribute, not a buffer: it stays out of the
        # state_dict and is NOT moved by .to()/.cuda(). forward() therefore
        # moves it to the device of its input on every call.
        self.mask = future_mask(len(LAGS)+1)

        self.pe = PositionalEncoding(hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=encoder_drop
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_layers
        )

        self.conv = conv_layer(0.1, 3)

        self.ff_layers1 = create_transformer_head(ff_layer_sizes, ff_drop, hidden_dim)
        self.ff_layers2 = create_transformer_head(ff_layer_sizes, ff_drop, hidden_dim)
        self.ff_layers3 = create_transformer_head(ff_layer_sizes, ff_drop, hidden_dim)
        self.ff_layers4 = create_transformer_head(ff_layer_sizes, ff_drop, hidden_dim)

        self.train_loss_fn = nn.L1Loss()
        self.val_loss_fn = nn.L1Loss()

    def forward(
        self, cont_feats, cat_feats, scores_lags, games_lags, standings_lags, season_stats_lags
    ):
        """Compute the concatenated outputs of the four heads.

        Args:
            cont_feats: continuous features; dimension 0 is the batch.
            cat_feats: categorical codes; columns 0-3 index the player, team,
                status and position embeddings respectively.
            scores_lags, games_lags, standings_lags, season_stats_lags: lagged
                features, each reshaped to ``(batch, len(LAGS) + 1, n_columns)``
                and reversed along dimension 1.

        Returns:
            The outputs of ``ff_layers1`` .. ``ff_layers4`` concatenated along
            the last dimension.
        """
        seq_len = len(LAGS) + 1
        bs = cont_feats.size(0)

        static_cat = torch.cat((
            self.player_emb(cat_feats[:, 0].long()),
            self.team_emb(cat_feats[:, 1].long()),
            self.status_emb(cat_feats[:, 2].long()),
            self.position_emb(cat_feats[:, 3].long()),
        ), dim=-1)

        def as_sequence(flat_lags, n_features):
            # Reshape to (bs, seq_len, n_features), then reverse along dimension 1.
            # NOTE(review): presumably this puts the oldest lag first -- confirm
            # against how the lag columns are ordered in `features`.
            return torch.flip(flat_lags.reshape(bs, seq_len, n_features), dims=(1,))

        inputs = self.enc_emb(torch.cat((
            # Static inputs are repeated (as views) for every step.
            static_cat.unsqueeze(1).expand(-1, seq_len, -1),
            cont_feats.unsqueeze(1).expand(-1, seq_len, -1),
            as_sequence(scores_lags, len(scores_cols)),
            as_sequence(games_lags, len(long_games_features)),
            as_sequence(standings_lags, len(standings_cols)),
            as_sequence(season_stats_lags, len(season_stats_cols)),
        ), dim=-1))

        # The positional encoding and the encoder work on (step, batch, feature).
        inputs = self.pe(inputs.permute(1, 0, 2))
        mask = self.mask.to(inputs.device)
        x = self.transformer(src=inputs, mask=mask).permute(1, 0, 2)
        x = self.conv(x)
        x = torch.mean(x, dim=1)

        return torch.cat((
            self.ff_layers1(x),
            self.ff_layers2(x),
            self.ff_layers3(x),
            self.ff_layers4(x),
        ), dim=-1)

    def _forward_batch(self, batch):
        """Run ``forward`` on a batch dict and return the raw (unclipped) outputs."""
        return self(
            batch['cont_feats'], batch['cat_feats'], batch['scores_lags'],
            batch['games_lags'], batch['standings_lags'], batch['season_stats_lags']
        )

    def configure_optimizers(self):
        """Return AdamW plus a per-epoch cosine-annealing schedule down to 1e-5."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.wd)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': torch.optim.lr_scheduler.CosineAnnealingLR(
                    optimizer, 
                    self.hparams.max_epochs,
                    eta_min=1e-5
                ),
                'interval': 'epoch',
                'monitor': 'val_loss'
            }
        }

    def training_step(self, batch, batch_idx):
        """Return the L1 loss between the clipped outputs and ``batch['y']``."""
        y = batch['y']
        logits = self._forward_batch(batch)
        # NOTE(review): clipping before the loss means outputs outside [0, 100]
        # receive zero gradient from this loss.
        loss = self.train_loss_fn(torch.clip(logits, min=0., max=100.), y)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the L1 loss of each of the four output columns and their mean."""
        y = batch['y']
        logits = torch.clip(self._forward_batch(batch), min=0., max=100.)

        losses = [self.val_loss_fn(logits[:, i], y[:, i]) for i in range(4)]
        loss = (losses[0] + losses[1] + losses[2] + losses[3]) / 4

        for i, target_loss in enumerate(losses, start=1):
            self.log(f'target_{i}', target_loss, on_epoch=True, logger=True)
        self.log('val_loss', loss, on_epoch=True, prog_bar=True, logger=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the outputs for one prediction batch, clipped to [0, 100].

        ``dataloader_idx`` defaults to 0 so the hook can also be called with only
        ``batch`` and ``batch_idx`` when a single dataloader is used.
        """
        return torch.clip(self._forward_batch(batch), min=0., max=100.)

In [None]:
# Checkpoint grid: seeds 1..10 crossed with 16 saved steps (1499, 1699, ..., 4499).
seeds = [*range(1, 11)]
rnn_steps = list(range(1499, 1499 + 200*16, 200))
transformer_steps = list(range(1499, 1499 + 200*16, 200))

# One model per (seed, step) pair; seeds vary slowest, steps fastest.
gru_models = [
    MLBGRU.load_from_checkpoint(f'../input/mlbplfinalmodels/final-ckpts/final-ckpts/rnn/rnn-model-seed{seed}-step_{step}.ckpt')
    for seed in seeds
    for step in rnn_steps
]
transformer_models = [
    MLBTransformer.load_from_checkpoint(f'../input/mlbplfinalmodels/final-ckpts/final-ckpts/transformer/transformer-model-seed{seed}-step_{step}.ckpt')
    for seed in seeds
    for step in transformer_steps
]

# The ensemble is every GRU checkpoint followed by every transformer checkpoint.
models = [*gru_models, *transformer_models]

# Fitted scalers saved at training time; the inference loop applies scalers[i]
# to numerical_cols[i].
scalers = joblib.load('../input/mlbplfinalmodels/rnn_scalers.pkl')

In [None]:
# Every feature that is not listed as categorical, in the order it appears in `features`.
numerical_cols = list(filter(lambda name: name not in categorical_cols, features))

In [None]:
import copy
import mlb


def _has_payload(cell):
    """Return False only for NaN (the one value that is not equal to itself)."""
    return cell == cell


def _parse_payload(cell):
    """Build a DataFrame from one JSON-text cell of the daily ``test_df``.

    NOTE(review): eval() executes whatever the string contains. It is kept because
    the module-level ``null``/``true``/``false`` aliases turn JSON nulls into NaN;
    only use it on payloads from the trusted competition API.
    """
    return pd.DataFrame(eval(cell))


def _blank_all_but(frame, key_col):
    """Overwrite every column except ``key_col`` with NaN (in place); return ``frame``."""
    for col in frame.columns:
        if col == key_col:
            continue
        frame[col] = np.nan
    return frame


def _flag_transactions(transactions):
    """Add gameDate / wasTraded / wasSigned and keep only rows that have a playerId."""
    transactions['gameDate'] = pd.to_datetime(transactions['date'])
    transactions['wasTraded'] = np.where(transactions['typeDesc'] == 'Trade', 1, 0)
    transactions['wasSigned'] = np.where(transactions['typeDesc'].isin(['Signed', 'Signed as Free Agent']), 1, 0)
    return transactions[transactions['playerId'].notnull()]


def _parse_followers(cell, value_name):
    """Parse a follower payload, drop 'date' and rename 'numberOfFollowers' to ``value_name``."""
    return _parse_payload(cell).drop(columns='date').rename(columns={
        'numberOfFollowers': value_name
    })


FE = []
SUB = []
LAST = data.loc[data['gameDate'] > pd.to_datetime('2020-12-31'), :].copy()
last_cumul_df = cum_stats[cum_stats['gameDate'] == cum_stats['gameDate'].max()].copy()

# Aliases that let eval() resolve the JSON literals found in the daily payloads.
null = np.nan
true = True
false = False

last_player_followers = pd.read_csv('../input/mlb-preprocessed-data/last_player_twitter_followers.csv').rename(columns={
    'numberOfFollowers': 'player_followers'
}).drop(columns='date')
last_team_followers = pd.read_csv('../input/mlb-preprocessed-data/last_team_twitter_followers.csv').rename(columns={
    'numberOfFollowers': 'team_followers'
}).drop(columns='date')
# NOTE(review): these two names are assigned but never read in this cell.
last_rosters = None
last_player_box_scores = None

player_target_stats = player_target_stats[
    (player_target_stats['lag_year'] == 2021) & (player_target_stats['lag_month'] == 8)
]

# freeze() is loop-invariant, so call it once here instead of once per model per batch.
for m in models:
    m.freeze()

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test: # make predictions here

    sub = copy.deepcopy(sample_prediction_df.reset_index())
    test, eval_dt = test_lag(sub)

    test['gameDate'] = eval_dt
    test['was_2020'] = 0
    test['year'] = test['gameDate'].dt.year
    test['month'] = test['gameDate'].dt.month
    test['day'] = test['gameDate'].dt.day

    # Dealing with missing values: when a payload is absent, fall back to an
    # example frame whose columns (except the merge key) are all NaN.
    rosters_cell = test_df['rosters'].iloc[0]
    if isinstance(rosters_cell, str):
        test_rosters = _parse_payload(rosters_cell)
        last_rosters = test_rosters
    else:
        test_rosters = _blank_all_but(
            pd.read_csv('../input/mlb-preprocessed-data/example_rosters.csv'), 'playerId'
        )

    games_cell = test_df['games'].iloc[0]
    if _has_payload(games_cell):
        test_games = process_games(_parse_payload(games_cell))
    else:
        test_games = process_games(pd.read_csv('../input/mlb-preprocessed-data/example_games.csv'))
        test_games = _blank_all_but(test_games, 'teamId')

    standings_cell = test_df['standings'].iloc[0]
    if _has_payload(standings_cell):
        test_standings = _parse_payload(standings_cell)
    else:
        test_standings = _blank_all_but(
            pd.read_csv('../input/mlb-preprocessed-data/example_standings.csv'), 'teamId'
        )

    transactions_cell = test_df['transactions'].iloc[0]
    if _has_payload(transactions_cell):
        test_transactions = _flag_transactions(_parse_payload(transactions_cell))
    else:
        test_transactions = _flag_transactions(
            pd.read_csv('../input/mlb-preprocessed-data/example_transactions.csv')
        )
        test_transactions = _blank_all_but(test_transactions, 'playerId')

    scores_cell = test_df['playerBoxScores'].iloc[0]
    if _has_payload(scores_cell):
        test_scores = _parse_payload(scores_cell)
    else:
        test_scores = _blank_all_but(
            pd.read_csv('../input/mlb-preprocessed-data/example_player_box_scores.csv'), 'playerId'
        )

    # Follower counts are carried forward from the last day that supplied them.
    player_followers_cell = test_df['playerTwitterFollowers'].iloc[0]
    if _has_payload(player_followers_cell):
        test_player_followers = _parse_followers(player_followers_cell, 'player_followers')
        last_player_followers = test_player_followers
    else:
        test_player_followers = last_player_followers

    team_followers_cell = test_df['teamTwitterFollowers'].iloc[0]
    if _has_payload(team_followers_cell):
        test_team_followers = _parse_followers(team_followers_cell, 'team_followers')
        last_team_followers = test_team_followers
    else:
        test_team_followers = last_team_followers

    # Collapse the box scores to one row per player.
    test_scores = test_scores.groupby('playerId').sum().reset_index()

    test = test.merge(players[players_cols], on='playerId', how='left')
    test = test.merge(test_rosters[rosters_cols].drop(columns='gameDate'), on='playerId', how='left')
    test = test.merge(test_scores[['playerId'] + scores_cols], on='playerId', how='left')
    test = test.merge(test_games.drop(columns='gameDate'), on='teamId', how='left')
    test = test.merge(test_standings[['teamId'] + standings_cols], on='teamId', how='left')
    test = test.merge(test_transactions[['playerId', 'wasSigned', 'wasTraded']].drop_duplicates(subset=['playerId']), on='playerId', how='left')
    test = test.merge(player_target_stats, how='left', on='playerId')
    test = test.merge(test_player_followers, how='left', on='playerId')
    test = test.merge(test_team_followers, how='left', on='teamId')
    test.drop(columns=['lag_year', 'lag_month'], inplace=True)

    # Roll the running season totals forward by today's box scores.
    test_cum_stats = last_cumul_df.merge(test_scores[['playerId'] + scores_cols], on='playerId', how='left').fillna(0)
    for c in scores_cols:
        test_cum_stats[f'{c}_season'] = test_cum_stats[f'{c}_season'] + test_cum_stats[c]
    test_cum_stats = test_cum_stats[['playerId'] + season_stats_cols]
    last_cumul_df = test_cum_stats.copy()
    test = test.merge(test_cum_stats, on='playerId', how='left')

    # NOTE(review): unlike teamId/status, the player and position codes have no
    # fallback for unmapped values; those become NaN here and 0 after fillna below.
    test['label_playerId'] = test['playerId'].fillna(-999).map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].fillna(-999).map(position2num)
    test['label_teamId'] = test['teamId'].fillna(-999).map(teamid2num).fillna(teamid2num[-999])
    test['label_status'] = test['status'].fillna(-999).map(status2num).fillna(status2num[-999])

    X_test = test.loc[:, features]

    X_test = X_test.fillna(0)
    # scalers[i] is applied to numerical_cols[i]; the two lists must stay aligned.
    for i, scaler in enumerate(scalers):
        X_test[numerical_cols[i]] = scaler.transform(X_test[numerical_cols[i]].values.reshape(-1,1))
    dm = MLBDataModule(X_test)
    with torch.no_grad():
        preds = []
        for batch in dm.predict_dataloader():
            cont_feats, cat_feats, scores_lags = batch['cont_feats'], batch['cat_feats'], batch['scores_lags']
            games_lags, standings_lags, season_stats_lags = batch['games_lags'], batch['standings_lags'], batch['season_stats_lags']
            # Unweighted mean of the raw outputs of every model in the ensemble.
            model_preds = 0.
            for m in models:
                model_preds += m(
                    cont_feats, cat_feats, scores_lags, 
                    games_lags, standings_lags, season_stats_lags
                ).detach().cpu() / len(models)
            preds.append(model_preds)

        # torch.cat takes the sequence itself (not unpacked tensors) and also
        # handles the single-batch case.
        preds = torch.cat(preds, dim=0).numpy()

    test[targets] = np.clip(preds, 0, 100)

    sub.drop(["date"]+TGTCOLS, axis=1, inplace=True)
    sub = sub.merge(test[['playerId']+targets], on="playerId", how="left")
    sub.drop("playerId", axis=1, inplace=True)
    sub = sub.fillna(0.)

    env.predict(sub)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    LAST = pd.concat([LAST, test.fillna(0)])
    LAST = LAST.drop_duplicates(subset=["gameDate","playerId"], keep="last")

In [None]:
# Cumulative season stats as left by the final iteration of the inference loop.
test_cum_stats

In [None]:
# First five rows of the feature frame built in the final iteration of the inference loop.
test.head(5)

In [None]:
# Frame accumulated across the inference loop (deduplicated on gameDate + playerId).
LAST

In [None]:
# First five rows of the last submission frame passed to env.predict.
sub.head(5)

In [None]:
# Last five rows of the accumulated frame.
LAST.tail(5)