In [None]:
import pandas as pd
import numpy as np
import gc
import tensorflow as tf

from matplotlib import pyplot as plt 

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

from datetime import datetime, timedelta
from tqdm.auto import tqdm

In [None]:
# Raw competition training table; the nested JSON columns are expanded in later cells.
TRAIN_CSV_PATH = '../input/mlb-player-digital-engagement-forecasting/train.csv'

df_train = pd.read_csv(TRAIN_CSV_PATH)
print(df_train.shape)
df_train.head()

In [None]:
# Summarise the raw training frame (columns, dtypes, non-null counts).
df_train.info()

In [None]:
def json_to_df(df, column):
    """Expand a column of JSON strings into a single DataFrame.

    Every non-null cell of ``df[column]`` is parsed with ``pd.read_json`` and
    the resulting frames are concatenated row-wise. Null cells (NaN / None)
    are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the JSON-encoded column.
    column : str
        Name of the column whose cells are JSON documents.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all parsed cells; each parsed frame keeps its own
        index, so callers usually follow up with ``reset_index(drop=True)``.
        An empty DataFrame is returned when no cell holds data.
    """
    # Local import keeps the function self-contained; StringIO is needed because
    # newer pandas deprecates passing a literal JSON string to read_json.
    from io import StringIO

    data_list = []
    # Iterate the column directly: df.iloc[row][column] would materialise a
    # whole row Series on every step.
    for json_data in tqdm(df[column], total=len(df)):
        if pd.isna(json_data):
            continue
        data_list.append(pd.read_json(StringIO(json_data)))

    # pd.concat raises ValueError on an empty list (e.g. a day without games).
    if not data_list:
        return pd.DataFrame()

    return pd.concat(data_list, axis=0)

In [None]:
# Expand the nested engagement targets into one row per (metrics date, player).
player_engagement = json_to_df(df_train, 'nextDayPlayerEngagement').reset_index(drop=True)

# Parse the metrics date once and reuse it for both columns.
metrics_date = pd.to_datetime(player_engagement['engagementMetricsDate'])
player_engagement['engagementMetricsDate'] = metrics_date
# `date` is the day before the metrics date; it is kept as the first column.
player_engagement.insert(0, 'date', metrics_date - timedelta(days=1))

print(player_engagement.shape)
player_engagement.head()

In [None]:
# Store the four targets as float16 to shrink the frame before the lag merges.
player_engagement = player_engagement.astype(
    {target: np.float16 for target in ('target1', 'target2', 'target3', 'target4')}
)

In [None]:
# Expand the nested box scores into a flat frame.
playerBoxScores = json_to_df(df_train, 'playerBoxScores').reset_index(drop=True)

# Replace `gameDate` with a parsed `date` column placed first.
game_dates = pd.to_datetime(playerBoxScores.pop('gameDate'))
playerBoxScores.insert(0, 'date', game_dates)

print(playerBoxScores.shape)
playerBoxScores.head()

In [None]:
# Subset of box-score columns used downstream: the two merge keys
# ('date', 'playerId') followed by the stat columns that become model features.
playerBoxScores_columns = ['date',
                           'playerId',
                           'homeRuns',
                           'rbi',
                           'atBats',
                           'stolenBases',
                           'hits',
                           'runsScored',
                           'earnedRuns',
                           'hitsPitching',
                           'intentionalWalksPitching',
                           'strikeOuts',
                           'saves'
                          ]

In [None]:
# Number of past days of targets attached to every training row.
lag = 100

# Start from rows dated at least `lag` days after the first row of the frame.
first_date = player_engagement.loc[0, 'date']
lag_df = player_engagement.loc[player_engagement['date'] >= first_date + timedelta(lag)]

one_day = timedelta(days=1)
for x in tqdm(range(1, lag + 1)):
    # Match the (shifted) `date` against `engagementMetricsDate` to pull in the
    # targets of that day; they arrive with the `_{x}` suffix. The right-hand
    # date columns are duplicates of the keys and are dropped straight away.
    lag_df = lag_df.merge(
        player_engagement,
        how='left',
        left_on=['date', 'playerId'],
        right_on=['engagementMetricsDate', 'playerId'],
        suffixes=['', f'_{x}'],
    ).drop(columns=[f'date_{x}', f'engagementMetricsDate_{x}'])
    # Step one day further back for the next lag.
    lag_df['date'] = lag_df['date'] - one_day

# Undo the cumulative shift so `date` is the original row date again.
lag_df['date'] = lag_df['date'] + timedelta(days=lag)
# Rows missing any lag value are discarded.
lag_df = lag_df.drop(columns=['engagementMetricsDate']).dropna()
lag_df.head()

In [None]:
# Everything after the first six columns of lag_df (here: the lagged targets).
feature_columns = list(lag_df.columns[6:])
feature_columns

In [None]:
# Inspect dtypes and size of the lagged frame before collapsing the lag columns.
lag_df.info()

In [None]:
# Chronological order (ties broken by player) with a fresh 0..n-1 index.
lag_df = lag_df.sort_values(by=['date', 'playerId'], ignore_index=True)
lag_df.head()

In [None]:
# Collapse the `lag` lagged columns of each target into one per-row median.
for target_no in range(1, 5):
    lag_columns = [f'target{target_no}_{day}' for day in range(1, lag + 1)]
    row_median = lag_df[lag_columns].median(axis=1)
    lag_df[f'target{target_no}_median'] = row_median.astype(np.float32)
    # The individual lag columns are no longer needed once summarised.
    lag_df = lag_df.drop(columns=lag_columns)

In [None]:
# Aggregate box scores to one row per (date, playerId) before joining: if a
# player has several box-score rows for the same date, a plain left merge would
# duplicate that player's row (and its targets) in lag_df.
box_scores_daily = (
    playerBoxScores[playerBoxScores_columns]
    .groupby(['date', 'playerId'], as_index=False)
    .sum()
)

rows_before_merge = len(lag_df)
lag_df = lag_df.merge(box_scores_daily, how='left', on=['date', 'playerId'])
assert len(lag_df) == rows_before_merge, 'box-score merge must not add rows'

# Players without a box score on a given date get zeros for every stat.
lag_df = lag_df.fillna(0.)

In [None]:
# Preview the final modelling frame (targets, median features, box-score stats).
lag_df.head()

In [None]:
# Row / column count of the modelling frame.
lag_df.shape

In [None]:
def create_model(input_shape):
    """Build the (uncompiled) feed-forward regressor.

    Architecture: two blocks of ``Dense(50, relu)`` followed by
    ``Dropout(0.2)``, then a linear ``Dense(4)`` output layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to ``Input(shape=...)``.

    Returns
    -------
    tensorflow.keras.Model
        The assembled model; the caller is responsible for compiling it.
    """
    inputs = Input(shape=input_shape)

    hidden = inputs
    for _ in range(2):
        hidden = Dense(50, activation='relu')(hidden)
        hidden = Dropout(0.2)(hidden)

    outputs = Dense(4, activation='linear')(hidden)

    return Model(inputs=inputs, outputs=outputs)

In [None]:
# Columns 2..5 of lag_df hold the four regression targets.
target_columns = list(lag_df.columns[2:6])
target_columns

In [None]:
# Every column after the targets is a model input.
feature_columns = list(lag_df.columns[6:])
feature_columns

In [None]:
# Walk-forward cross-validation: one model per TimeSeriesSplit fold, each
# checkpointed to `best_model_split{n}.h5` at its lowest validation loss.
splits = 10

tss = TimeSeriesSplit(n_splits=splits)

# Extract the arrays once. TimeSeriesSplit yields *positional* indices, so
# indexing NumPy arrays is correct regardless of lag_df's index labels
# (label-based `.loc` is only equivalent on a clean 0..n-1 index).
X_all = lag_df[feature_columns].to_numpy()
y_all = lag_df[target_columns].to_numpy()

for split, (train_index, val_index) in enumerate(tss.split(X_all), start=1):
    # Release the previous fold's graph/layer state; without this Keras keeps
    # accumulating global state for every model created in the loop.
    tf.keras.backend.clear_session()
    gc.collect()

    X_train, y_train = X_all[train_index], y_all[train_index]
    X_val, y_val = X_all[val_index], y_all[val_index]

    input_shape = (X_train.shape[1],)

    model = create_model(input_shape)

    model.compile(
        optimizer='rmsprop',
        loss='mean_absolute_error'
    )

    # Keep only the epoch with the best validation loss for this fold.
    mc = ModelCheckpoint(f'best_model_split{split}.h5', monitor='val_loss', mode='min',
                         save_best_only=True, verbose=1)

    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_val, y_val),
                        epochs=10,
                        batch_size=30_000,
                        callbacks=[mc])

    fig, ax = plt.subplots()
    ax.plot(history.history['loss'])
    ax.plot(history.history['val_loss'])
    ax.set_ylabel('loss')
    ax.set_xlabel('epochs')
    ax.set_title(f'Training-Validation Loss Split-{split}')
    ax.legend(['train_loss', 'val_loss'], loc='upper right')
    plt.show()
    # Close the figure so ten open figures do not pile up in memory.
    plt.close(fig)

In [None]:
# Fold models loaded so far, keyed by checkpoint path. Re-running this cell
# empties the cache, which is required after retraining.
_fold_model_cache = {}


def _load_fold_model(fold):
    """Return the checkpointed model of `fold`, reading it from disk only once."""
    path = f'./best_model_split{fold}.h5'
    if path not in _fold_model_cache:
        _fold_model_cache[path] = load_model(path)
    return _fold_model_cache[path]


def prediction(df, test_df):
    """Predict the four targets for one batch of the submission API.

    Rebuilds the training features for every row of `df`: the per-target
    median over the previous `lag` days of `player_engagement`, plus the
    box-score stats found in `test_df`. The result is the average of the
    predictions of all `splits` fold models.

    Parameters
    ----------
    df : pandas.DataFrame
        Sample-prediction frame. After ``reset_index()`` it must expose a
        ``date`` column in ``%Y%m%d`` format (assumes the index is named
        ``date`` - confirm against the API), a ``date_playerId`` column of the
        form ``<date>_<playerId>`` and the target columns.
    test_df : pandas.DataFrame
        Daily data frame with a JSON ``playerBoxScores`` column (may be all
        null on days without games).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(target_columns))`` with one prediction
        row per row of `df`, in the same order.
    """
    df = df.reset_index()
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['playerId'] = df['date_playerId'].str.split('_').str[1].astype(int)

    # Only keys and targets are needed from the history; selecting them here
    # also makes the merge independent of any extra columns still present.
    history = player_engagement[['date', 'playerId'] + list(target_columns)]

    for x in range(lag):
        df['date'] = df['date'] - timedelta(days=1)
        df = df.merge(history, how='left', on=['date', 'playerId'], suffixes=['', f'_{x+1}'])
    # Undo the cumulative shift. Without this the box-score merge below would
    # compare dates `lag` days in the past and never match.
    df['date'] = df['date'] + timedelta(days=lag)

    for x in range(4):
        columns = [f'target{x+1}_{i+1}' for i in range(lag)]
        # Missing lags stay NaN so the median skips them instead of being
        # dragged towards zero; rows with no history at all are zero-filled below.
        df[f'target{x+1}_median'] = df[columns].median(axis=1).astype(np.float32)
        df = df.drop(columns=columns)

    if test_df['playerBoxScores'].notna().any():
        pbs_test = json_to_df(test_df, 'playerBoxScores').reset_index(drop=True)
        pbs_test.insert(0, 'date', pd.to_datetime(pbs_test['gameDate']))
        # One row per (date, playerId): several box-score rows for the same
        # player and date would otherwise duplicate prediction rows.
        pbs_test = (pbs_test[playerBoxScores_columns]
                    .groupby(['date', 'playerId'], as_index=False)
                    .sum())
        df = df.merge(pbs_test, how='left', on=['date', 'playerId'])
    else:
        # No games on this day: every box-score feature is zero.
        stat_columns = [c for c in playerBoxScores_columns if c not in ('date', 'playerId')]
        df = df.assign(**{c: 0. for c in stat_columns})

    df = df.fillna(0.)

    features = df[feature_columns].to_numpy()
    pred = np.zeros((len(df), len(target_columns)))

    for fold in range(1, splits + 1):
        pred += _load_fold_model(fold).predict(features) / splits

    return pred

In [None]:
# The metrics date is only needed while building the lag features.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
player_engagement = player_engagement.drop(columns=['engagementMetricsDate'], errors='ignore')

In [None]:
import mlb

# Competition API: the environment serves one test day at a time and expects a
# filled-in prediction frame back before it yields the next day.
env = mlb.make_env()

for test_df, sample_prediction_df in env.iter_test():
    raw_predictions = prediction(sample_prediction_df, test_df)
    # Keep submitted values inside the 0-100 range.
    sample_prediction_df[target_columns] = np.clip(raw_predictions, 0, 100)
    env.predict(sample_prediction_df)