In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime, timedelta
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit,cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.multioutput import MultiOutputRegressor
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout,LSTM,Bidirectional
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def json_to_df(df, column):
    """Expand a column of JSON strings into one concatenated DataFrame.

    Each non-missing cell of ``df[column]`` is parsed with ``pd.read_json`` and
    the resulting frames are stacked row-wise in the order they appear.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the JSON payloads.
    column : str
        Name of the column whose cells are JSON strings (or missing).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of every parsed payload. Each payload keeps its
        own index, so index labels repeat across payloads. An empty DataFrame
        is returned when no cell holds a payload.
    """
    # Local import keeps the function self-contained; StringIO lets read_json
    # treat the payload as a file-like object instead of a literal string,
    # which newer pandas releases warn about.
    from io import StringIO

    data_list = []
    # Iterate the column directly: df.iloc[row][column] would materialise a
    # whole row Series per iteration just to read one cell.
    for json_data in tqdm(df[column]):
        # Missing payloads show up as NaN (whose str() is "nan") or None.
        if json_data is None or str(json_data) == "nan":
            continue
        data_list.append(pd.read_json(StringIO(json_data)))

    # pd.concat raises ValueError on an empty list; hand back an empty frame.
    if not data_list:
        return pd.DataFrame()

    return pd.concat(data_list, axis=0)

def create_model(input_shape):
    """Build the (uncompiled) fully connected regression network.

    The network is three ReLU hidden layers of 120, 40 and 20 units, with
    dropout of 0.4 and 0.3 after the first two, followed by a 4-unit ReLU
    output layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed straight to ``Input(shape=...)``.

    Returns
    -------
    Model
        Keras functional model mapping the input to 4 outputs.
    """
    # (units, dropout rate applied after the layer or None for no dropout).
    # A wider 320-unit / 0.5-dropout front layer is left disabled.
    hidden_stack = [(120, 0.4), (40, 0.3), (20, None)]

    inputs = Input(shape=input_shape)
    x = inputs
    for units, drop_rate in hidden_stack:
        x = Dense(units, activation='relu')(x)
        if drop_rate is not None:
            x = Dropout(drop_rate)(x)

    outputs = Dense(4, activation='relu')(x)
    return Model(inputs=inputs, outputs=outputs)

# Checkpoints already loaded by prediction(), keyed by file path. Each entry
# stores the file's modification time so a rewritten checkpoint is reloaded.
_PREDICTION_MODEL_CACHE = {}


def _load_split_model(path):
    """Return the Keras model stored at `path`, hitting the disk only if the file changed."""
    modified = os.path.getmtime(path)
    entry = _PREDICTION_MODEL_CACHE.get(path)
    if entry is None or entry[0] != modified:
        entry = (modified, load_model(path))
        _PREDICTION_MODEL_CACHE[path] = entry
    return entry[1]


def prediction(df):
    """Predict the four targets for one batch of rows with the per-split models.

    Reads the notebook globals ``lag``, ``player_engagement``,
    ``target_columns``, ``feature_columns`` and ``splits``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date_playerId`` column formatted ``<date>_<playerId>``
        and the columns named in ``target_columns``. After ``reset_index()``
        it must expose a ``date`` column parseable with ``%Y%m%d``
        (presumably the index is named ``date`` -- confirm against the caller).

    Returns
    -------
    numpy.ndarray
        Array shaped like ``df[target_columns]``: the mean of the predictions
        of the ``splits`` saved models ``./best_model_split{k}.h5``.
    """
    df = df.reset_index()
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['playerId'] = df['date_playerId'].apply(lambda x: x.split('_')[1]).astype(int)

    # Step the date back one day at a time and pull in that day's engagement
    # rows; the suffix turns the merged target columns into target{k}_{x+1}.
    for x in range(lag):
        df['date'] = df['date'] - timedelta(days=1)
        df = df.merge(player_engagement, how='left', on=['date', 'playerId'], suffixes=['',f'_{x+1}'])
    # Days without a matching engagement row come back as NaN. Filling once
    # after all merges gives the same frame as filling after every merge.
    df = df.fillna(0.)

    # Collapse each target's lag window into summary statistics and drop the
    # raw lag columns.
    for x in range(4):
        columns = [f'target{x+1}_{i+1}' for i in range(lag)]
        df[f'target{x+1}_mean'] = df[columns].mean(axis=1)
        df[f'target{x+1}_median'] = df[columns].median(axis=1)
        df[f'target{x+1}_std'] = df[columns].std(axis=1)
        df[f'target{x+1}_lower_quartile'] = df[columns].quantile(0.25, axis=1)
        df[f'target{x+1}_upper_quartile'] = df[columns].quantile(0.75, axis=1)
        df[f'target{x+1}_IQR'] = df[f'target{x+1}_upper_quartile'] - df[f'target{x+1}_lower_quartile']
        df = df.drop(columns=columns)

    # The feature matrix is the same for every model, so build it once.
    features = df[feature_columns].to_numpy()
    pred = np.zeros(df[target_columns].shape)

    for x in range(splits):
        # Cached load: this function runs once per test batch, and re-reading
        # every checkpoint from disk each time is pure overhead.
        best_model = _load_split_model(f'./best_model_split{x+1}.h5')
        pred += best_model.predict(features) / splits

    return pred

In [None]:
# Player metadata table; DOB and mlbDebutDate are parsed as datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated in
# pandas 2.x (it only ever affected parsing speed, not the parsed values).
players = pd.read_csv("/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv",
                      parse_dates=["DOB", "mlbDebutDate"])

In [None]:
# Training table; its 'nextDayPlayerEngagement' column is unpacked with json_to_df below.
train_csv_path = "/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv"
train = pd.read_csv(train_csv_path)

In [None]:
# Unpack the per-day engagement JSON into one long frame with a fresh 0..n-1 index.
player_engagement = json_to_df(train, 'nextDayPlayerEngagement').reset_index(drop=True)

# Parse the metrics date once, store it back as datetime, and add a leading
# 'date' column holding the day before the metrics date.
metrics_date = pd.to_datetime(player_engagement['engagementMetricsDate'])
player_engagement['engagementMetricsDate'] = metrics_date
player_engagement.insert(0, 'date', metrics_date - timedelta(days=1))

print(player_engagement.shape)
player_engagement.head()

In [None]:
# Store the four target columns as float16.
engagement_targets = ['target1', 'target2', 'target3', 'target4']
player_engagement[engagement_targets] = player_engagement[engagement_targets].astype(np.float16)


In [None]:
# Number of past days attached to every row as lag features.
lag = 100

# Keep only rows dated at least `lag` days after the first row's date.
first_usable_date = player_engagement.loc[0, 'date'] + timedelta(lag)
lag_df = player_engagement.loc[player_engagement['date'] >= first_usable_date]

one_day = timedelta(days=1)
for step in tqdm(range(1, lag + 1)):
    # Join the engagement rows whose metrics date equals the current 'date';
    # their target columns arrive suffixed with the step number.
    lag_df = lag_df.merge(player_engagement, how='left',
                          left_on=['date', 'playerId'],
                          right_on=['engagementMetricsDate', 'playerId'],
                          suffixes=['', f'_{step}'])
    # The right-hand date columns are not needed once the join is done.
    lag_df = lag_df.drop(columns=[f'date_{step}', f'engagementMetricsDate_{step}'])
    # Move one day further back for the next step.
    lag_df['date'] = lag_df['date'] - one_day

# Undo the `lag` one-day shifts so 'date' is the row's original date again.
lag_df['date'] = lag_df['date'] + timedelta(days=lag)
# Drop the helper date column, then every row that still contains a NaN.
lag_df = lag_df.drop(columns=['engagementMetricsDate']).dropna()
lag_df.head()

In [None]:
# Every column after the first six is a feature
# (presumably the first six are date, playerId and the four targets -- confirm).
feature_columns = lag_df.columns[6:].tolist()

In [None]:
# Order rows by date, then player, and renumber the index 0..n-1.
date_player_order = ['date', 'playerId']
lag_df = lag_df.sort_values(by=date_player_order)
lag_df = lag_df.reset_index(drop=True)
lag_df.head()

In [None]:
# Overlay the feature values of the first 20 rows on one figure, printing
# each row's playerId as it is drawn.
plt.figure(figsize=(16, 9))
for row_label in tqdm(lag_df.index[:20]):
    row = lag_df.loc[row_label]
    print(row["playerId"])
    plt.plot(row[feature_columns].tolist())

In [None]:
# Replace each target's `lag` raw lag columns with six float32 summary
# statistics: mean, median, std, lower/upper quartile and their difference (IQR).
for target_no in tqdm(range(1, 5)):
    prefix = f'target{target_no}'
    lag_cols = [f'{prefix}_{day}' for day in range(1, lag + 1)]
    window = lag_df[lag_cols]

    lag_df[f'{prefix}_mean'] = window.mean(axis=1).astype(np.float32)
    lag_df[f'{prefix}_median'] = window.median(axis=1).astype(np.float32)
    lag_df[f'{prefix}_std'] = window.std(axis=1).astype(np.float32)
    lag_df[f'{prefix}_lower_quartile'] = window.quantile(0.25, axis=1).astype(np.float32)
    lag_df[f'{prefix}_upper_quartile'] = window.quantile(0.75, axis=1).astype(np.float32)
    lag_df[f'{prefix}_IQR'] = lag_df[f'{prefix}_upper_quartile'] - lag_df[f'{prefix}_lower_quartile']

    # The raw lag columns are no longer needed once summarised.
    lag_df = lag_df.drop(columns=lag_cols)

In [None]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,StackingRegressor

In [None]:
# Base learners for the stacked ensemble.
# NOTE(review): newer scikit-learn releases renamed criterion "mae" -- confirm
# this spelling is accepted by the installed version.
random_forest = RandomForestRegressor(n_estimators=10, criterion="mae")
light_gbm = LGBMRegressor(n_estimators=10, max_depth=10)
linear = LinearRegression()

estimators = [
    ('rf', random_forest),
    # ('xgb', XGBRegressor(n_estimators=100)),
    ('lgb', light_gbm),
    ("lr", linear),
]

# One stacked regressor (XGBoost as the final estimator) wrapped by
# MultiOutputRegressor.
stacked = StackingRegressor(estimators=estimators,
                            final_estimator=XGBRegressor(n_estimators=100))
reg = MultiOutputRegressor(stacked)

In [None]:
# Columns 2..5 (0-based) are the prediction targets; everything after them is a feature.
target_columns = lag_df.columns[2:6].tolist()
feature_columns = lag_df.columns[6:].tolist()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Number of time-ordered folds; one checkpoint file is written per fold and
# prediction() later averages exactly this many models.
splits = 5

tss = TimeSeriesSplit(n_splits=splits)

# TimeSeriesSplit yields positional indices, so slice plain arrays with them
# instead of relying on lag_df's index labels being 0..n-1.
features = lag_df[feature_columns].to_numpy()
targets = lag_df[target_columns].to_numpy()

for split, (train_index, val_index) in enumerate(tqdm(tss.split(lag_df), total=splits), start=1):
    X_train, y_train = features[train_index], targets[train_index]
    print(X_train.shape,y_train.shape)

    # Validation data is passed as ndarrays, consistent with the training data.
    X_val, y_val = features[val_index], targets[val_index]

    input_shape = (X_train.shape[1],)

    model = create_model(input_shape)
    model.compile(
        optimizer='adam',
        loss='mean_absolute_error'
    )
    #     es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    # Keep only the epoch with the lowest validation loss for this fold.
    mc = ModelCheckpoint(f'best_model_split{split}.h5', monitor='val_loss', mode='min',
                         save_best_only=True,verbose=1)

    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_val, y_val),
                        epochs=10,
                        batch_size=30000,
                        callbacks=[mc])

    # One figure per fold. Previously an empty 16x9 figure was opened at the
    # top of the loop and the curves were drawn on a second, default-size one.
    plt.figure(figsize = (16,9))
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.ylabel('loss')
    plt.xlabel('epochs')
    plt.title(f'Training-Validation Loss Split-{split}')
    plt.legend(['train_loss', 'val_loss'], loc='upper right')
    plt.show()


In [None]:
# prediction() merges player_engagement on ['date', 'playerId'], so the
# metrics-date column is removed before inference. errors='ignore' keeps this
# cell re-runnable: a second run no longer raises KeyError.
player_engagement = player_engagement.drop(columns=['engagementMetricsDate'], errors='ignore')

In [None]:
import mlb

# `lag` and `splits` are deliberately NOT reassigned here. prediction() loads
# ./best_model_split1..{splits}.h5, and the training cell writes one checkpoint
# per split; overriding splits with a larger value (it used to be set to 10
# while training used 5) makes load_model fail on files that were never saved.
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test:
    targets = prediction(sample_prediction_df)
    # Clamp predictions to the [0, 100] range before submitting them.
    sample_prediction_df[target_columns] = np.clip(targets, 0, 100)
    env.predict(sample_prediction_df)