In [None]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgbm

from matplotlib import pyplot as plt 

from sklearn.metrics import mean_absolute_error

from datetime import datetime, timedelta
from tqdm.auto import tqdm

# Create Unnested Dataset

In [None]:
# Location of the competition training file (one row per date, nested JSON columns).
TRAIN_CSV_PATH = '../input/mlb-player-digital-engagement-forecasting/train_updated.csv'

df_train = pd.read_csv(TRAIN_CSV_PATH)
print(df_train.shape)
df_train.head()

In [None]:
# Summarise the raw training frame: column dtypes, non-null counts and memory usage.
df_train.info()

In [None]:
def json_to_df(df, column):
    """Expand a column of JSON strings into a single concatenated DataFrame.

    Every non-null cell of ``df[column]`` is parsed with ``pd.read_json`` and
    the parsed frames are stacked row-wise. Null cells are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the nested JSON column.
    column : str
        Name of the column whose cells are JSON strings.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the parsed cells. The per-cell indices are
        kept as-is, so the result index is generally not unique. An empty
        DataFrame is returned when no cell could be parsed.
    """
    from io import StringIO

    data_list = []
    # Iterate over the column directly; ``df.iloc[row][column]`` would build a
    # full row Series on every iteration just to read one cell.
    for json_data in tqdm(df[column], total=len(df)):
        if pd.isna(json_data):
            continue
        # Wrapping in StringIO avoids the deprecated "literal JSON string"
        # code path of pd.read_json.
        data_list.append(pd.read_json(StringIO(json_data)))

    if not data_list:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    return pd.concat(data_list, axis=0)

In [None]:
# Flatten the nested next-day engagement JSON into one row per (date, player).
player_engagement = json_to_df(df_train, 'nextDayPlayerEngagement').reset_index(drop=True)

# Parse the metrics date once and reuse it for both date columns.
metrics_date = pd.to_datetime(player_engagement['engagementMetricsDate'])
player_engagement['engagementMetricsDate'] = metrics_date
# 'date' is the day before the engagement was measured; keep it as the first column.
player_engagement.insert(0, 'date', metrics_date - timedelta(days=1))

print(player_engagement.shape)
player_engagement.head()

In [None]:
# Store the four engagement targets in half precision (presumably to save memory).
engagement_targets = ['target1', 'target2', 'target3', 'target4']
player_engagement[engagement_targets] = player_engagement[engagement_targets].astype(np.float16)

# Create Lag Features

In [None]:
lag = 7

# Keep only rows that are at least `lag` days after the first row's date.
first_usable_date = player_engagement.loc[0, 'date'] + timedelta(lag)
lag_df = player_engagement.loc[player_engagement['date'] >= first_usable_date]

for step in tqdm(range(1, lag + 1)):
    # Attach the targets whose engagementMetricsDate equals the (shifted) row
    # date; colliding right-hand columns receive the suffix `_<step>`.
    lag_df = (
        lag_df
        .merge(player_engagement, how='left',
               left_on=['date', 'playerId'],
               right_on=['engagementMetricsDate', 'playerId'],
               suffixes=['', f'_{step}'])
        .drop(columns=[f'date_{step}', f'engagementMetricsDate_{step}'])
    )
    # Step the join key one day further back for the next lag.
    lag_df['date'] = lag_df['date'] - timedelta(days=1)

# Undo the `lag` one-day shifts applied inside the loop.
lag_df['date'] = lag_df['date'] + timedelta(days=lag)
# Rows with any missing value (e.g. a missing lag) are discarded.
lag_df = lag_df.drop(columns=['engagementMetricsDate']).dropna()
lag_df.head()

In [None]:
# Every column from position 6 onwards is treated as a model feature.
# NOTE(review): assumes the first six columns are date, playerId, target1..target4 — confirm.
feature_columns = list(lag_df.columns[6:])
feature_columns

In [None]:
# Summarise lag_df after the lag merge: column dtypes, non-null counts and memory usage.
lag_df.info()

In [None]:
# Order rows chronologically, then by player, and rebuild a clean 0..n-1 index.
sort_keys = ['date', 'playerId']
lag_df = lag_df.sort_values(by=sort_keys)
lag_df = lag_df.reset_index(drop=True)
lag_df.head()

# Create Descriptive Statistics Based on Lag Features

In [None]:
# Replace the raw lag columns of each target with row-wise summary statistics.
for target_no in range(1, 5):
    prefix = f'target{target_no}'
    lag_cols = [f'{prefix}_{day}' for day in range(1, lag + 1)]
    window = lag_df[lag_cols]  # the `lag` lagged values of this target, one row per sample

    lag_df[f'{prefix}_median'] = window.median(axis=1).astype(np.float32)
    lag_df[f'{prefix}_mean'] = window.mean(axis=1).astype(np.float32)
    lag_df[f'{prefix}_max'] = window.max(axis=1).astype(np.float32)
    lag_df[f'{prefix}_min'] = window.min(axis=1).astype(np.float32)
    lag_df[f'{prefix}_lower_quartile'] = window.quantile(0.25, axis=1).astype(np.float32)
    lag_df[f'{prefix}_upper_quartile'] = window.quantile(0.75, axis=1).astype(np.float32)
    lag_df[f'{prefix}_skewness'] = window.skew(axis=1).astype(np.float32)

    # The individual lag values are no longer needed once summarised.
    lag_df = lag_df.drop(columns=lag_cols)

In [None]:
# Preview lag_df now that the raw lag columns have been replaced by summary statistics.
lag_df.head()

In [None]:
# (rows, columns) of the table used for modelling.
lag_df.shape

In [None]:
# Columns at positions 2..5 are used as the prediction targets.
# NOTE(review): assumes these are target1..target4 — confirm against lag_df.head().
target_columns = list(lag_df.columns[2:6])
target_columns

In [None]:
# Recompute the feature list: every column from position 6 onwards
# (at this point the per-target summary statistics).
feature_columns = list(lag_df.columns[6:])
feature_columns

# Train LightGBM Model

In [None]:
def lgbm_fit(X_train, y_train, X_val, y_val, params,
             early_stopping_rounds=100, log_period=100):
    """Fit a LightGBM regressor with early stopping and score it on the validation set.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_val, y_val
        Validation features and target; used both as the early-stopping
        evaluation set and for the returned score.
    params : dict
        Keyword arguments forwarded to ``lgbm.LGBMRegressor``.
    early_stopping_rounds : int, default 100
        Stop when the validation metric has not improved for this many rounds.
    log_period : int, default 100
        Print the validation metric every ``log_period`` rounds.

    Returns
    -------
    (model, score)
        The fitted ``LGBMRegressor`` and its mean absolute error on the
        validation set.
    """
    model = lgbm.LGBMRegressor(**params)
    eval_set = [(X_val, y_val)]

    if hasattr(lgbm, 'log_evaluation'):
        # Newer LightGBM releases removed the `early_stopping_rounds` and
        # `verbose` keywords from fit(); callbacks are the supported way.
        model.fit(
            X_train,
            y_train,
            eval_set=eval_set,
            callbacks=[
                lgbm.early_stopping(early_stopping_rounds),
                lgbm.log_evaluation(log_period),
            ],
        )
    else:
        # Older LightGBM releases without the log_evaluation callback.
        model.fit(
            X_train,
            y_train,
            eval_set=eval_set,
            early_stopping_rounds=early_stopping_rounds,
            verbose=log_period,
        )

    pred = model.predict(X_val)

    # scikit-learn's signature is (y_true, y_pred).
    score = mean_absolute_error(y_val, pred)

    return model, score

In [None]:
# Time-based hold-out: rows dated before the split date train the models,
# rows on or after it are used for validation.
split_date = datetime(2021, 5, 1)
is_train = (lag_df['date'] < split_date).to_numpy()
is_val = (lag_df['date'] >= split_date).to_numpy()

train_index = lag_df.index[is_train].to_numpy()
val_index = lag_df.index[is_val].to_numpy()

# Features are passed to LightGBM as plain arrays; targets stay as DataFrames
# so that a single target column can be picked by name.
X_train = lag_df.loc[train_index, feature_columns].to_numpy()
X_val = lag_df.loc[val_index, feature_columns].to_numpy()

y_train = lag_df.loc[train_index, target_columns]
y_val = lag_df.loc[val_index, target_columns]

Some of the parameter values below are copied from cell 14 of this [notebook](https://www.kaggle.com/lhagiimn/lightgbm-catboost-ann-2505f2) by [lhagiimn](https://www.kaggle.com/lhagiimn).

In [None]:
# Hyper-parameters shared by all four target models.
# Entries that are commented out are alternatives that are currently disabled.
params = {
    'boosting_type': 'gbrt',
    'objective': 'mae',
    # 'subsample': 0.5,
    # 'subsample_freq': 1,
    'learning_rate': 0.03,
    'num_leaves': 2**11 - 1,
    'min_data_in_leaf': 2**12 - 1,
    # 'feature_fraction': 0.5,
    'max_bin': 200,
    'n_estimators': 2500,
    # 'boost_from_average': False,
    "random_seed": 42,
}

# Fit one regressor per engagement target on the same feature matrix.
fits = [
    lgbm_fit(X_train, y_train[name], X_val, y_val[name], params)
    for name in ('target1', 'target2', 'target3', 'target4')
]
(lgbm_model1, score1), (lgbm_model2, score2), (lgbm_model3, score3), (lgbm_model4, score4) = fits

# Unweighted mean of the four per-target validation MAEs.
score = (score1 + score2 + score3 + score4) / 4
print(f'Overall MAE Score:{score}')

# Score noted by the author: 0.7293

# Plotting Prediction Result

In [None]:
def plot_target_pred(playerId, model, target=1, val_start=datetime(2021, 5, 1)):
    """Plot one player's actual target series with the model's hold-out predictions.

    Parameters
    ----------
    playerId : int
        Player whose rows are selected from the global ``lag_df``.
    model
        Fitted regressor exposing ``predict``; it is fed the global
        ``feature_columns`` of the selected rows.
    target : int, default 1
        Which ``target<N>`` column of ``lag_df`` is drawn as the actual series.
    val_start : datetime, default 2021-05-01
        Predictions are drawn only for rows dated on or after this value.
    """
    player_rows = lag_df[lag_df['playerId'] == playerId]
    holdout_rows = player_rows[player_rows['date'] >= val_start]

    fig, ax = plt.subplots(figsize=(20, 6))
    ax.plot(player_rows['date'], player_rows[f'target{target}'], label=f'target{target}')

    if len(holdout_rows) > 0:
        # The models in this notebook are fit on plain arrays, so predict on
        # an array as well instead of a DataFrame.
        pred = model.predict(holdout_rows[feature_columns].to_numpy())
        ax.plot(holdout_rows['date'], pred, label='pred')
    # else: the player has no rows in the hold-out window; draw actuals only
    # rather than calling predict on an empty matrix.

    ax.set_xlabel('date')
    ax.set_ylabel('target')
    ax.set_title(f'playerId {playerId}')
    ax.legend(loc='upper right')
    plt.show()

In [None]:
# Plot actual vs. predicted values of all four targets for one example player.
playerId = 593590

fitted_models = (lgbm_model1, lgbm_model2, lgbm_model3, lgbm_model4)
for target_no, fitted_model in enumerate(fitted_models, start=1):
    plot_target_pred(playerId, fitted_model, target=target_no)

# Target Inference

In [None]:
def prediction(df):
    """Build lag-statistic features for a submission frame and predict all four targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Submission frame for one test date. After ``reset_index`` it must
        expose a ``date`` column in ``%Y%m%d`` form and a ``date_playerId``
        column of the form ``<date>_<playerId>``.
        NOTE(review): the lag suffixing below also relies on ``df`` already
        having ``target1..target4`` columns — confirm against the
        competition's sample prediction frame.

    Returns
    -------
    tuple of four numpy arrays
        Predictions of ``lgbm_model1`` .. ``lgbm_model4``, one value per row of ``df``.

    Notes
    -----
    Reads the notebook globals ``lag``, ``player_engagement``,
    ``feature_columns`` and ``lgbm_model1..4``.
    """
    target_names = [f'target{k}' for k in range(1, 5)]

    # Use only the join keys and targets of the history table, so the result
    # does not depend on which other columns player_engagement still carries.
    history = player_engagement[['date', 'playerId'] + target_names]

    df = df.reset_index()
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['playerId'] = df['date_playerId'].str.split('_').str[1].astype(int)

    for step in range(1, lag + 1):
        # Walk the join key back one day per step; the history targets found
        # for that day become target<N>_<step>.
        df['date'] = df['date'] - timedelta(days=1)
        df = df.merge(history, how='left', on=['date', 'playerId'], suffixes=['', f'_{step}'])
        # No history for that day/player -> treat the lag value as 0.
        df = df.fillna(0.)

    for name in target_names:
        lag_cols = [f'{name}_{k}' for k in range(1, lag + 1)]
        window = df[lag_cols]
        # Cast to float32 exactly as the training features are, so the model
        # sees the same numeric precision at inference time.
        df[f'{name}_median'] = window.median(axis=1).astype(np.float32)
        df[f'{name}_mean'] = window.mean(axis=1).astype(np.float32)
        df[f'{name}_max'] = window.max(axis=1).astype(np.float32)
        df[f'{name}_min'] = window.min(axis=1).astype(np.float32)
        df[f'{name}_lower_quartile'] = window.quantile(0.25, axis=1).astype(np.float32)
        df[f'{name}_upper_quartile'] = window.quantile(0.75, axis=1).astype(np.float32)
        df[f'{name}_skewness'] = window.skew(axis=1).astype(np.float32)
        df = df.drop(columns=lag_cols)

    # Build the feature matrix once and reuse it for all four models.
    features = df[feature_columns].to_numpy()

    target1_pred = lgbm_model1.predict(features)
    target2_pred = lgbm_model2.predict(features)
    target3_pred = lgbm_model3.predict(features)
    target4_pred = lgbm_model4.predict(features)

    return target1_pred, target2_pred, target3_pred, target4_pred

In [None]:
# Remove the metrics-date column before inference. errors='ignore' keeps this
# cell safe to re-run after the column has already been dropped.
player_engagement = player_engagement.drop(columns=['engagementMetricsDate'], errors='ignore')

In [None]:
import mlb

env = mlb.make_env()  # initialize the environment
iter_test = env.iter_test()  # iterator which loops over each date in test set

submission_targets = ('target1', 'target2', 'target3', 'target4')

for (test_df, sample_prediction_df) in iter_test:
    predicted = prediction(sample_prediction_df)
    # Clip every prediction into [0, 100] before writing it into the submission frame.
    for target_name, values in zip(submission_targets, predicted):
        sample_prediction_df[target_name] = np.clip(values, 0, 100)
    env.predict(sample_prediction_df)