In [28]:
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm
import gc
from functools import reduce
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle


In [2]:
def make_df(df, col, bool_in=False):
    """Expand a column of stringified record lists into one flat DataFrame.

    Every non-null cell of ``df[col]`` is parsed as a Python expression that
    is expected to evaluate to a list of records. The per-row lists are
    concatenated and the combined records become the rows of the result.

    Side effect: ``col`` is dropped from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the nested column; it is mutated (``col`` is removed).
    col : str
        Name of the column to unpack.
    bool_in : bool, default False
        When True, the substrings ``false`` / ``true`` are rewritten to the
        quoted strings ``"False"`` / ``"True"`` before parsing, so they come
        back as strings, not booleans.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record; an empty frame when ``col`` has no
        non-null cells.
    """
    from itertools import chain  # local import keeps this cell self-contained

    tp = df.loc[~df[col].isnull(), [col]].copy()
    df.drop(col, axis=1, inplace=True)

    # "null" is not a Python name, so it is swapped for an empty string
    # literal. This is a plain substring replacement, so it also hits "null"
    # inside longer words.
    tp[col] = tp[col].str.replace("null", '""')
    if bool_in:
        tp[col] = tp[col].str.replace("false", '"False"')
        tp[col] = tp[col].str.replace("true", '"True"')

    # SECURITY NOTE(review): eval() executes whatever expression the cell
    # contains. Only use this on trusted data; consider json.loads or
    # ast.literal_eval if the input can be attacker-controlled.
    parsed = tp[col].apply(lambda x: eval(x))

    # chain.from_iterable concatenates the row lists in linear time and yields
    # an empty list (not the integer 0 that Series.sum() returns) when there
    # are no rows.
    records = list(chain.from_iterable(parsed))
    gc.collect()
    return pd.DataFrame(records)
#===============

In [26]:
# Experiment number; it is embedded in the file names of the saved
# out-of-fold predictions and the pickled models.
EXP_NUM = 6

In [4]:

# NOTE(review): DATA_DIR is an absolute path on the original author's machine;
# point it at your local copy of the data before running the notebook.
DATA_DIR = Path("/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/")
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting'
TRAIN_DIR = MAIN_DATA_DIR / 'train'

# Artifacts (OOF csv, pickled models) are written here. Create the directory
# up front so the save cells do not fail when it does not exist yet.
OUTPUT_DIR = Path('./output/')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Alias of the main dataset directory.
ROOT_DIR = MAIN_DATA_DIR


## UTILITY FUNCTIONS

In [5]:
#=======================#
def flatten(df, col):
    """Pivot ``col`` to wide form: one row per playerId, one column per EvalDate.

    The value columns are named ``f"{col}_{EvalDate}"`` and ``playerId`` is
    returned as an ordinary column (the column axis carries no name).
    """
    wide = df.pivot(index="playerId", columns="EvalDate", values=col)
    wide = wide.add_prefix(f"{col}_")
    # Remove the "EvalDate" label left on the column axis by pivot.
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()
#============================#
def reducer(left, right):
    """Join two frames on their shared ``playerId`` column (pandas' default inner join)."""
    merged = pd.merge(left, right, on="playerId")
    return merged
#========================

In [6]:
# Names of the four engagement target columns.
TGTCOLS = ["target1", "target2", "target3", "target4"]


def train_lag(df, lag=1):
    """Append the four targets as they were ``lag`` days earlier.

    A copy of the key and target columns has its ``EvalDate`` moved forward by
    ``lag`` days and is left-joined back onto ``df`` by
    (``playerId``, ``EvalDate``). The joined columns are named
    ``f"{target}_{lag}"``; rows with no matching earlier date get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``playerId``, ``EvalDate`` and the ``TGTCOLS`` columns.
    lag : int, default 1
        Number of days to look back.

    Returns
    -------
    pandas.DataFrame
        ``df`` with four additional lagged-target columns.
    """
    keys = ["playerId", "EvalDate"]
    shifted = df[keys + TGTCOLS].copy()
    # Shifting the date forward lines each past value up with the row that
    # should see it as history.
    shifted["EvalDate"] = shifted["EvalDate"] + timedelta(days=lag)
    return df.merge(shifted, how="left", on=keys, suffixes=["", f"_{lag}"])
#=================================
def test_lag(sub):
    """Build the wide lag-feature frame for the single date contained in ``sub``.

    Reads the module-level ``LAST`` history frame, the ``LAGS`` list and
    ``TGTCOLS``. Side effect: an integer ``playerId`` column is added to
    ``sub``.

    Parameters
    ----------
    sub : pandas.DataFrame
        Must expose ``date_playerId`` and ``date`` columns, with exactly one
        distinct ``date`` value (enforced by an assert).

    Returns
    -------
    tuple
        ``(features, eval_dt)`` where ``features`` has one row per playerId
        and columns ``f"{target}_{lag}"``, and ``eval_dt`` is the parsed date.
    """
    # The player id is the second "_"-separated field of date_playerId.
    sub["playerId"] = sub["date_playerId"].apply(lambda s: int(s.split("_")[1]))
    assert sub.date.nunique() == 1
    raw_date = sub["date"].unique()[0]
    eval_dt = pd.to_datetime(raw_date, format="%Y%m%d")

    # One calendar date per lag, plus the reverse lookup date -> lag.
    lag_dates = []
    lag_of_date = {}
    for k in LAGS:
        day = eval_dt + timedelta(days=-k)
        lag_dates.append(day)
        lag_of_date[day] = k

    # The window bounds are taken from the last and first entries, so this
    # relies on LAGS being in ascending order.
    in_window = LAST.EvalDate.between(lag_dates[-1], lag_dates[0])
    history = LAST.loc[in_window, ["EvalDate", "playerId"] + TGTCOLS].copy()
    # Replace each date by its lag number so flatten() names the columns
    # "<target>_<lag>".
    history["EvalDate"] = history["EvalDate"].map(lag_of_date)

    per_target = [flatten(history, col) for col in TGTCOLS]
    features = reduce(reducer, per_target)
    return features, eval_dt
#===============

In [7]:
%%time
#tr = pd.read_csv(f"{ROOT_DIR}/train.csv")
# Read nextDayPlayerEngagement_train.csv from TRAIN_DIR and report its shape.
tr = pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv')
print(tr.shape)
# As the last expression of the cell, the collect() return value is displayed.
gc.collect()

(2506176, 8)
CPU times: user 1.16 s, sys: 71.4 ms, total: 1.23 s
Wall time: 1.23 s


0

In [8]:
# Preview the frame as loaded from the csv.
tr

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,index,date
0,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294,0,20180101
1,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118,0,20180101
2,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667,0,20180101
3,2018-01-02,607625,0.006700,2.675097,0.005168,1.862745,0,20180101
4,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373,0,20180101
...,...,...,...,...,...,...,...,...
2506171,2021-05-01,451661,0.000000,0.013314,0.000000,0.625925,1215,20210430
2506172,2021-05-01,519301,0.000131,0.003329,0.000000,0.216229,1215,20210430
2506173,2021-05-01,527055,0.000000,0.019971,0.000000,0.273131,1215,20210430
2506174,2021-05-01,543484,0.000131,0.056586,0.000000,1.024240,1215,20210430


In [9]:
# EvalDate is the day before engagementMetricsDate; EvalYear is its calendar year.
tr["EvalDate"] = pd.to_datetime(tr["engagementMetricsDate"]) - timedelta(days=1)
tr["EvalYear"] = tr["EvalDate"].dt.year

In [10]:
# Preview the frame with the EvalDate / EvalYear columns added.
tr

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,index,date,EvalDate,EvalYear
0,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294,0,20180101,2018-01-01,2018
1,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118,0,20180101,2018-01-01,2018
2,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667,0,20180101,2018-01-01,2018
3,2018-01-02,607625,0.006700,2.675097,0.005168,1.862745,0,20180101,2018-01-01,2018
4,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373,0,20180101,2018-01-01,2018
...,...,...,...,...,...,...,...,...,...,...
2506171,2021-05-01,451661,0.000000,0.013314,0.000000,0.625925,1215,20210430,2021-04-30,2021
2506172,2021-05-01,519301,0.000131,0.003329,0.000000,0.216229,1215,20210430,2021-04-30,2021
2506173,2021-05-01,527055,0.000000,0.019971,0.000000,0.273131,1215,20210430,2021-04-30,2021
2506174,2021-05-01,543484,0.000131,0.056586,0.000000,1.024240,1215,20210430,2021-04-30,2021


In [11]:
# Per-player, per-year median of each target, renamed to tgt{1..4}_med.
# NOTE(review): the median is taken over every row of the (player, year)
# group, so it includes dates later than a given row's EvalDate.
MEDCOLS = ["tgt1_med", "tgt2_med", "tgt3_med", "tgt4_med"]
MED_DF = (
    tr.groupby(["playerId", "EvalYear"])[TGTCOLS]
    .median()
    .reset_index()
    .rename(columns=dict(zip(TGTCOLS, MEDCOLS)))
)

In [12]:
# First rows of the per-player, per-year median table.
MED_DF.head()

Unnamed: 0,playerId,EvalYear,tgt1_med,tgt2_med,tgt3_med,tgt4_med
0,112526,2018,0.151508,4.901809,0.528752,13.437293
1,112526,2019,0.033293,1.117953,0.129707,10.568848
2,112526,2020,0.021525,1.468556,0.041698,8.448668
3,112526,2021,0.007505,0.477795,0.043267,9.549732
4,134181,2018,0.706118,5.399749,0.733436,6.923528


In [13]:
# Lag horizons in days: 1 through 30.
LAGS = [*range(1, 31)]
# Lagged-target feature names "<target>_<lag>", largest lag first; within one
# lag the targets follow TGTCOLS order.
FECOLS = [f"{col}_{lag}" for lag in LAGS[::-1] for col in TGTCOLS]

In [14]:
# LAGS

In [15]:
%%time
# Attach one set of lagged target columns per lag. Each iteration rebinds `tr`
# to the wider frame returned by train_lag.
# NOTE(review): because `tr` is overwritten here, this cell is presumably not
# safe to re-run without reloading `tr` first — confirm before re-executing.
for lag in tqdm(LAGS):
    tr = train_lag(tr, lag=lag)
    gc.collect()
#===========
tr = tr.sort_values(by=["playerId", "EvalDate"])
print(tr.shape)
# Drop every row that has a NaN in any column; the left-joined lag columns are
# NaN wherever no row existed `lag` days earlier.
tr = tr.dropna()
print(tr.shape)
# Add the per-player, per-year median columns (inner join on playerId, EvalYear).
tr = tr.merge(MED_DF, on=["playerId","EvalYear"])
gc.collect()

100%|██████████| 30/30 [00:46<00:00,  1.56s/it]


(2506176, 130)
(2444346, 130)
CPU times: user 39.2 s, sys: 13 s, total: 52.2 s
Wall time: 52.1 s


26

In [16]:
# Preview the training frame with lag and median columns attached.
tr

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,index,date,EvalDate,EvalYear,...,target3_29,target4_29,target1_30,target2_30,target3_30,target4_30,tgt1_med,tgt2_med,tgt3_med,tgt4_med
0,2018-02-01,112526,0.012440,7.627666,0.154062,10.096154,30,20180131,2018-01-31,2018,...,0.030486,8.541353,0.055277,5.496109,0.025839,16.176471,0.151508,4.901809,0.528752,13.437293
1,2018-02-02,112526,0.022832,10.447761,0.544581,14.858590,31,20180201,2018-02-01,2018,...,0.032613,10.490112,0.060625,3.252914,0.030486,8.541353,0.151508,4.901809,0.528752,13.437293
2,2018-02-03,112526,3.464973,65.886940,29.350621,100.000000,32,20180202,2018-02-02,2018,...,0.087422,19.091467,0.029341,1.648352,0.032613,10.490112,0.151508,4.901809,0.528752,13.437293
3,2018-02-04,112526,100.000000,100.000000,57.540311,9.115387,33,20180203,2018-02-03,2018,...,0.024759,6.643880,0.014799,2.665894,0.087422,19.091467,0.151508,4.901809,0.528752,13.437293
4,2018-02-05,112526,35.323637,39.702176,19.891216,38.005663,34,20180204,2018-02-04,2018,...,0.035855,12.147134,0.083916,1.161002,0.024759,6.643880,0.151508,4.901809,0.528752,13.437293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2444341,2021-04-27,685503,0.044617,1.224728,0.009437,0.737463,1211,20210426,2021-04-26,2021,...,3.501559,2.694346,2.847250,48.264984,0.547288,6.944986,0.002099,2.916638,0.012967,0.932605
2444342,2021-04-28,685503,0.019123,1.178880,0.013161,0.790301,1212,20210427,2021-04-27,2021,...,2.089865,1.529100,5.333101,6.400267,3.501559,2.694346,0.002099,2.916638,0.012967,0.932605
2444343,2021-04-29,685503,0.015799,4.323489,0.002350,0.970273,1213,20210428,2021-04-28,2021,...,3.089282,0.741931,1.282354,8.539095,2.089865,1.529100,0.002099,2.916638,0.012967,0.932605
2444344,2021-04-30,685503,0.018770,31.946021,0.305491,5.938273,1214,20210429,2021-04-29,2021,...,0.285499,0.693112,0.161984,14.469882,3.089282,0.741931,0.002099,2.916638,0.012967,0.932605


In [17]:
# Feature matrix (lag + median columns), target matrix, and per-row player ids
# as NumPy arrays.
X = tr[FECOLS + MEDCOLS].to_numpy()
y = tr[TGTCOLS].to_numpy()
cl = tr["playerId"].to_numpy()

In [18]:
NFOLDS = 5
# Unshuffled stratified split with playerId as the stratification label; the
# (train_idx, valid_idx) pairs are materialised so they can be indexed by fold.
skf = StratifiedKFold(n_splits=NFOLDS)
folds = list(skf.split(X, cl))

In [19]:
# (rows, features) of the training matrix.
X.shape

(2444346, 124)

In [20]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    """Fit one LightGBM regressor with early stopping and score it on the validation split.

    Parameters
    ----------
    x_train, y_train
        Training features and a single target column.
    x_valid, y_valid
        Validation features and target; used as the early-stopping eval set
        and for the reported score.
    params : dict, optional
        Keyword arguments for ``lgbm.LGBMRegressor``. ``None`` means
        LightGBM's defaults.
    verbose : int, default 100
        Used both as the logging period and as the early-stopping patience
        (in boosting rounds).

    Returns
    -------
    tuple
        ``(oof_pred, model, score)``: validation predictions clipped to
        [0, 100], the fitted model, and the mean absolute error of the clipped
        predictions.
    """
    # The signature defaults params to None; unpacking None would raise a
    # TypeError, so fall back to an empty dict.
    model = lgbm.LGBMRegressor(**(params or {}))
    # NOTE(review): the early_stopping_rounds / verbose keyword arguments of
    # fit() were removed in LightGBM 4.0 in favour of callbacks
    # (lgbm.early_stopping / lgbm.log_evaluation). They are kept here to match
    # the version this notebook was run with — confirm before upgrading.
    model.fit(x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=verbose,
        verbose=verbose)
    oof_pred = np.clip(model.predict(x_valid), 0, 100)
    # scikit-learn's argument order is (y_true, y_pred); MAE is symmetric, so
    # the value is unchanged.
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae:', score)
    return oof_pred, model, score

In [21]:
# Fitted models per target: modelKs[i] is the fold-i model for the K-th target.
model1s = []
model2s = []
model3s = []
model4s = []
# Out-of-fold predictions, same shape as y (rows x 4 targets).
oof = np.zeros(y.shape)

# LightGBM settings shared by every fold and every target. n_estimators is an
# upper bound only; fit_lgbm stops early on the validation split.
params = {
    'objective': 'mae',
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'n_estimators': 100000,
    'learning_rate': 0.1,
    'random_state': 42,
}

for idx in range(NFOLDS):
    tr_idx, val_idx = folds[idx]
    x_train, x_valid = X[tr_idx], X[val_idx]
    y_train, y_valid = y[tr_idx], y[val_idx]

    # One independent model per target column, trained in target order.
    fold_scores = []
    for tgt, model_store in enumerate((model1s, model2s, model3s, model4s)):
        oof_tgt, model, tgt_score = fit_lgbm(
            x_train, y_train[:, tgt],
            x_valid, y_valid[:, tgt],
            params
        )
        model_store.append(model)
        oof[val_idx, tgt] = oof_tgt
        fold_scores.append(tgt_score)

    # Fold score is the unweighted mean of the four per-target MAEs.
    score = sum(fold_scores) / len(fold_scores)
    print(f'score: {score}')

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.607379
[200]	valid_0's l1: 0.607447
Early stopping, best iteration is:
[106]	valid_0's l1: 0.607359
mae: 0.6073572193697094
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 1.36072
[200]	valid_0's l1: 1.3503
[300]	valid_0's l1: 1.34674
[400]	valid_0's l1: 1.3458
[500]	valid_0's l1: 1.34563
Early stopping, best iteration is:
[469]	valid_0's l1: 1.3456
mae: 1.3455223287734053
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.725924
[200]	valid_0's l1: 0.725924
[300]	valid_0's l1: 0.725924
Early stopping, best iteration is:
[216]	valid_0's l1: 0.725924
mae: 0.7259240303796948
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 0.69389
[200]	valid_0's l1: 0.692213
[300]	valid_0's l1: 0.691714
[400]	valid_0's l1: 0.690949
[500]	valid_0's l1: 0.690897
Early stopping, best iteration is:
[436]	valid_0's l1: 0.690629

## Neural Net Training

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [None]:
def make_model(n_in):
    """Build and compile a small fully connected regression network.

    Architecture: input of width ``n_in`` -> Dense(50, relu) "d1" ->
    Dense(50, relu) "d2" -> Dense(4, linear) "preds". The model is compiled
    with mean-absolute-error loss and the Adam optimizer.

    Parameters
    ----------
    n_in : int
        Number of input features.

    Returns
    -------
    The compiled Keras model, named "ANN".
    """
    inputs = L.Input(name="inputs", shape=(n_in,))
    hidden = inputs
    for layer_name in ("d1", "d2"):
        hidden = L.Dense(50, activation="relu", name=layer_name)(hidden)
    outputs = L.Dense(4, activation="linear", name="preds")(hidden)

    ann = M.Model(inputs, outputs, name="ANN")
    ann.compile(loss="mean_absolute_error", optimizer="adam")
    return ann

In [None]:
net = make_model(X.shape[1])
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
net.summary()

In [None]:
# Train one network per fold and collect its out-of-fold predictions.
# NOTE(review): this rebinds `oof`, the same name the LightGBM training cell
# fills and the later MAE / oof-export cells read. On a top-to-bottom run those
# cells would therefore see the network's predictions — confirm that is intended.
oof = np.zeros(y.shape)
nets = []
EPOCHS  = 10
for idx in range(NFOLDS):
    print("FOLD:", idx)
    tr_idx, val_idx = folds[idx]
    # Keep only the weights of the epoch with the lowest validation loss.
    ckpt = ModelCheckpoint(f"w{idx}.h5", monitor='val_loss', verbose=1, save_best_only=True,mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=3, min_lr=0.0005)
    es = EarlyStopping(monitor='val_loss', patience=6)
    # Fresh, untrained model for every fold.
    reg = make_model(X.shape[1])
    reg.fit(X[tr_idx], y[tr_idx], epochs=EPOCHS, batch_size=30_000, 
            validation_data=(X[val_idx], y[val_idx]),
            verbose=1, callbacks=[ckpt, reduce_lr, es])
    # Restore the checkpointed best weights before predicting on the held-out rows.
    reg.load_weights(f"w{idx}.h5")
    oof[val_idx] = reg.predict(X[val_idx], batch_size=50_000, verbose=1)
    nets.append(reg)
    gc.collect()
    #
#

In [None]:
#reg.fit(X, y, epochs=10, batch_size=30_000, validation_split=0.3)

In [23]:
# Mean absolute error of the out-of-fold predictions against all target columns.
mae = mean_absolute_error(y, oof)
# mse = mean_squared_error(y, oof, squared=False)
print("mae:", mae)
# print("mse:", mse)

mae: 0.7551545507640949


In [35]:
# Take an explicit copy: the frame is assigned into right after this cell, and
# writing into a column subset of `tr` would otherwise trigger pandas'
# SettingWithCopyWarning.
oof_df = tr[TGTCOLS+['playerId', 'date']].copy()

In [37]:
# Overwrite the target columns with the out-of-fold predictions. `oof` is
# indexed like X / y, which were built from `tr` row by row.
oof_df.loc[:, TGTCOLS] = oof

In [38]:
# Persist the out-of-fold predictions, tagged with the experiment number.
oof_df.to_csv(OUTPUT_DIR / f'oof{EXP_NUM}.csv', index=False)

In [29]:
# Stack the per-target model lists into a single array and pickle it.
# Presumably this yields an object array with one row per target and one
# column per fold — TODO confirm the resulting shape.
models = np.array([model1s, model2s, model3s, model4s])
with open(OUTPUT_DIR / f"models{EXP_NUM}.pickle", mode="wb") as f:
    pickle.dump(models, f)

In [None]:
# Historical information to use at prediction time: an independent copy of the
# rows whose EvalDate is strictly after 2021-01-01.
bound_dt = pd.to_datetime("2021-01-01")
LAST = tr.loc[tr.EvalDate>bound_dt].copy()

In [None]:
# Preview the history frame kept for prediction time.
LAST

In [None]:
# Medians for 2021 only; EvalYear is dropped so the table is keyed by playerId.
LAST_MED_DF = MED_DF.loc[MED_DF.EvalYear == 2021].drop(columns="EvalYear")
# Remove the `tr` name; cells above that read `tr` cannot be re-run after this
# without reloading it.
del tr

In [None]:
# Shapes of the history frame, the 2021 medians and the full median table.
LAST.shape, LAST_MED_DF.shape, MED_DF.shape

In [None]:
#nets[0].summary()

In [None]:
#"""
# Inference loop over the competition's test iterator.
# NOTE(review): `iter_test` is only assigned in the commented-out lines below,
# and nothing else in the visible notebook defines it, so as written this cell
# raises NameError on a fresh kernel. The prediction / submission steps are
# commented out as well, so the loop currently only builds features.
import mlb
FE = []; SUB = [];
# env = mlb.make_env() # initialize the environment
# iter_test = env.iter_test() # iterator which loops over each date in test set
# 
for (test_df, sub) in iter_test:
    # Features computation at Evaluation Date
    sub = sub.reset_index()
    sub_fe, eval_dt = test_lag(sub)
    # Attach the 2021 per-player medians; players without one get NaN, which
    # the fillna below turns into 0.
    sub_fe = sub_fe.merge(LAST_MED_DF, on="playerId", how="left")
    sub_fe = sub_fe.fillna(0.)
    
#     _preds = 0.
#     for reg in nets:
#         _preds += reg.predict(sub_fe[FECOLS + MEDCOLS]) / NFOLDS
#     sub_fe[TGTCOLS] = np.clip(_preds, 0, 100)
#     sub.drop(["date"]+TGTCOLS, axis=1, inplace=True)
#     sub = sub.merge(sub_fe[["playerId"]+TGTCOLS], on="playerId", how="left")
#     sub.drop("playerId", axis=1, inplace=True)
#     sub = sub.fillna(0.)
#     # Submit
#     env.predict(sub)
#     # Update Available information
#     sub_fe["EvalDate"] = eval_dt
#     #sub_fe.drop(MEDCOLS, axis=1, inplace=True)
#     LAST = LAST.append(sub_fe)
#     LAST = LAST.drop_duplicates(subset=["EvalDate","playerId"], keep="last")
#"""

In [None]:
# First rows of the feature frame built in the last loop iteration.
sub_fe.head()

In [None]:
# Shapes of the history frame and of the last feature frame.
LAST.shape, sub_fe.shape

In [None]:
#df_tr["dte"] = pd.to_datetime(df_tr["date"], format='%Y%m%d')