In this kernel, I have compared **three of the best baseline kernels** we have seen in this competition so far. 

Being a beginner in modeling time-series data, I decided to learn from the experienced and set up a useful training+inference workflow in the process. Thus most of the credit for the code goes to the authors of those kernels. 

I have used the following baseline kernels:
* [Ubiquant Market Prediction with DNN](https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn/notebook) by [Lonnie](https://www.kaggle.com/lonnieqin).
* [Ubiquant RNN - Training Pipeline and Inference](https://www.kaggle.com/ravishah1/ubiquant-rnn-training-pipeline-and-inference/notebook) by  [Ravi Shah](https://www.kaggle.com/ravishah1).
* [Ubiquant LGBM Baseline](https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline) by [valley](https://www.kaggle.com/valleyzw).

This notebook uses [Weights and Biases](https://wandb.ai/site) to compare the three modeling techniques. If you are using these baselines, this kernel can be a good place to learn how W&B can be used.

# Imports and Setup

In [None]:
# IPython shell escape: quietly install/upgrade the Weights & Biases client used below.
!pip install -qq --upgrade wandb

In [None]:
import os
import gc
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from argparse import Namespace
from collections import defaultdict
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow import keras

from scipy import stats
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold

import lightgbm as lgb

import wandb
from wandb.lightgbm import log_summary, wandb_callback
from wandb.keras import WandbCallback

In [None]:
# Log in to W&B with the token stored in Kaggle Secrets (label: wandb_api).
# `anony` records the outcome: None after a successful login, "must" otherwise.
# NOTE(review): "must" is presumably meant to be passed as wandb.init(anonymous=anony).
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("wandb_api")
    wandb.login(key=secret_value_0)

    anony = None
except Exception:
    # Best-effort fallback: a missing kaggle_secrets module, a missing secret or a
    # rejected key all end up here. `except Exception` (instead of a bare `except`)
    # lets KeyboardInterrupt / SystemExit propagate.
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

In [None]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # Memory growth has to be configured identically on every GPU
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when the GPUs were already initialized before this setting was applied
        print(err)

# Load Dataset

We will be using the [Ubiquant Market Prediction half precision Pickle](https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-half-precision-pickle) by Lonnie. You can find more information about the creation of this dataset in this [kernel](https://www.kaggle.com/lonnieqin/reduce-the-dataset-to-1-8g).

In [None]:
# Load the half-precision training pickle into the notebook-wide DataFrame `df`.
# NOTE(review): read_pickle unpickles the file, so only load it from a dataset you trust.
df = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Display the first rows as the cell output.
df.head()

In [None]:
# Shared settings for the model sections below.
EPOCHS = 100        # upper bound on training epochs passed to Keras fit()
target = 'target'   # name of the label column
# Column names of the 300 anonymised features: f_0 ... f_299
features = [
    f'f_{i}'
    for i in range(300)
]

# DNN

Source: https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn/notebook

In [None]:
def make_dataset(feature, investment_id, y, batch_size=1024, mode="train"):
    """Wrap the inputs and labels in a batched, prefetching ``tf.data.Dataset``.

    Each element is ``((investment_id, feature), y)``. The data is shuffled
    (buffer of ``batch_size * 8`` elements) only when ``mode == "train"``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    is_training = mode == "train"
    if is_training:
        dataset = dataset.shuffle(buffer_size=batch_size * 8)
    batched = dataset.batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

In [None]:
def get_investment_lookup(investment_id_df):
    """Build an ``IntegerLookup`` layer adapted to the distinct investment ids.

    Returns:
        ``(lookup_layer, size)`` where ``size`` is the number of distinct ids
        plus one; it is also used as the layer's ``max_tokens``.
    """
    investment_ids = list(investment_id_df.unique())
    vocabulary_frame = pd.DataFrame({"investment_ids": investment_ids})
    investment_id_size = 1 + len(investment_ids)

    lookup = IntegerLookup(max_tokens=investment_id_size)
    lookup.adapt(vocabulary_frame)
    return lookup, investment_id_size

def get_model(investment_id_df):
    """Build and compile the two-branch DNN (investment-id embedding + 300 features).

    The id branch is lookup -> 32-d embedding -> 3 x Dense(64); the feature
    branch is 3 x Dense(256). Both are concatenated and passed through
    L2-regularised Dense layers (512, 128, 32) to a single linear output.
    Compiled with Adam(0.001), MSE loss and mse / rmse metrics.
    """
    id_input = Input((1, ), dtype=tf.uint16)
    feature_input = Input((300, ), dtype=tf.float16)

    lookup_layer, vocab_size = get_investment_lookup(investment_id_df)

    # Investment-id branch
    id_branch = lookup_layer(id_input)
    id_branch = Embedding(vocab_size, 32, input_length=1)(id_branch)
    id_branch = Reshape((-1, ))(id_branch)
    for _ in range(3):
        id_branch = Dense(64, activation='swish')(id_branch)

    # Feature branch
    feature_branch = feature_input
    for _ in range(3):
        feature_branch = Dense(256, activation='swish')(feature_branch)

    # Joint head
    head = Concatenate(axis=1)([id_branch, feature_branch])
    for units in (512, 128, 32):
        head = Dense(units, activation='swish', kernel_regularizer="l2")(head)
    prediction = Dense(1)(head)

    network = Model(inputs=[id_input, feature_input], outputs=[prediction])
    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    network.compile(
        optimizer=tf.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mse', rmse_metric],
    )
    return network

In [None]:
# Reset Keras' global state, then build one model purely to print its architecture.
tf.keras.backend.clear_session()
model = get_model(df["investment_id"])
model.summary()

In [None]:
# 5-fold cross-validation of the DNN, stratified on investment_id.
# Each fold trains a fresh model, logs to its own W&B run and saves the model to disk.
kfold = StratifiedKFold(5, shuffle=True, random_state=42)
models = []  # not populated in this cell; the trained models are written to DNN/models instead

early_stop = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

for fold, (train_indices, valid_indices) in enumerate(kfold.split(df[features], df["investment_id"])):
    # Prepare dataset. The splitter yields positional row numbers, so every
    # lookup goes through .iloc (investment_id was previously sliced by label).
    X_train, y_train = df[features].iloc[train_indices], df["target"].iloc[train_indices]
    X_val, y_val = df[features].iloc[valid_indices], df["target"].iloc[valid_indices]
    invest_train = df["investment_id"].iloc[train_indices]
    invest_val = df["investment_id"].iloc[valid_indices]

    # Get Dataloaders
    train_ds = make_dataset(X_train, invest_train, y_train)
    valid_ds = make_dataset(X_val, invest_val, y_val, mode="valid")

    # Get Model
    model = get_model(df["investment_id"])

    # Initialize W&B run. `anony` comes from the login cell: None when logged in,
    # "must" to fall back to an anonymous run when no API key was available.
    wandb.init(project='ubiquant_kaggle', group='DNN', job_type='train', anonymous=anony)

    # Train model
    _ = model.fit(train_ds,
                  epochs=EPOCHS,
                  validation_data=valid_ds,
                  callbacks=[WandbCallback(save_model=False), early_stop])

    # Evaluate on the held-out fold (valid_ds is not shuffled, so order matches y_val)
    preds = model.predict(valid_ds).ravel()

    # Save the model
    model.save(f'DNN/models/model_{fold}')

    # Out-of-fold scores
    rmse_score = np.sqrt(mean_squared_error(y_val.values, preds))
    pearson_score = stats.pearsonr(preds, y_val.values)[0]
    wandb.log({'oof_rmse': rmse_score})
    wandb.log({'oof_pearsonr': pearson_score})

    # Clear W&B run
    wandb.finish()

    del invest_train, invest_val, X_train, X_val, y_train, y_val, train_ds, valid_ds
    gc.collect()

In [None]:
# Upload the saved DNN fold models as a single W&B artifact.
# The handle is named `wandb_run` so it cannot overwrite the `run()` function
# defined in the LightGBM section when cells are re-executed out of order.
# `anony` comes from the login cell ("must" => anonymous run when no API key was found).
wandb_run = wandb.init(project='ubiquant_kaggle', group='DNN', job_type='save_model', anonymous=anony)
model_artifact = wandb.Artifact(name='Baseline_DNN', type='dnn_model')
model_artifact.add_dir('DNN/models')
wandb_run.log_artifact(model_artifact)
wandb.finish()

# RNN

Source: https://www.kaggle.com/ravishah1/ubiquant-rnn-training-pipeline-and-inference/notebook

In [None]:
def setup_cv(df, X, y, groups, splits=5):
    """Write a GroupKFold validation-fold number into ``df['fold']``.

    Args:
        df: DataFrame that receives the ``fold`` column (modified in place).
        X, y, groups: forwarded unchanged to ``GroupKFold.split``.
        splits: number of folds.

    Returns:
        The same ``df`` object, with ``fold`` set for every row that appears
        in a validation split (NaN elsewhere).
    """
    kf = GroupKFold(n_splits=splits)
    # split() yields positional row numbers. Collect them in a positional array
    # rather than using them as .loc labels, which is only correct when df has
    # a default 0..n-1 index.
    fold_ids = np.full(len(df), np.nan)
    for f, (_, valid_positions) in enumerate(kf.split(X=X, y=y, groups=groups)):
        fold_ids[valid_positions] = f
    df['fold'] = fold_ids

    return df

In [None]:
def get_rnn_v2():
    """Build and compile the LSTM regressor over the 300 input features.

    Pipeline: batch norm -> Dense(256) -> dropout -> reshape to a length-1
    sequence -> batch norm -> LSTM(128) -> LSTM(16) -> Dense(1).
    Compiled with Adam(lr=0.001), MSE loss and mse / rmse metrics.
    """
    inputs = Input(shape=(300,), name='300 feature input')

    # Layers are created and applied in this exact order.
    pipeline = [
        BatchNormalization(name='batch_norm1'),
        Dense(256, activation='swish', name='dense1'),
        Dropout(0.1, name='dropout1'),
        Reshape((1, -1), name='reshape1'),
        BatchNormalization(name='batch_norm2'),
        LSTM(128, dropout=0.3, recurrent_dropout=0.3, return_sequences=True, activation='relu', name='lstm1'),
        LSTM(16, dropout=0.1, return_sequences=False, activation='relu', name='lstm2'),
        Dense(1, name='output'),
    ]
    hidden = inputs
    for layer in pipeline:
        hidden = layer(hidden)

    network = Model([inputs], [hidden])

    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mse', rmse_metric],
    )
    return network

class UbiquantRNNV2:
    """Fold-wise trainer around the LSTM model returned by ``get_rnn_v2``.

    ``df`` must contain the feature columns, the target column and a ``fold``
    column; ``train_one_fold`` adds a ``preds`` column holding out-of-fold
    predictions.
    """

    def __init__(self, df: pd.DataFrame, feature_cols: list=None, target: str='target'):
        """Store the data frame and column names and build an initial model.

        Args:
            df: training data (kept by reference, not copied).
            feature_cols: feature column names; defaults to ``f_0`` ... ``f_299``.
            target: name of the label column.
        """
        self.model = get_rnn_v2()
        self.df = df

        if feature_cols is not None:
            self.feature_cols = feature_cols
        else:
            self.feature_cols = [f"f_{i}" for i in range(300)]

        self.target_col = target

    def train_one_fold(self, f: int, max_epochs=None, log_wandb=True):
        """Train a fresh model with fold ``f`` held out and score it out-of-fold.

        Args:
            f: fold number used for validation (rows with ``df.fold == f``).
            max_epochs: epoch limit; ``None`` falls back to the notebook-level
                ``EPOCHS`` (the value that was always used before).
            log_wandb: when True, open a W&B run, stream training metrics and
                log ``oof_rmse`` / ``oof_pearsonr``.
        """
        epochs = EPOCHS if max_epochs is None else max_epochs

        # Re-initialise the network for every fold. Reusing one model across folds
        # means later folds are validated on rows the weights were already trained
        # on, and the learning rate lowered by ReduceLROnPlateau would carry over.
        self.model = get_rnn_v2()

        valid_mask = self.df.fold == f
        X_train = self.df.loc[~valid_mask, self.feature_cols]
        X_valid = self.df.loc[valid_mask, self.feature_cols]

        y_train = self.df.loc[~valid_mask, self.target_col]
        y_valid = self.df.loc[valid_mask, self.target_col]

        callbacks = []
        if log_wandb:
            # `anony` comes from the login cell ("must" => anonymous run without API key).
            wandb.init(project='ubiquant_kaggle', group='RNN', anonymous=anony)
            # The W&B callback needs an active run, so only attach it when logging.
            callbacks.append(WandbCallback(save_model=False))
        callbacks.extend([
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min'),
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, mode='min', baseline=None, restore_best_weights=True),
        ])

        self.model.fit(X_train, y_train,
                       validation_data=(X_valid, y_valid),
                       batch_size=512, epochs=epochs,
                       callbacks=callbacks)

        oof = self.model.predict(X_valid)
        # Keep the out-of-fold predictions; oof_save() reads this column.
        self.df.loc[valid_mask, 'preds'] = oof.ravel()

        if log_wandb:
            wandb.log({'oof_rmse': self.compute_rmse(y_valid, oof)})
            wandb.log({'oof_pearsonr': self.compute_pearsonr(y_valid.values, oof.ravel())})
            wandb.finish()

        del X_train, X_valid, y_train, y_valid
        _ = gc.collect()

    def predict(self, X: np.ndarray):
        """Return the current model's predictions for ``X``."""
        preds = self.model.predict(X)
        return preds

    def save(self, path: str, f: int):
        """Save the current model as ``{path}/model_{f}.h5``, creating ``path`` if needed."""
        os.makedirs(path, exist_ok=True)
        self.model.save(f'{path}/model_{f}.h5')

    def oof_save(self):
        """Write the target and out-of-fold predictions to ``rnn_oof.csv``.

        Requires the ``preds`` column, which ``train_one_fold`` fills in.
        """
        self.df[['target', 'preds']].to_csv('rnn_oof.csv', index=False)

    def compute_rmse(self, y_true, y_pred):
        """Root mean squared error between ``y_true`` and ``y_pred``."""
        return np.sqrt(mean_squared_error(y_true, y_pred))

    def compute_pearsonr(self, y_true, y_pred):
        """Pearson correlation coefficient between ``y_true`` and ``y_pred``."""
        return pearsonr(y_pred, y_true)[0]

In [None]:
# Sorted random labels in [0, 1000) give contiguous row blocks of uneven size,
# which GroupKFold then keeps together. The generator is seeded so the RNN folds
# (and therefore the logged scores) are reproducible between runs.
uneven_group = np.sort(np.random.RandomState(42).randint(0, 1000, len(df)))
fold_df = setup_cv(df, df[features], df["investment_id"], uneven_group)

In [None]:
# Reset Keras' global state, then create the RNN trainer over the full DataFrame
# (default feature columns f_0..f_299 and target column 'target').
tf.keras.backend.clear_session()
train_ubiquant_rnn = UbiquantRNNV2(df)

In [None]:
# Train with each of the 5 folds held out in turn and save the model after
# every fold as RNN/models/model_<fold>.h5.
for fold in range(5):
    train_ubiquant_rnn.train_one_fold(fold)
    train_ubiquant_rnn.save('RNN/models', fold)

In [None]:
# Upload the saved RNN fold models as a single W&B artifact.
# The handle is named `wandb_run` so it cannot overwrite the `run()` function
# defined in the LightGBM section when cells are re-executed out of order.
# `anony` comes from the login cell ("must" => anonymous run when no API key was found).
wandb_run = wandb.init(project='ubiquant_kaggle', group='RNN', job_type='save_model', anonymous=anony)
model_artifact = wandb.Artifact(name='Baseline_RNN', type='rnn_model')
model_artifact.add_dir('RNN/models')
wandb_run.log_artifact(model_artifact)
wandb.finish()

# LightGBM

In [None]:
# Settings for the LightGBM section, exposed as attributes (args.seed, args.folds, ...).
args = Namespace(**{
    "debug": False,
    "seed": 21,
    "folds": 5,
    "workers": 4,
    "min_time_id": None,
    "num_bins": 16,
    "data_path": Path("parquets"),
})

In [None]:
# Per-investment first and last time_id, computed on the non-feature columns only.
time_id_df = (
    df.filter(regex=r"^(?!f_).*")
    .groupby("investment_id")
    .agg({"time_id": ["min", "max"]})
    .reset_index()
)
# Differencing across the (min, max) columns leaves max - min in the "max" column.
time_span_values = time_id_df["time_id"].diff(axis=1)["max"]
time_id_df["time_span"] = time_span_values

# Flatten the two-level column header, then attach time_span to every training row.
span_lookup = time_id_df.drop(columns="time_id").droplevel(level=1, axis=1)
train = df.merge(span_lookup, on="investment_id")
train.time_span.hist(bins=args.num_bins, figsize=(16,8))

del time_id_df, time_span_values, span_lookup
gc.collect()

In [None]:
# Assign LightGBM folds, stratified on the binned time_span of each row.
train["fold"] = -1
_target = pd.cut(train.time_span, args.num_bins, labels=False)
skf = StratifiedKFold(n_splits=args.folds)
for fold, (train_index, valid_index) in enumerate(skf.split(_target, _target)):
    train.loc[valid_index, 'fold'] = fold

# One time_span histogram per fold to eyeball that the folds look alike.
fig, axs = plt.subplots(nrows=args.folds, ncols=1, sharex=True, figsize=(16,8), tight_layout=True)
# The loop variable is deliberately NOT called `df`: that name is the notebook-wide
# dataset and would be silently replaced by the last fold's two-column slice.
for ax, (fold, fold_rows) in zip(axs, train[["fold", "time_span"]].groupby("fold")):
    ax.hist(fold_rows.time_span, bins=args.num_bins)
    ax.text(0, 40000, f"fold: {fold}, count: {len(fold_rows)}", fontsize=16)
plt.show()
del _target, train_index, valid_index, fold_rows
_=gc.collect()

In [None]:
# Feature lists for LightGBM: every column whose name contains "f_", plus investment_id
# (treated as categorical) and time_id.
cat_features = ["investment_id"]
num_features = train.filter(like="f_").columns.tolist()
features = [*num_features, *cat_features]

# time_span was only needed to build the folds; shrink the id / fold columns.
train = train.drop(columns="time_span")
for id_column in ("investment_id", "time_id"):
    train[id_column] = train[id_column].astype(np.uint16)
train["fold"] = train["fold"].astype(np.uint8)
gc.collect()
features.append("time_id") # https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302429
len(features)

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def feval_rmse(y_pred, lgb_train):
    """LightGBM custom metric: RMSE against the dataset's labels.

    Returns ``(metric_name, value, is_higher_better)`` with ``is_higher_better=False``.
    """
    labels = lgb_train.get_label()
    score = rmse(labels, y_pred)
    return 'rmse', score, False

# https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302480
def feval_pearsonr(y_pred, lgb_train):
    """LightGBM custom metric: Pearson correlation against the dataset's labels.

    Returns ``(metric_name, value, is_higher_better)`` with ``is_higher_better=True``.
    """
    labels = lgb_train.get_label()
    correlation = pearsonr(labels, y_pred)[0]
    return 'pearsonr', correlation, True

def run():
    """Train one LightGBM model per fold and collect out-of-fold predictions.

    Reads the notebook-level ``train``, ``features``, ``cat_features`` and ``args``.
    For every fold it trains with early stopping, dumps the booster to
    ``LGBM/lgbm_seed<seed>_<fold>.pkl``, writes the validation predictions into
    ``train['preds']`` and logs ``oof_rmse`` / ``oof_pearsonr`` to a W&B run.
    Finally writes the non-feature columns of ``train`` to ``preds.csv``.

    Returns:
        DataFrame with columns ``feature``, ``importance`` and ``fold``
        (one row per feature and fold).
    """
    params = {
        'learning_rate':0.05,
        "objective": "regression",
        "metric": "rmse",
        'boosting_type': "gbdt",
        'verbosity': -1,
        'n_jobs': -1, 
        'seed': args.seed,
        'lambda_l1': 2.7223413643193285e-08, 
        'lambda_l2': 0.009462714717237544, 
        'num_leaves': 108, 
        'feature_fraction': 0.5298125662824026, 
        'bagging_fraction': 0.7279540797730281, 
        'bagging_freq': 6, 
        'max_depth': 10, 
        'max_bin': 487, 
        'min_data_in_leaf': 158,
        'n_estimators': 1000, 
    }

    # joblib.dump does not create missing directories, so make sure the target exists.
    os.makedirs('LGBM', exist_ok=True)

    y = train['target']
    # Float sentinel for "no prediction yet" so the column does not have to be
    # upcast from int when the float predictions are written into it.
    train['preds'] = -1000.0
    scores = defaultdict(list)
    features_importance= pd.DataFrame()

    for fold in range(args.folds):
        print(f"=====================fold: {fold}=====================")
        trn_ind, val_ind = train.fold!=fold, train.fold==fold
        print(f"train length: {trn_ind.sum()}, valid length: {val_ind.sum()}")
        train_dataset = lgb.Dataset(train.loc[trn_ind, features], y.loc[trn_ind], categorical_feature=cat_features)
        valid_dataset = lgb.Dataset(train.loc[val_ind, features], y.loc[val_ind], categorical_feature=cat_features)

        # The return value is not bound to `run`, which would shadow this function's name.
        # `anony` comes from the login cell ("must" => anonymous run without API key).
        wandb.init(project='ubiquant_kaggle', group='LGBM', anonymous=anony)

        model = lgb.train(
            params,
            train_set = train_dataset, 
            valid_sets = [train_dataset, valid_dataset], 
            verbose_eval=100,
            early_stopping_rounds=50,
            feval = feval_pearsonr,
            callbacks=[wandb_callback()]
        )
        joblib.dump(model, f'LGBM/lgbm_seed{args.seed}_{fold}.pkl')

        preds = model.predict(train.loc[val_ind, features])
        train.loc[val_ind, "preds"] = preds

        # Score the fold once and reuse the values for the summary and for W&B.
        fold_rmse = rmse(y.loc[val_ind], preds)
        fold_pearsonr = pearsonr(y.loc[val_ind], preds)[0]
        scores["rmse"].append(fold_rmse)
        scores["pearsonr"].append(fold_pearsonr)

        fold_importance_df= pd.DataFrame({'feature': features, 'importance': model.feature_importance(), 'fold': fold})
        features_importance = pd.concat([features_importance, fold_importance_df], axis=0)

        wandb.log({'oof_rmse': fold_rmse,
                   'oof_pearsonr': fold_pearsonr})
        # Close this fold's run explicitly, as the DNN and RNN sections do.
        wandb.finish()

        del train_dataset, valid_dataset, model
        gc.collect()
    print(f"lgbm {args.folds} folds mean rmse: {np.mean(scores['rmse'])}, mean pearsonr: {np.mean(scores['pearsonr'])}")
    train.filter(regex=r"^(?!f_).*").to_csv("preds.csv", index=False)
    return features_importance

In [None]:
features_importance = run()
# Pooled out-of-fold score over every row that received a prediction
# (-1000 is the "no prediction" sentinel written by run()). This is a single
# score over all rows, not the mean of the per-fold scores that run() prints,
# so it is labelled accordingly.
oof_df = train[["target", "preds"]].query("preds!=-1000")
print(f"lgbm {args.folds} folds overall oof rmse: {rmse(oof_df.target, oof_df.preds)}, overall oof pearsonr: {pearsonr(oof_df.target, oof_df.preds)[0]}")
del oof_df, df, train
gc.collect()

In [None]:
# Upload the dumped LightGBM boosters as a single W&B artifact.
# The handle is named `wandb_run`: binding it to `run` would overwrite the
# `run()` training function defined above and break re-running that cell.
# `anony` comes from the login cell ("must" => anonymous run when no API key was found).
wandb_run = wandb.init(project='ubiquant_kaggle', group='LGBM', job_type='save_model', anonymous=anony)
model_artifact = wandb.Artifact(name='Baseline_LGBM', type='lgbm_model')
model_artifact.add_dir('LGBM/')
wandb_run.log_artifact(model_artifact)
wandb.finish()