# Model Ensemble DNN + TabNet
- TabNet: https://www.kaggle.com/wangqihanginthesky/baseline-tabnet/notebook
- DNN: https://www.kaggle.com/andrej0marinchenko/ubiquant-market-prediction-dnn

In [None]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl
!pip -q install ../input/talib-binary/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl

In [None]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from argparse import Namespace
from collections import defaultdict


import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from scipy import stats

import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, train_test_split

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer


import warnings
warnings.filterwarnings('ignore')
# Use the fully qualified option key: the short pattern 'max_columns' matches
# more than one registered option on newer pandas releases and then raises
# OptionError instead of setting the display width.
pd.set_option('display.max_columns', 64)

def seed_everything(seed: int = 42) -> None:
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Args:
        seed: Value applied to every seeded source.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    

In [None]:
# Run configuration for the DNN part of the notebook.
args = Namespace(
    INFER=True,
    debug=False,
    seed=21,
    folds=5,
    workers=4,
    min_time_id=None,
    holdout=True,
    num_bins=16,
    data_path=Path("../input/ubiquant-market-prediction-half-precision-pickle"),
    dnn_path='../input/dnnmodel',
    tabnet_path='../input/ubiquanttabnetbaseline',
)
seed_everything(args.seed)

# Debug runs set a minimum time_id of 1100.
if args.debug:
    args.min_time_id = 1100


## Import Dataset

In [None]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# Read from the configured data directory instead of repeating the literal
# path, so changing ``args.data_path`` actually changes what is loaded.
train = pd.read_pickle(args.data_path / 'train.pkl')
train.head()

In [None]:
# Remove the investment_id column from ``train`` and keep it separately;
# it is fed to the model as its own input.
investment_id = train.pop("investment_id")
investment_id.head()

In [None]:
# Drop time_id from ``train``; the popped column is discarded.
_ = train.pop("time_id")

In [None]:
# Remove the target column from ``train`` and keep it as the label vector.
y = train.pop("target")
y.head()

## Create IntegerLookup Layer for investment_id input

In [None]:
%%time
# Distinct investment ids present in the training data.
investment_ids = list(investment_id.unique())
# One more than the number of distinct ids; presumably this leaves room for
# the lookup layer's out-of-vocabulary index -- TODO confirm.
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
# Build the layer's vocabulary from the distinct ids.
investment_id_lookup_layer.adapt(pd.DataFrame({"investment_ids":investment_ids}))

## Make Tensorflow Dataset

In [None]:
import tensorflow as tf
def preprocess(X, y):
    """Return the ``(X, y)`` pair unchanged (pass-through map function)."""
    sample = (X, y)
    return sample
def make_dataset(feature, investment_id, y, batch_size=1024, mode="train"):
    """Build a batched ``tf.data.Dataset`` of ``((investment_id, feature), y)``.

    Args:
        feature: Feature rows, sliced along the first axis.
        investment_id: Investment ids aligned with ``feature``.
        y: Targets aligned with ``feature``.
        batch_size: Number of rows per batch.
        mode: ``"train"`` shuffles the rows; any other value keeps their order.

    Returns:
        A batched, prefetched dataset.
    """
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        # Do not cache after shuffling: a cache placed downstream of shuffle()
        # replays the first epoch's order, so later epochs would never be
        # reshuffled. The source tensors are already in memory, so the
        # training pipeline simply skips the cache.
        ds = ds.shuffle(4096).batch(batch_size)
    else:
        # Evaluation order is fixed, so caching whole batches is safe.
        ds = ds.batch(batch_size).cache()
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

## DNN Model

In [None]:
def get_model():
    """Build and compile the two-input Keras regression model.

    One branch embeds the looked-up investment id and passes it through four
    64-unit swish layers; the other passes the 300 features through four
    256-unit swish layers followed by dropout. The branches are concatenated
    and reduced by L2-regularised dense layers to a single output.

    Returns:
        The compiled ``tf.keras.Model`` (Adam, MSE loss).
    """
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)

    # Investment-id branch: lookup -> embedding -> flatten -> dense stack.
    id_branch = investment_id_lookup_layer(investment_id_inputs)
    id_branch = layers.Embedding(investment_id_size, 32, input_length=1)(id_branch)
    id_branch = layers.Reshape((-1, ))(id_branch)
    for _ in range(4):
        id_branch = layers.Dense(64, activation='swish')(id_branch)

    # Feature branch: dense stack followed by dropout.
    feature_branch = features_inputs
    for _ in range(4):
        feature_branch = layers.Dense(256, activation='swish')(feature_branch)
    feature_branch = layers.Dropout(0.65)(feature_branch)

    # Joint head on top of both branches.
    head = layers.Concatenate(axis=1)([id_branch, feature_branch])
    for units in (512, 128, 32, 32):
        head = layers.Dense(units, activation='swish', kernel_regularizer="l2")(head)
    head = layers.Dropout(0.75)(head)
    output = layers.Dense(1)(head)

    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
    model.compile(
        optimizer=tf.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mse', "mae", "mape", rmse],
    )
    return model

In [None]:
# Build one model instance to print its layer summary and draw its graph.
model = get_model()
model.summary()
keras.utils.plot_model(model, show_shapes=True)

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold

# Train the DNN on a split stratified by investment_id. Only the first fold
# is trained (see the ``break`` at the end of the loop body).
kfold = StratifiedKFold(2, shuffle=True, random_state=42)
models = []
for index, (train_indices, valid_indices) in enumerate(kfold.split(train, investment_id)):
    # The split yields positions, so every frame/series is sliced with iloc.
    X_train, X_val = train.iloc[train_indices], train.iloc[valid_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[valid_indices]
    investment_id_train = investment_id.iloc[train_indices]
    investment_id_val = investment_id.iloc[valid_indices]
    train_ds = make_dataset(X_train, investment_id_train, y_train)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, mode="valid")

    model = get_model()
    checkpoint = keras.callbacks.ModelCheckpoint(f"model_{index}", save_best_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    history = model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint, early_stop])

    # Keep and score the best checkpoint: it is the model used for inference,
    # whereas ``model`` holds the weights of the last epoch.
    best_model = keras.models.load_model(f"model_{index}")
    models.append(best_model)

    pearson_score = stats.pearsonr(best_model.predict(valid_ds).ravel(), y_val.values)[0]
    print('Pearson:', pearson_score)

    # One training-vs-validation curve per tracked metric.
    for title, metric in (("MSE", "mse"), ("MAE", "mae"), ("RMSE", "rmse")):
        pd.DataFrame(history.history, columns=[metric, f"val_{metric}"]).plot()
        plt.title(title)
        plt.show()

    # Release the per-fold copies before continuing.
    del investment_id_train
    del investment_id_val
    del X_train
    del X_val
    del y_train
    del y_val
    del train_ds
    del valid_ds
    gc.collect()
    break

In [None]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from argparse import Namespace
from collections import defaultdict

import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, train_test_split


import warnings
warnings.filterwarnings('ignore')
# Use the fully qualified option key: the short pattern 'max_columns' matches
# more than one registered option on newer pandas releases and then raises
# OptionError instead of setting the display width.
pd.set_option('display.max_columns', 64)

def seed_everything(seed: int = 42) -> None:
    """Apply ``seed`` to ``PYTHONHASHSEED``, NumPy's global RNG and ``random``.

    Args:
        seed: Value applied to every seeded source.
    """
    seed_text = str(seed)
    os.environ["PYTHONHASHSEED"] = seed_text
    np.random.seed(seed)
    random.seed(seed)
    
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * Signed-integer columns go to the smallest signed type whose range
      strictly contains the column's min and max.
    * Unsigned-integer columns go to the smallest unsigned type that holds
      the column's max. They are never routed through floats, which would
      silently round large ids.
    * Float columns go to float16, then float32, otherwise float64.
    * ``object`` columns become ``category``.
    * Every other dtype (bool, datetime, categorical, pandas extension
      dtypes) is left untouched.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.

    Returns:
        The same DataFrame object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            target = next(
                (t for t in signed_types
                 if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                None,
            )
        elif col_type.kind == 'u':
            # The lower bound is always 0 for unsigned data, so only the
            # maximum decides the width.
            target = next(
                (t for t in unsigned_types if c_max <= np.iinfo(t).max),
                None,
            )
        else:
            target = next(
                (t for t in float_types
                 if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64,
            )

        # ``None`` means no candidate fits (e.g. an empty integer column):
        # keep the current dtype.
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
import lightgbm as lgb

In [None]:
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer

In [None]:
# Run configuration for the TabNet part of the notebook.
args = Namespace(
    INFER=True,
    debug=False,
    seed=21,
    folds=5,
    workers=4,
    min_time_id=None,
    holdout=True,
    num_bins=16,
    data_path=Path("../input/ubiquant-parquet/"),
)
seed_everything(args.seed)

# Debug runs set a minimum time_id of 1100.
if args.debug:
    args.min_time_id = 1100

In [None]:
%%time
# Load the frame used by the TabNet section.
# NOTE(review): the file is named valid.parquet but is bound to ``train`` --
# confirm this is the intended input.
train = pd.read_parquet('../input/ubiquanttabnetbaseline/valid.parquet')
# Sanity checks: no missing values, and row_id must be "<time_id>_<investment_id>".
assert train.isnull().any().sum() == 0, "null exists."
assert train.row_id.str.extract(r"(?P<time_id>\d+)_(?P<investment_id>\d+)").astype(train.time_id.dtype).equals(train[["time_id", "investment_id"]]), "row_id!=time_id_investment_id"

# Optionally keep only rows at or after the configured minimum time_id.
if args.min_time_id is not None:
    train = train.query("time_id>=@args.min_time_id").reset_index(drop=True)
    gc.collect()
train.shape

In [None]:
# Per investment_id, take the first and last time_id. The filter keeps only
# columns whose names do not start with "f_".
time_id_df = (
    train.filter(regex=r"^(?!f_).*")
    .groupby("investment_id")
    .agg({"time_id": ["min", "max"]})
    .reset_index()
)
# time_span = max(time_id) - min(time_id): a column-wise diff across
# (min, max), read at the "max" position.
time_id_df["time_span"] = time_id_df["time_id"].diff(axis=1)["max"]
time_id_df.head(6)

In [None]:
# Attach time_span to every row: drop the (min, max) time_id columns, flatten
# the two-level column index, and join on investment_id.
train = train.merge(time_id_df.drop(columns="time_id").droplevel(level=1, axis=1), on="investment_id")
train.time_span.hist(bins=args.num_bins, figsize=(16,8))
del time_id_df
gc.collect()

In [None]:
# Optional hold-out split, stratified on binned time_span.
if args.holdout:
    # Bin time_span into args.num_bins bins and use the bin index as the
    # stratification label.
    _target = pd.cut(train.time_span, args.num_bins, labels=False)
    _train, _valid = train_test_split(_target, stratify=_target)
    print(f"train length: {len(_train)}", f"holdout length: {len(_valid)}")
    # NOTE(review): index labels are used as positions with iloc; this
    # assumes ``train`` has a default 0..n-1 index -- confirm.
    valid = train.iloc[_valid.index].sort_values(by=["investment_id", "time_id"]).reset_index(drop=True)
    train = train.iloc[_train.index].sort_values(by=["investment_id", "time_id"]).reset_index(drop=True)
    # Overlay both time_span histograms for comparison.
    train.time_span.hist(bins=args.num_bins, figsize=(16,8), alpha=0.8)
    valid.time_span.hist(bins=args.num_bins, figsize=(16,8), alpha=0.8)
    # Persist the hold-out rows without the helper column.
    valid.drop(columns="time_span").to_parquet("valid.parquet")
    del valid, _train, _valid, _target
    gc.collect()

In [None]:
# Assign a fold id to every row, stratified on binned time_span.
train["fold"] = -1
_target = pd.cut(train.time_span, args.num_bins, labels=False)
skf = StratifiedKFold(n_splits=args.folds)
for fold, (train_index, valid_index) in enumerate(skf.split(_target, _target)):
    # Each row is labelled with the fold in which it is a validation row.
    train.loc[valid_index, 'fold'] = fold
    
# One time_span histogram per fold, annotated with the fold's row count.
fig, axs = plt.subplots(nrows=args.folds, ncols=1, sharex=True, figsize=(16,8), tight_layout=True)
for ax, (fold, df) in zip(axs, train[["fold", "time_span"]].groupby("fold")):
    ax.hist(df.time_span, bins=args.num_bins)
    ax.text(0, 40000, f"fold: {fold}, count: {len(df)}", fontsize=16)
plt.show()
del _target, train_index, valid_index
_=gc.collect()

In [None]:
# Feature lists for TabNet: every column whose name contains "f_", then the
# categorical investment_id.
cat_features = ["investment_id"]
num_features = list(train.filter(like="f_").columns)
features = num_features + cat_features

# Drop the helper column, downcast, then fix the id and fold dtypes.
train = reduce_mem_usage(train.drop(columns="time_span"))
train[["investment_id", "time_id"]] = train[["investment_id", "time_id"]].astype(np.uint16)
train["fold"] = train["fold"].astype(np.uint8)
gc.collect()
# time_id is appended last, so it is the final column of the feature matrix.
features += ["time_id"] # https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302429
len(features)

In [None]:
# Feature frame without the id/target/time columns, and the target vector.
# NOTE(review): ``fold`` is not dropped here, so it remains a column of X -- confirm.
X = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``,
    which newer scikit-learn releases deprecate and then remove. For the
    single-output targets used here the result is the same.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error.

    Each error is divided by the corresponding ``y_true`` value before it is
    squared and averaged.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

class RMSPE(Metric):
    """TabNet evaluation metric: root mean squared percentage error."""

    def __init__(self):
        # Lower is better, so the metric is not maximised.
        self._maximize = False
        self._name = "rmspe"

    def __call__(self, y_true, y_score):
        """Return the RMSPE of ``y_score`` relative to ``y_true``."""
        relative_error = (y_true - y_score) / y_true
        return np.sqrt(np.mean(np.square(relative_error)))
    

def RMSPELoss(y_pred, y_true):
    """Return the root mean squared percentage error as a torch tensor.

    Note the argument order: predictions first, targets second.
    """
    relative_error = (y_true - y_pred) / y_true
    loss = torch.sqrt(torch.mean(relative_error ** 2))
    return loss.clone()



# Positions of the categorical columns within X.columns.
# NOTE(review): run() builds its matrices from train[features], not from X;
# confirm these positions match that column order.
cat_idxs = [ i for i, f in enumerate(X.columns.tolist()) if f in cat_features]


def run():
    """Train one TabNet regressor per fold and save out-of-fold predictions.

    For every fold in ``range(args.folds)`` the model is fitted on the rows of
    the other folds, saved as ``TabNet_seed{seed}_{fold}``, and used to predict
    the fold's own rows. The predictions are written to ``train['preds']``
    and the non-feature columns are exported to ``preds.csv``.

    Side effects:
        Adds or overwrites the ``preds`` column of the global ``train`` frame,
        writes the model files and ``preds.csv``, and prints per-fold
        progress plus the mean RMSE.
    """
    tabnet_params = dict(
        cat_idxs=cat_idxs,
        cat_emb_dim=1,
        n_d = 16,
        n_a = 16,
        n_steps = 2,
        gamma =1.4690246460970766,
        n_independent = 9,
        n_shared = 4,
        lambda_sparse = 0,
        optimizer_fn = Adam,
        optimizer_params = dict(lr = (0.0024907164557092944)),
        mask_type = "entmax",
        scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
        scheduler_fn = CosineAnnealingWarmRestarts,
        seed = 42,
        verbose = 10,
    )
    y = train['target']
    # Float sentinel: the column later receives float predictions, and
    # writing floats into an integer column relies on implicit upcasting.
    train['preds'] = -1000.0
    scores = defaultdict(list)

    for fold in range(args.folds):
        print(f"=====================fold: {fold}=====================")
        trn_ind, val_ind = train.fold!=fold, train.fold==fold
        print(f"train length: {trn_ind.sum()}, valid length: {val_ind.sum()}")
        X_train=train.loc[trn_ind, features].values
        y_train=y.loc[trn_ind].values.reshape(-1,1)
        X_val=train.loc[val_ind, features].values
        y_val=y.loc[val_ind].values.reshape(-1,1)

        clf =  TabNetRegressor(**tabnet_params)
        clf.fit(
          X_train, y_train,
          eval_set=[(X_val, y_val)],
          max_epochs = 355,
          patience = 50,
          batch_size = 1024*20,
          virtual_batch_size = 128*20,
          num_workers = args.workers,
          drop_last = False,
          )

        clf.save_model(f'TabNet_seed{args.seed}_{fold}')

        # Reuse the validation matrix built above and flatten the (n, 1)
        # predictions so they fit a single column.
        preds = clf.predict(X_val).ravel()
        train.loc[val_ind, "preds"] = preds

        scores["rmse"].append(rmse(y.loc[val_ind], preds))

        del X_train,X_val,y_train,y_val
        gc.collect()

    print(f"TabNet {args.folds} folds mean rmse: {np.mean(scores['rmse'])}")
    # Export every column whose name does not start with "f_".
    train.filter(regex=r"^(?!f_).*").to_csv("preds.csv", index=False)

## TabNet

In [None]:
# Training only happens outside inference mode.
if not args.INFER:
    run()
# The training frame and the leftover per-fold frame are no longer needed.
del df, train
gc.collect()

### Train Model

In [None]:
import os
import zipfile
 
def zipDir(dirpath, outFullName):
    """Zip every file under ``dirpath`` into the archive ``outFullName``.

    Files are stored under their path relative to ``dirpath``. Directories
    themselves are not added, so empty directories are omitted.

    Args:
        dirpath: Directory whose files are archived (walked recursively).
        outFullName: Path of the zip file to create or overwrite.
    """
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(outFullName, "w", zipfile.ZIP_DEFLATED) as archive:
        for path, _dirnames, filenames in os.walk(dirpath):
            for filename in filenames:
                full_path = os.path.join(path, filename)
                # relpath strips only the leading ``dirpath``; a plain string
                # replace would also remove later occurrences of that text.
                archive.write(full_path, os.path.relpath(full_path, dirpath))
    

# Zip the saved TabNet model of every fold. In inference mode the models come
# from the attached dataset; otherwise from the working directory. Both modes
# iterate over all configured folds rather than depending on whatever value
# ``fold`` happened to hold beforehand.
tabnet_dir = '../input/tabnetv1' if args.INFER else '.'
for fold in range(args.folds):
    input_path = f'{tabnet_dir}/TabNet_seed{args.seed}_{fold}'
    output_path = f"./fold{fold}.zip"
    zipDir(input_path, output_path)
# Constructor arguments for the TabNet regressor used to load saved models.
tabnet_params = {
    "cat_idxs": cat_idxs,
    "cat_emb_dim": 1,
    "n_d": 16,
    "n_a": 16,
    "n_steps": 2,
    "gamma": 1.4690246460970766,
    "n_independent": 9,
    "n_shared": 4,
    "lambda_sparse": 0,
    "optimizer_fn": Adam,
    "optimizer_params": {"lr": 0.0024907164557092944},
    "mask_type": "entmax",
    "scheduler_params": {
        "T_0": 200,
        "T_mult": 1,
        "eta_min": 1e-4,
        "last_epoch": -1,
        "verbose": False,
    },
    "scheduler_fn": CosineAnnealingWarmRestarts,
    "seed": 42,
    "verbose": 10,
}

import copy

# A single regressor instance is reused as the loader; each loaded state is
# deep-copied so that every fold ends up with its own independent model.
clf = TabNetRegressor(**tabnet_params)
models_tabnet = []
for fold in range(args.folds):
    clf.load_model(f"../input/baseline-tabnet/fold{fold}.zip")
    model = copy.deepcopy(clf)
    models_tabnet.append(model)

## Prepare Test Inference

In [None]:
def preprocess_test(investment_id, feature):
    """Pair the two model inputs with a constant ``0`` in the label slot."""
    model_inputs = (investment_id, feature)
    return model_inputs, 0

def make_test_dataset(feature, investment_id, batch_size=1024):
    """Build the batched inference dataset from features and investment ids.

    Args:
        feature: Feature rows, sliced along the first axis.
        investment_id: Investment ids aligned with ``feature``.
        batch_size: Number of rows per batch.

    Returns:
        A batched, cached and prefetched ``tf.data.Dataset`` whose elements
        are produced by ``preprocess_test``.
    """
    slices = tf.data.Dataset.from_tensor_slices((investment_id, feature))
    return (
        slices.map(preprocess_test)
        .batch(batch_size)
        .cache()
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

def make_test_dataset_lgbm(test_df,folds=5):
    """Return one copy of ``test_df`` per fold, each with its own ``cluster``.

    The 300 feature columns of ``test_df`` are scaled in place, then each
    fold's cluster model labels the rows. Every fold gets a separate frame:
    returning the same object ``folds`` times would leave all entries with
    the last fold's labels.

    Args:
        test_df: Frame with columns ``f_0`` .. ``f_299``; scaled in place.
        folds: Number of per-fold cluster models to apply.

    Returns:
        A list of ``folds`` DataFrames with an added ``cluster`` column.
    """
    features = [f"f_{i}" for i in range(300)]
    # NOTE(review): fit_transform re-fits the scaler on each test batch;
    # ``scaler`` is defined outside this view -- confirm whether a plain
    # transform with the training-time scaler is intended.
    test_df[features] = scaler.fit_transform(test_df[features])
    return [
        test_df.assign(cluster=kmodels[fold].predict(test_df[features]))
        for fold in range(folds)
    ]

def inference(models, ds):
    """Average the predictions of all ``models`` on ``ds``.

    Args:
        models: Iterable of objects exposing ``predict(ds)``.
        ds: Input passed unchanged to every model.

    Returns:
        The element-wise mean of the per-model predictions.
    """
    per_model = [member.predict(ds) for member in models]
    return np.mean(per_model, axis=0)

def inference_lgbm(models,ds,folds=5):
    """Average per-fold predictions, each made on that fold's own frame.

    Args:
        models: Sequence of models indexed by fold, exposing ``predict``.
        ds: Sequence of frames indexed by fold, each with the columns
            ``f_0`` .. ``f_299`` and ``cluster``.
        folds: Number of folds to evaluate.

    Returns:
        The element-wise mean of the per-fold predictions.
    """
    columns = [f"f_{i}" for i in range(300)] + ['cluster']
    fold_preds = []
    for fold in range(folds):
        fold_preds.append(models[fold].predict(ds[fold][columns]))
    return np.mean(np.stack(fold_preds), axis=0)

def inference_tabnet(models,test_df,args):
    """Average the TabNet fold models' predictions for ``test_df``.

    A ``time_id`` column is added to ``test_df`` in place, parsed from the
    digits before the underscore in ``row_id``. The models are fed the
    columns ``f_0`` .. ``f_299``, ``investment_id`` and ``time_id``, in that
    order.

    Args:
        models: Sequence of fitted models indexed by fold.
        test_df: Test frame with ``row_id``, ``investment_id`` and features.
        args: Namespace providing ``folds``.

    Returns:
        The element-wise mean of the per-fold predictions.
    """
    features = [f"f_{i}" for i in range(300)] + ["investment_id", "time_id"]
    # Extract time_id from row_id.
    test_df["time_id"] = test_df.row_id.str.extract(r"(\d+)_.*").astype(np.uint16)
    fold_preds = []
    for fold in range(args.folds):
        fold_preds.append(models[fold].predict(test_df[features].values))
    return np.mean(np.stack(fold_preds), axis=0)


In [None]:
import ubiquant

# Column names fed to the DNN; they are the same for every test batch.
features_dnn = [f'f_{i}' for i in range(300)]

env = ubiquant.make_env()
iter_test = env.iter_test()
for test_df, sample_prediction_df in iter_test:
    ds = make_test_dataset(test_df[features_dnn], test_df["investment_id"])

    tabnet_output = inference_tabnet(models_tabnet, test_df, args)
    dnn_output = inference(models, ds)

    # Blend: 85% DNN, 15% TabNet.
    final_output = dnn_output * 0.85 + tabnet_output * 0.15
    sample_prediction_df['target'] = final_output
    env.predict(sample_prediction_df)