# Ubiquant Market Prediction with cumsum dataset

This notebook assumes that the 300 provided features are actually asset returns of some investment. There are about 1500 investments, which could be different subfunds or different traders within some fund. The investments are independent of each other and should be trained separately; the best choice is to train them as separate models.
Thus I decided to combine the feature values with a cumulative sum (cumsum) grouped by investment_id. The investment targets were combined in the same way.
The tests showed an amazing correlation of 0.9, but in live scoring we ran into some issues that actually made this model a bad choice:
- some investments are completely missing in production, and some have as few as 2 trade activities.
- when we train the model with a loss function based on the Pearson correlation, the result has a very high RMSE, which makes calculating the next-step data problematic. Normalizing the targets helps, but not by much.
- because the data obviously becomes a time series, we cannot shuffle it and have to split it with `TimeSeriesSplit`. Because of that, the last fold sometimes fails to train.

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.python.keras import backend as K
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
import tensorflow_addons as tfa

from scipy import stats
from tqdm import tqdm
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, ShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit
from scipy import stats
from datetime import datetime
import timeit

# Turn on XLA JIT compilation in TensorFlow's graph optimizer.
tf.config.optimizer.set_jit(True)

# Disabled experiment: bfloat16 mixed-precision policy.
#from tensorflow.keras.mixed_precision import experimental as mixed_precision
#mixed_precision.set_policy('mixed_bfloat16')
#DATA_DTYPE = tf.bfloat16

# List the GPUs visible to TensorFlow (the value is only shown as cell output).
tf.config.list_physical_devices('GPU')

# Column names of the features (f_0 .. f_299), and the same list with the
# 'target' column appended for operations applied to both.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
features_target = features + ['target']

## Creating cumsum datasets

In [None]:
%%script echo skipping
# import original dataset for debug
# Reads the pickled training frame and previews its first rows.
# NOTE(review): the path suggests a half-precision copy of the competition data — confirm.

train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
train.head()

In [None]:
%%script echo skipping
# create cumsum dataset
# Replaces every feature column and the target with its running (cumulative)
# sum, computed separately for each investment_id in the frame's row order.

investments = train.investment_id.unique()
#investments = investments[investments<50]
#train_orig = train.copy()

# Accumulate in float32; the cast to float16 happens only after all sums are done.
train[features_target] = train[features_target].astype(np.float32)
for investment_id in tqdm(investments):
    # Select this investment's rows with a boolean mask and overwrite them with their cumulative sums.
    train.loc[train.investment_id==investment_id,features_target] = train[train.investment_id==investment_id][features_target].cumsum()

# alternative way of summing data but is less convenient than the simple loop, because there is no tqdm
# NOTE(review): as written, the alternative groups a column selection that does not
# include 'investment_id' — confirm it works before re-enabling it.
#train[features_target] = train[features_target].groupby('investment_id').apply(lambda group: group.astype(np.float32).cumsum()).astype(np.float16)

# Store the result in half precision and write it to a pickle file.
train[features_target] = train[features_target].astype(np.float16)
train.to_pickle('ubiquant-market-prediction-cumsum-f16.pkl')

In [None]:
%%script echo skipping
# create normalized targets
# Standardizes the target of each investment with that investment's own mean
# and standard deviation, then writes the normalized target column to a pickle.

# Index by investment_id so that the groupby and the .loc writes below can address an investment by label.
train.set_index('investment_id', inplace=True)
# Series of (mean, std) tuples, one per investment_id.
normalize_ds = train['target'].groupby('investment_id').apply(lambda group: (group.mean(), group.std()))

#train['target'] = train['target'].groupby('investment_id').apply(lambda group: (group-group.mean())/group.std())
for investment_id,gr in tqdm(train['target'].groupby('investment_id')):
    # NOTE(review): a group whose std is 0 or NaN would yield non-finite targets here — verify.
    train.loc[investment_id,'target'] = (gr-gr.mean())/gr.std()
    
# Restore investment_id as a regular column.
train.reset_index(inplace=True)

train = train.copy()
# Print the range of the normalized target.
print(train['target'].max(), train['target'].min())

train['target'].to_pickle('ubiquant-cumsum-nomalized-target.pkl')

In [None]:
%%script echo skipping
# create cumsum last row

def get_last_row(train):
    """Return the last row of every investment, indexed by investment_id.

    "Last" means last in the row order of ``train``. The previous
    implementation built the result by appending one row at a time with
    ``DataFrame.append`` (removed in pandas 2.0) and filtered the whole frame
    several times per investment; this version selects the rows in one pass.

    Args:
        train: DataFrame with at least ``investment_id`` and ``time_id``
            columns. It is not modified.

    Returns:
        A new DataFrame with one row per distinct ``investment_id``, indexed
        by ``investment_id`` (as int) and sorted by it. ``time_id`` is cast to
        int and all floating-point columns are widened to float64.
    """
    # keep='last' retains, for each investment_id, the row that occurs last in `train`.
    last_row = train.drop_duplicates(subset='investment_id', keep='last')

    # Widen float columns so that callers that keep adding to these rows
    # do not accumulate in a narrow dtype such as float16.
    dtypes = {
        column: np.float64
        for column in last_row.select_dtypes(include=[np.floating]).columns
    }
    dtypes['time_id'] = 'int'
    dtypes['investment_id'] = 'int'
    last_row = last_row.astype(dtypes)

    return last_row.set_index('investment_id').sort_index()

# Build the per-investment "last row" table from the whole frame, write it to a
# pickle and display it as the cell output.
#train_dev = train[train.time_id<500]
last_row = get_last_row(train)
last_row.to_pickle('ubiquant-cumsum-nomalized-lastrow.pkl')
last_row

## Loading cumsum datasets

In [None]:
# load cumsum dataset
# Reads the pickled cumulative-sum frame and previews the rows of time_id 1000.

train = pd.read_pickle('../input/ubiquant-cumsum-data-f16/ubiquant-market-prediction-cumsum-f16.pkl')
train[train.time_id==1000].head()

In [None]:
# Print the largest and smallest value of the loaded target column.
print('max: ', train['target'].max(), 'min: ', train['target'].min())

In [None]:
# the special case of an investment where only 2 activities exist
# Shows the time_id and f_5 columns of every row belonging to investment 1415.

#train.loc[pd.IndexSlice[:, 1415], 'f_5']
train[train.investment_id==1415][['time_id','f_5']]

In [None]:
# Print the number of distinct time_ids, then plot the target over time_id
# for investments with investment_id < 10 (one line per investment).
# The target is cast to float32 before pivoting and plotting.
print('time_id.unique:', len(train.time_id.unique()))
train[train.investment_id<10][['time_id','investment_id','target']].astype({'target':np.float32}).pivot(index='time_id',columns='investment_id',values='target').plot()

In [None]:
# load the normalized targets and the last row

# Per-investment mean/std of the target as loaded above, i.e. before it is
# replaced by the normalized version below. The submission loop uses these
# values to undo the normalization of the predictions.
train.set_index('investment_id', inplace=True)
target_stats = train['target'].groupby('investment_id').apply(lambda group: (group.mean(), group.std()))
# Build the table from the groupby result's own index. groupby returns the
# investments sorted by id, while train.index.unique() is in first-appearance
# order, so assigning the values positionally onto the latter could attach
# the statistics to the wrong investment_id.
normalize_ds = pd.DataFrame(target_stats.to_list(), index=target_stats.index, columns=['mean','std'])
train.reset_index(inplace=True)

# Swap in the pre-computed normalized target and load the last cumulative row of every investment.
train['target'] = pd.read_pickle('../input/ubiquant-cumsum-data-sources-2/ubiquant-cumsum-nomalized-target.pkl')
last_row = pd.read_pickle('../input/ubiquant-cumsum-data-sources-2/ubiquant-cumsum-nomalized-lastrow.pkl')

## Preprocessing and Modeling

In [None]:
# Columns used by the model code below: the investment id of every row
# (also the source of the embedding vocabulary) and the regression target.
investment_ids = train["investment_id"]
y = train["target"]

In [None]:
def get_model_rmse(investment_ids, num_features=300):
    """Build and compile the two-input regression network.

    The investment id is embedded through a vocabulary feature column and a
    small dense branch, the feature vector goes through its own dense branch,
    and the two branches are concatenated and regressed to a single value.

    Args:
        investment_ids: object with a ``.unique()`` method (e.g. a pandas
            Series) holding the investment ids; its unique values form the
            embedding vocabulary.
        num_features: width of the feature input. Defaults to 300.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[investment_id, features]`` and
        producing one output per sample. It is trained on a per-batch RMSE
        loss and reports the ``rmse`` and ``pearson_r`` metrics.
    """
    vocabulary = list(investment_ids.unique())

    def rmse_loss(y_true, y_pred):
        # A plain differentiable function is used as the loss: a
        # keras.metrics.* object keeps running state across batches and is
        # meant for reporting, not for optimization.
        y_true = tf.cast(y_true, y_pred.dtype)
        return tf.sqrt(tf.reduce_mean(tf.math.squared_difference(y_pred, y_true)))

    # Investment-id branch: vocabulary lookup -> 64-d embedding -> dense.
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    fc = tf.feature_column.categorical_column_with_vocabulary_list(key='investment_id', vocabulary_list=vocabulary)
    fc = tf.feature_column.embedding_column(categorical_column=fc, dimension=64)
    investment_id_x = tf.keras.layers.DenseFeatures([fc])({'investment_id':investment_id_inputs})
    investment_id_x = layers.Reshape((-1, ))(investment_id_x)
    investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
    investment_id_x = layers.Dropout(0.2)(investment_id_x)

    # Feature branch: batch norm and Gaussian noise in front of two dense layers.
    features_inputs = tf.keras.Input((num_features, ), dtype=tf.float16)
    feature_x = layers.BatchNormalization()(features_inputs)
    feature_x = layers.GaussianNoise(0.1)(feature_x)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    feature_x = layers.Dropout(0.2)(feature_x)

    # Joint head: L2-regularized dense stack down to a single output.
    x = layers.Concatenate(axis=1)([investment_id_x, feature_x])
    x = layers.Dense(128, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(64, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(32, activation='swish', kernel_regularizer="l2")(x)
    output = layers.Dense(1)(x)

    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(0.01), loss=rmse_loss, metrics=[rmse, pearson_r])
    return model

def preprocess(X, y):
    """Identity mapping applied to every (inputs, label) element of the training dataset."""
    sample = (X, y)
    return sample

def make_dataset(feature, investment_id, y, batch_size, mode="train"):
    """Wrap features, investment ids and targets into a batched tf.data pipeline.

    Each element is ``((investment_id, feature), y)``. Elements are passed
    through ``preprocess``, batched, cached and prefetched; the data is not
    shuffled. ``mode`` is accepted but not used.
    """
    model_inputs = (investment_id, feature)
    samples = tf.data.Dataset.from_tensor_slices((model_inputs, y)).map(preprocess)
    batches = samples.batch(batch_size)
    return batches.cache().prefetch(tf.data.experimental.AUTOTUNE)

def preprocess_test(investment_id, feature):
    """Pair the two model inputs and attach a constant 0 as a placeholder label."""
    model_inputs = (investment_id, feature)
    placeholder_label = 0
    return model_inputs, placeholder_label

def make_test_dataset(feature, investment_id, batch_size):
    """Build the batched inference pipeline from features and investment ids.

    Elements are ``(investment_id, feature)`` pairs, which ``preprocess_test``
    turns into ``((investment_id, feature), 0)`` before batching, caching and
    prefetching.
    """
    model_inputs = (investment_id, feature)
    samples = tf.data.Dataset.from_tensor_slices(model_inputs).map(preprocess_test)
    batches = samples.batch(batch_size)
    return batches.cache().prefetch(tf.data.experimental.AUTOTUNE)

def inference(models, ds):
    """Average the predictions of several models on the same dataset.

    Calls ``predict(ds)`` on every model in order and returns the element-wise
    mean of the results.
    """
    per_model_predictions = [member.predict(ds) for member in models]
    return np.mean(per_model_predictions, axis=0)

def pearson_r(x, y):
    """Negative Pearson correlation of two batches, for use as a Keras metric or loss.

    The coefficient is computed across the batch axis (axis 0), one value per
    trailing column. The sign is flipped so that a better correlation gives a
    smaller value, which is why callbacks monitor this metric with
    ``mode='min'``.

    The previous version reduced over axis 1 without ``keepdims``; for
    (batch, 1) tensors that broadcasts the per-row "means" into a
    (batch, batch) matrix and does not produce the Pearson coefficient.

    Args:
        x: tensor-like of targets; assumed to be (batch, 1) as passed by
            Keras for this model — TODO confirm.
        y: tensor-like of predictions with the same shape; cast to the dtype
            of ``x``.

    Returns:
        Tensor with the negated correlation per column. A batch with zero
        variance in either input yields 0 instead of NaN.
    """
    axis = 0
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)

    # keepdims=True keeps the means broadcastable against the inputs.
    xmean = tf.reduce_sum(x, axis=axis, keepdims=True) / n
    ymean = tf.reduce_sum(y, axis=axis, keepdims=True) / n
    xdev = x - xmean
    ydev = y - ymean

    xvar = tf.reduce_sum(tf.square(xdev), axis=axis)
    yvar = tf.reduce_sum(tf.square(ydev), axis=axis)
    cov = tf.reduce_sum(xdev * ydev, axis=axis)

    # divide_no_nan returns 0 where the denominator is 0 (constant input),
    # so a degenerate batch cannot turn the monitored metric into NaN.
    corr = tf.math.divide_no_nan(cov, tf.sqrt(xvar * yvar))
    return -corr

In [None]:
%%script echo skipping
# train model with sample data - DEBUG
# Replays time_ids 500-599 of the original (non-cumulative) data through the
# same update / predict / de-normalize loop as the submission cell and prints
# Pearson correlations against the real targets.
# NOTE(review): relies on `train_orig`, `models` and `BATCH_SIZE`, none of which
# is defined by an active cell above this one — make sure they exist before running.

res_compare = {}
res_real, res_pred = [], []

# .copy() so that the in-place index changes below act on an independent frame, not on a slice of `train`.
dev_ds = train[(train.time_id<500)&(train.investment_id<100)].copy()
dev_ds.set_index('investment_id', inplace=True)
target_stats = dev_ds['target'].groupby('investment_id').apply(lambda group: (group.mean(), group.std()))
# Use the groupby result's own index: it is sorted by investment_id, whereas
# dev_ds.index.unique() is in first-appearance order, so a positional
# assignment could attach the statistics to the wrong investment.
normalize_ds = pd.DataFrame(target_stats.to_list(), index=target_stats.index, columns=['mean','std'])
dev_ds.reset_index(inplace=True)

last_row = get_last_row(dev_ds)
iter_test = train_orig[(train_orig.time_id>=500)&(train_orig.time_id<600)&(train_orig.investment_id<100)].copy()

iter_test['row_id'] = iter_test.index
iter_test['time_id'] = iter_test['time_id'].astype('int')
iter_test['investment_id'] = iter_test['investment_id'].astype('int')
# One frame per time_id, mimicking the batches delivered by the competition API.
iter_test = [frame for _, frame in iter_test.groupby('time_id')]

for i, test_df in tqdm(enumerate(iter_test)):
    test_df = test_df.set_index('investment_id').sort_index()
    pred_df = pd.DataFrame(index=test_df.index, data={'target':0.0, 'row_id':test_df.row_id})

    # Add the new feature deltas to the running cumulative row of every known investment.
    test_investment_ids = test_df.index.values
    intersect = list(set(last_row.index) & set(test_df.index))
    last_row.loc[intersect,features] = last_row.loc[intersect,features] + test_df.loc[intersect,features]

    # Investments seen for the first time start their cumulative row from this
    # batch and get neutral normalization statistics.
    new_investments = test_df[~test_df.index.isin(last_row.index)]
    if new_investments.shape[0] > 0:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        last_row = pd.concat([last_row, new_investments])
        new_stats = pd.DataFrame(index=new_investments.index, data={'mean':0, 'std':1})
        normalize_ds = pd.concat([normalize_ds, new_stats])

    ds = make_test_dataset(last_row.loc[test_investment_ids, features].values, last_row.loc[test_investment_ids].index.values, BATCH_SIZE)
    y_pred = inference(models, ds)
    # ravel() rather than squeeze(): a one-row batch must still give a 1-D array.
    y_pred = pd.Series(index=test_investment_ids, data=np.ravel(y_pred))

    for _, row in test_df.iterrows():
        if pred_df[pred_df.row_id==row.row_id].shape[0] == 0:
            continue

        # Undo the per-investment normalization to get the absolute (cumulative) target ...
        y_pred.loc[row.name] = y_pred.loc[row.name] * normalize_ds.loc[row.name, 'std'] + normalize_ds.loc[row.name, 'mean']
        # ... then report its change against the previous cumulative target and remember the new level.
        pred_df.loc[pred_df.row_id==row.row_id, 'target'] = y_pred.loc[row.name] - last_row.loc[row.name,'target']
        last_row.loc[row.name,'target'] = y_pred.loc[row.name]

    # Collect real vs. predicted targets, overall and per investment.
    res_real = res_real + test_df['target'].values.tolist()
    res_pred = res_pred + pred_df['target'].values.tolist()
    for _, row in test_df.iterrows():
        investment_id = int(row.name)
        if investment_id not in res_compare:
            res_compare[investment_id] = {'real':[], 'pred':[]}
        res_compare[investment_id]['real'].append(row.target)
        res_compare[investment_id]['pred'].append(pred_df.loc[investment_id].values[0])


# Correlation per investment (only those with more than 2 observations, NaNs ignored) and over all rows.
res = [stats.pearsonr(res_compare[i]['real'], res_compare[i]['pred'])[0] for i in res_compare if len(res_compare[i]['real'])>2]
print('pearson by investment: ', np.array(res)[~np.isnan(res)].mean())
print('pearson total: ', stats.pearsonr(res_real, res_pred)[0])

In [None]:
%%script echo skipping
# training the models
# Trains one model per TimeSeriesSplit fold, restores the best checkpoint of
# each fold, prints validation Pearson/RMSE and keeps the models in `models`.

BATCH_SIZE = 512
dev_ds = train
models = []
kfold = TimeSeriesSplit(n_splits=5) 
#kfold = StratifiedKFold(5, random_state=42)
for index, (train_indices, valid_indices) in enumerate(kfold.split(dev_ds, investment_ids)):
    #if index < 4: continue
        
    # NOTE(review): .loc is fed the positional indices from the splitter; this
    # presumably relies on `train` having a default 0..n-1 index — confirm.
    X_train, X_val = dev_ds.loc[train_indices, features], dev_ds.loc[valid_indices, features]
    y_train, y_val = y.loc[train_indices], y.loc[valid_indices]
    investment_id_train, investment_id_val = investment_ids.loc[train_indices], investment_ids.loc[valid_indices]
    
    train_ds = make_dataset(X_train, investment_id_train, y_train, BATCH_SIZE)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, BATCH_SIZE, mode="valid")
    # The vocabulary is built from all investment ids, not only those of this fold.
    model = get_model_rmse(investment_ids)
    
    # pearson_r returns the negated correlation, hence mode='min' in all three callbacks.
    reduce_lr  = keras.callbacks.ReduceLROnPlateau(monitor='val_pearson_r', factor=0.25, min_lr=0.0001, patience=5, mode='min', verbose=1)
    checkpoint = keras.callbacks.ModelCheckpoint(f"model_{index}.h5", monitor='val_pearson_r', mode='min', save_best_only=True, save_weights_only=True)
    early_stop = keras.callbacks.EarlyStopping(monitor="val_pearson_r", mode='min', patience=7)
    history = model.fit(train_ds, epochs=50, validation_data=valid_ds, callbacks=[reduce_lr, checkpoint, early_stop])
    #plt.plot(np.array(history.history['val_pearson_r'])-history.history['val_pearson_r'][0], label='Val Metrics')
    #plt.plot(np.array(history.history['val_loss'])-history.history['val_loss'][0], label='Val Loss')
    #plt.legend(loc="upper right")
    #plt.show()

    # Go back to the best checkpoint written during fit before keeping the model.
    model.load_weights(f"model_{index}.h5")#, custom_objects={"pearson_r": pearson_r})
    models.append(model)
    
    # Report the fold's validation quality on the restored weights.
    y_pred = model.predict(valid_ds).ravel()
    print(y_pred[:10], y_pred[-10:])
    print(y_val.values[:10], y_val.values[-10:])
    print('Pearson:', stats.pearsonr(y_pred, y_val.values)[0])
    print('RMSE:', np.sqrt(mean_squared_error(y_pred, y_val.values)))
    
    # Drop the per-fold data and clear Keras state before the next fold to limit memory use.
    del investment_id_train
    del investment_id_val
    del X_train
    del X_val
    del y_train
    del y_val
    del train_ds
    del valid_ds
    tf.keras.backend.clear_session()
    gc.collect()
    #break

## Submission

In [None]:
# Rebuild one network per saved fold and restore its trained weights.
BATCH_SIZE = 512

# Weight files of the 4-fold run (an earlier variant of this cell pointed at
# "../input/ubiquant-5folds-cumsum/" instead).
weight_files = [f"../input/ubiquant-4folds-cumsum/model_{index}.h5" for index in range(4)]

models = []
for weight_file in weight_files:
    model = get_model_rmse(investment_ids)
    model.load_weights(weight_file)
    models.append(model)

In [None]:
import ubiquant
import sys
env = ubiquant.make_env()
iter_test = env.iter_test() 

# For every batch from the competition API: extend the cumulative feature rows,
# predict the absolute (cumulative) target, and submit its change since the
# previous step as the prediction.
for i, (test_df, pred_df) in enumerate(iter_test):
    test_df = test_df.set_index('investment_id').sort_index()

    # update last row with new data
    test_investment_ids = test_df.index.values
    intersect = list(set(last_row.index) & set(test_df.index))
    last_row.loc[intersect,features] = last_row.loc[intersect,features] + test_df.loc[intersect,features]
    new_investments = test_df[~test_df.index.isin(last_row.index)]

    # add a last row for investment_ids that did not exist yet; they get
    # neutral normalization statistics (mean 0, std 1)
    if new_investments.shape[0] > 0:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        last_row = pd.concat([last_row, new_investments])
        new_stats = pd.DataFrame(index=new_investments.index, data={'mean':0, 'std':1})
        normalize_ds = pd.concat([normalize_ds, new_stats])

    # predict new values based on the existing "last row".
    # here we obtain the absolute values and not the deltas that we have to submit
    ds = make_test_dataset(last_row.loc[test_investment_ids, features].values, last_row.loc[test_investment_ids].index.values, BATCH_SIZE)
    y_pred = inference(models, ds)
    # ravel() rather than squeeze(): a one-row batch must still give a 1-D array.
    y_pred = pd.Series(index=test_investment_ids, data=np.ravel(y_pred))

    for _, row in test_df.iterrows():
        if pred_df[pred_df.row_id==row.row_id].shape[0] == 0:
            continue

        # reverse-normalize the predicted value
        y_pred.loc[row.name] = y_pred.loc[row.name] * normalize_ds.loc[row.name, 'std'] + normalize_ds.loc[row.name, 'mean']

        # calculate the target delta, because the model predicts the absolute value
        pred_df.loc[pred_df.row_id==row.row_id, 'target'] = y_pred.loc[row.name] - last_row.loc[row.name,'target']
        last_row.loc[row.name,'target'] = y_pred.loc[row.name]

    # Rows without a usable prediction (NaN) are submitted as 0.
    env.predict(pred_df.fillna(0))