# References

https://www.kaggle.com/code/valleyzw/ubiquant-lgbm-baseline/notebook#Stock-market-calendar-analysis:-discussion

[Which CV strategy should I use? Need help. (Discussion)](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302429)

https://www.kaggle.com/code/freaxmind/efficient-fold-creation-using-python-slices

[How (and why) to create a good validation set (Fast.ai)](https://www.fast.ai/2017/11/13/validation-sets/)

From the Q&A: the investment_ids in the train data, the public leaderboard, and the private leaderboard are not the same. Some appear only in the train data, some only in the public leaderboard, and some only in the private leaderboard.

Some takeaways from all the above:

* Split the training data into two groups by date, e.g. 950 training and 250 validation (similar to public leaderboard) for a total of 1200
* Train on the larger group using a cross-validation scheme that holds out some of the assets, so that those assets are new, unseen data until validation


References on dealing with outliers:

https://www.kaggle.com/code/junjitakeshima/ubiquant-simple-lgbm-removing-outliers-en-jp/notebook#(6)-Build-model

https://www.kaggle.com/code/valleyzw/ubiquant-lgbm-baseline/notebook#Target-analysis (Targets and outliers from this notebook)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
        
from tqdm import tqdm
import gc
from sklearn.model_selection import train_test_split, GroupKFold

# Metric for evaluation

Submissions are evaluated on the mean of the Pearson correlation coefficient for each time ID.

In [None]:
# using TensorFlow
def tf_correlation(x, y, axis=-2):
    """Loss derived from the Pearson correlation of two tensors over ``axis``.

    The value returned is ``1 - corr`` (not the correlation itself), where
    ``corr`` is the Pearson correlation coefficient of ``x`` and ``y``
    computed along ``axis``, so minimising it maximises the correlation.

    Args:
        x: Tensor-like input; converted with ``tf.convert_to_tensor``.
        y: Tensor-like input; cast to the dtype of ``x``.
        axis: Axis along which the statistics are reduced (default -2).

    Returns:
        A tensor of dtype ``x.dtype`` with ``axis`` reduced away.
        There is no guard against zero variance: a constant ``x`` or ``y``
        along ``axis`` yields a division by zero.
    """
    x = tf.convert_to_tensor(x)
    y = tf.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)

    # keepdims=True so the means broadcast against x / y whatever the axis is.
    xmean = tf.reduce_sum(x, axis=axis, keepdims=True) / n
    ymean = tf.reduce_sum(y, axis=axis, keepdims=True) / n
    xdev = x - xmean
    ydev = y - ymean

    # Unnormalised (co)variances: the 1/n factors cancel in the ratio below.
    xvar = tf.reduce_sum(tf.square(xdev), axis=axis)
    yvar = tf.reduce_sum(tf.square(ydev), axis=axis)
    cov = tf.reduce_sum(xdev * ydev, axis=axis)

    corr = cov / tf.sqrt(xvar * yvar)
    return tf.constant(1.0, dtype=x.dtype) - corr

In [None]:
# Pearson correlation computed by hand with numpy in two equivalent ways,
# then compared against np.corrcoef.
X = np.array([0.5, -0.3, 0, -1])
Y = np.array([0.1, -0.5, 0.1, -0.9])

n = len(X)
xmean = X.sum() / n
ymean = Y.sum() / n

# Deviations from the mean, shared by both formulations below.
x_centered = X - xmean
y_centered = Y - ymean
cov = (x_centered * y_centered).sum()

# Form 1: divide by the product of the two root sums of squares.
# (These are not divided by n; the 1/n factors cancel in the ratio.)
xstd = np.sqrt((x_centered**2).sum())
ystd = np.sqrt((y_centered**2).sum())
corr = cov / (xstd * ystd)
print(corr)

# Form 2: divide by the square root of the product of the sums of squares.
xvar = (x_centered**2).sum()
yvar = (y_centered**2).sum()
corr = cov / np.sqrt(xvar * yvar)
print(corr)

# Reference value from numpy (2x2 correlation matrix).
print(np.corrcoef(X, Y))

In [None]:
# From scipy 
from scipy.stats import pearsonr

# https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302480
def pearson_correlation_coefficient(y_true, y_pred):
    """
    Pearson correlation as a custom evaluation metric for LGBMRegressor.

    Returns a 3-tuple: the metric name, the Pearson correlation between
    ``y_true`` and ``y_pred`` (first element of ``pearsonr``'s result),
    and ``True`` to signal that higher values are better.
    """
    metric_name = 'pearson_corr_coeff'
    correlation = pearsonr(y_true, y_pred)[0]
    higher_is_better = True
    return metric_name, correlation, higher_is_better


pearsonr(X, Y)

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min/max
      (bounds inclusive);
    * float columns are cast to float16 when min/max lie strictly inside the
      float16 range, else float32 under the same test, else float64.
      NOTE: float16/float32 downcasting only checks the range, so it can
      lose precision;
    * every other dtype (bool, datetime, extension dtypes, ...) is left
      untouched.

    Memory usage before and after is printed.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.

    Returns:
        The same DataFrame object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy int / uint / float columns are downcast; anything
        # else has no meaningful numeric min/max to compare against.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
            continue

        # Integers: keep the signedness so unsigned ids are never routed
        # through a float type, and pick the first type wide enough.
        candidates = unsigned_types if col_type.kind == 'u' else signed_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def read_dataset(path='/kaggle/input/ubiquant-dataset-compressed/ubiquant_dataset_compressed.parquet'):
    """Load the competition data from a parquet file into a DataFrame.

    Args:
        path: Location of the parquet file. Defaults to the compressed
            Ubiquant dataset path used on Kaggle.

    Returns:
        The DataFrame returned by ``pd.read_parquet``.
    """
    return pd.read_parquet(path)

In [None]:
df = read_dataset()

In [None]:
df.info()

In [None]:
# # TODO: temporary reduce size while developing: iterate faster and avoid OOM errors
# df = df[:1_000_000]
# gc.collect()

In [None]:
df = reduce_mem_usage(df)
gc.collect()

In [None]:
df.info()

# XGBoost model

In [None]:
import xgboost as xgb

## 99/1 split

In [None]:
# Build the train / validation split for XGBoost.
# Identifier columns are dropped in place; X then keeps every remaining
# column except the target.
df.drop(['row_id', 'time_id'], axis=1, inplace=True)
X = df.drop(['target'], axis=1)
y = df["target"]
# Release the full frame as soon as X / y exist to keep peak memory down.
# NOTE(review): later cells (K-fold section, second 99/1 split) use `df` and
# `df.time_id` again, so the dataset must be re-read before running them.
del df
# shuffle=False keeps the row order, so the last 1% of rows form the validation set.
# NOTE(review): random_state has no effect when shuffle=False.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.01, random_state=42, shuffle=False)
del X
del y

In [None]:
# split_point = int(0.99 * len(df))
# train_df = df[:split_point]
# valid_df = df[split_point:]

# y_train = train_df.target
# X_train = train_df.iloc[:, 4:]
# y_valid = valid_df.target
# X_valid = valid_df.iloc[:, 4:]

In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=12,
    subsample=0.9,
    colsample_bytree=0.7,
    tree_method='gpu_hist'  
    )

In [None]:
model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_valid, y_valid)], verbose=1)

In [None]:
model.save_model('xgboost1')

## K-fold splits

Based on the exploration part, we use GroupKFold with `time_id` as the group.

In [None]:
features_columms = list(df.filter(like='f_').columns)

In [None]:
kfold = GroupKFold(5)

In [None]:
for fold, (trn_ind, val_ind) in enumerate(kfold.split(df, groups=df.time_id)):
    print(fold, trn_ind, val_ind)
    break

In [None]:
model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=12,
    subsample=0.9,
    colsample_bytree=0.7,
#     tree_method='gpu_hist'
    )

In [None]:
# GroupKFold.split yields positional row indices, so rows are selected with
# iloc; label-based loc only gives the same rows while the index is 0..n-1.
feature_pos = df.columns.get_indexer(features_columms)
target_pos = df.columns.get_loc('target')
model.fit(df.iloc[trn_ind, feature_pos], df.iloc[trn_ind, target_pos],
          early_stopping_rounds=3,
          eval_set=[(df.iloc[val_ind, feature_pos], df.iloc[val_ind, target_pos])],
          verbose=1)

In [None]:
# val_ind holds positional indices from GroupKFold.split, hence iloc.
valid_rows = df.iloc[val_ind]
preds = model.predict(np.float32(valid_rows[features_columms]))
targets = valid_rows['target']

In [None]:
pearsonr(targets, preds)[0]

# Neural networks

The best models come from this notebook:
https://www.kaggle.com/code/diedioskuren/ubiquant-more-models-ensemble


It is commented that NN models are working better for this competition than gradient boosted models, and that the scope of the competition is not about time series models:

> [xiaowucen](https://www.kaggle.com/competitions/ubiquant-market-prediction/discussion/314690): Because many sequence-related 'factors' should have been constructed in the features provided by Ubiquant,they've done most of the work a Quant needs to do, and focused on finding algorithmic talent

> [Why MLP get the best scores?](https://www.kaggle.com/competitions/ubiquant-market-prediction/discussion/314419). Some references for the above:
> * https://towardsdatascience.com/deep-learning-in-finance-9e088cb17c03

> [Stocks Mapping Dataframe, Topic author says:](https://www.kaggle.com/competitions/ubiquant-market-prediction/discussion/315560) The design of this competition has been to purposefully stunt development of effective time series algorithms (poor API development allowing for lag exploit, poor time_id sampling to kill all memory models, irregular investment_id selection) for algorithmic brilliance.

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [None]:
# TF Dataset
def preprocess(X, y):
    """Identity hook for the tf.data pipeline: returns ``(X, y)`` unchanged."""
    sample = (X, y)
    return sample

def make_dataset(features, y, batch_size=1024, mode="train", shuffle_buffer=256):
    """Build a batched ``tf.data.Dataset`` from in-memory features and targets.

    Args:
        features: Feature rows, sliced along the first dimension.
        y: Targets, aligned with ``features``.
        batch_size: Number of samples per batch.
        mode: ``"train"`` enables shuffling; any other value keeps the order.
        shuffle_buffer: Size of the shuffle buffer used in train mode.

    Returns:
        A dataset yielding ``(features, y)`` batches, with prefetching.
    """
    ds = tf.data.Dataset.from_tensor_slices((features, y))
    ds = ds.map(preprocess)
    # Cache before shuffling: a cache placed after shuffle() replays the
    # first epoch's order on every later epoch.
    ds = ds.cache()
    if mode == "train":
        ds = ds.shuffle(shuffle_buffer)
    ds = ds.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    return ds

In [None]:
def get_model():
    """Build and compile the MLP regressor.

    Architecture: a 300-wide float16 input, five Dense(256, swish) layers,
    Dropout(0.65), then Dense layers of 128, 32 and 32 units (swish, L2
    kernel regularisation), Dropout(0.85) and a single linear output unit.
    Compiled with Adam(0.001), MSE loss and MSE / MAE / RMSE metrics.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)

    # Feature trunk.
    hidden = features_inputs
    for _ in range(5):
        hidden = layers.Dense(256, activation='swish')(hidden)
    hidden = layers.Dropout(0.65)(hidden)

    # Regularised head.
    for units in (128, 32, 32):
        hidden = layers.Dense(units, activation='swish', kernel_regularizer="l2")(hidden)
    hidden = layers.Dropout(0.85)(hidden)
    output = layers.Dense(1)(hidden)

    model = tf.keras.Model(inputs=[features_inputs], outputs=[output])
    model.compile(
        optimizer=tf.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mse', "mae", keras.metrics.RootMeanSquaredError(name="rmse")],
    )

    return model

In [None]:
model = get_model()

In [None]:
keras.utils.plot_model(model, show_shapes=True)

## 99/1 split

In [None]:
# Build the train / validation split for the Keras model.
# Identifier columns are dropped in place; X then keeps every remaining
# column except the target.
# NOTE(review): an earlier cell already deletes `df`, so the dataset must be
# re-read before this cell can run.
df.drop(['row_id', 'time_id'], axis=1, inplace=True)
X = df.drop(['target'], axis=1)
y = df["target"]
# Release the full frame as soon as X / y exist to keep peak memory down.
del df
# shuffle=False keeps the row order, so the last 1% of rows form the validation set.
# NOTE(review): random_state has no effect when shuffle=False.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.01, random_state=42, shuffle=False)
del X
del y
gc.collect()

In [None]:
features_columms = list(X_train.filter(like='f_').columns)

In [None]:
train_ds = make_dataset(X_train[features_columms], y_train)

In [None]:
valid_ds = make_dataset(X_valid[features_columms], y_valid, mode="valid")

In [None]:
checkpoint = keras.callbacks.ModelCheckpoint(f"keras_model_0", save_best_only=True)
early_stop = keras.callbacks.EarlyStopping(patience=10)
history = model.fit(train_ds, 
                    epochs=30, 
                    validation_data=valid_ds, 
                    callbacks=[checkpoint, early_stop])

In [None]:
pearsonr(model.predict(valid_ds).ravel(), y_valid.values)[0]

In [None]:
!zip -r keras_model_0.zip keras_model_0

# Other work

## Splitting the data by each investment

In [None]:
# One sub-frame per investment_id.
# A single groupby pass avoids a full boolean scan of df per investment;
# sort=False keeps the keys in order of first appearance.
dict_dfs = dict(tqdm(df.groupby('investment_id', sort=False)))


In [None]:
list(dict_dfs.keys())[:10]

In [None]:
df1 = dict_dfs[1]

In [None]:
df1.target

In [None]:
df1_target_shift = df1.target.shift(1)
df1_target_shift

In [None]:
df1

In [None]:
df1.drop(columns=['row_id', 'time_id', 'investment_id', 'target'], inplace=True)
df1

In [None]:
T = 100
samples = 200

# Sliding windows of T consecutive rows of df1, each shifted by one row:
# df1[0:100]
# df1[1:101]
# ..
# one window per start row, `samples` windows in total

window_shape = (samples, T, 300)
X_df1 = np.empty(window_shape, dtype=float)

for start in range(samples):
    stop = start + T
    X_df1[start] = df1[start:stop]

In [None]:
# Let's take 10 investments.
# Every investment needs at least samples + T - 1 rows to fill all windows,
# so only investments with enough history are selected.
min_rows = samples + T - 1
non_feature_cols = ['row_id', 'time_id', 'investment_id', 'target']
selected_ids = [ix for ix, frame in dict_dfs.items() if len(frame) >= min_rows][:10]

X_array = np.empty((len(selected_ids), samples, T, 300), dtype=float)

for i, ix in enumerate(selected_ids):
    # Windows are built from this investment's own rows, not from df1.
    # errors='ignore' because a frame may already have had these columns dropped.
    asset_features = (
        dict_dfs[ix]
        .drop(columns=non_feature_cols, errors='ignore')
        .to_numpy(dtype=float)
    )
    for n in range(samples):
        X_array[i, n] = asset_features[n:n + T]

# This eats about 600 MB. It is not scalable. A custom training loop could be used instead.

In [None]:
X_df1[1]

# Try a TPU training

In [None]:
# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

### For using private datasets with TPU

https://www.kaggle.com/product-feedback/163416


Reading files and tutorials:

https://www.kaggle.com/docs/tpu

https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu/

In [None]:
# Only run this if using a private dataset
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

In [None]:
from kaggle_datasets import KaggleDatasets
GCS_DS_PATH = KaggleDatasets().get_gcs_path('ubiquant-dataset-compressed')

In [None]:
GCS_DS_PATH

In [None]:
# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():
    model = tf.keras.Sequential(layers=[
    layers.Dense(2500, input_shape=(300,), activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1400, activation='relu'),
    layers.Dense(800, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(500, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(500, 
                 activation='relu', 
                 activity_regularizer=tf.keras.regularizers.l1(0.01)),
    layers.Dense(1)
    ])

    model.compile(loss='mse', 
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0015),
              metrics=["mse"])
    


**`Steps_per_epoch`**:

https://datascience.stackexchange.com/questions/47405/what-to-set-in-steps-per-epoch-in-keras-fit-generator/87289#87289?s=f6c6de13fc6743f28546360a02a6299a 

In [None]:


# train model normally
# model.fit(training_dataset, epochs=5, steps_per_epoch=32)

# NOTE(review): check_point_callback, early_stop_callback, X_test and y_test are
# not defined in any visible cell of this notebook (the earlier Keras cell uses
# `checkpoint` / `early_stop` and `X_valid` / `y_valid`) — confirm before running.
# NOTE(review): the TPU model above declares input_shape=(300,); verify that
# X_train holds exactly the 300 feature columns when this cell runs.
model.fit(X_train, y_train, 
          epochs=5, 
          callbacks=[check_point_callback, early_stop_callback],
          validation_data=(X_test, y_test),
          steps_per_epoch=3907,
         )

In [None]:
BATCH_SIZE = 16 * tpu_strategy.num_replicas_in_sync

In [None]:
tpu_strategy.num_replicas_in_sync

# A CNN

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import regularizers

import numpy as np
import pandas as pd

import gc

In [None]:
def read_dataset(path='/kaggle/input/ubiquant-dataset-compressed/ubiquant_dataset_compressed.parquet'):
    """Load the competition data from a parquet file into a DataFrame.

    Args:
        path: Location of the parquet file. Defaults to the compressed
            Ubiquant dataset path used on Kaggle.

    Returns:
        The DataFrame returned by ``pd.read_parquet``.
    """
    return pd.read_parquet(path)

In [None]:
df = read_dataset()

In [None]:
df

In [None]:
df.drop(columns=['row_id'], inplace=True)
df

In [None]:
gc.collect()

In [None]:
df.groupby(df.investment_id).size().sort_values()[0:50].index

In [None]:
df.groupby(df.investment_id).size().sort_values(ascending=False)[2500:2510]

Around 800 time steps minimum for the top 2500 investments

In [None]:
top_2500_assets_ids = df.groupby(df.investment_id).size().sort_values(ascending=False)[:2500].index
# top_2500_assets = df.groupby(df.investment_id).size().sort_values(ascending=False)[:2500]

In [None]:
df_top_2500_assets = df[df.investment_id.isin(top_2500_assets_ids)]
df_top_2500_assets

In [None]:
del df

In [None]:
gc.collect()

In [None]:
df_top_2500_assets.info()

In [None]:
#train_df = df_top_2500_assets.groupby('investment_id').apply(lambda x: x.sample(n=600)).reset_index(drop = True)
#train_df

In [None]:
train_df = df_top_2500_assets.groupby('investment_id').apply(lambda x: x.iloc[0:600]).reset_index(drop = True)
train_df

In [None]:
train_df

In [None]:
train_df.drop(columns=['time_id', 'investment_id'], inplace = True)

In [None]:
train_df

In [None]:
train_np = train_df.to_numpy()

In [None]:
# 2500 samples, 600 time steps, 301 dimensions
train_np = train_np.reshape(2500, 600, 301)

In [None]:
train_np[0][0][0], train_np[0][1][0]

In [None]:
y_train = train_np[:, 599, 0]

In [None]:
X_train = train_np[:, 0:599, :]

In [None]:
X_train.shape, y_train.shape, train_np.shape

In [None]:
def get_CNN_model():
    """Build the (uncompiled) 1D-CNN regressor.

    Input shape is (599, 301). The network stacks Conv1D(32, 5) ->
    MaxPooling1D(3) -> Conv1D(64, 20) -> MaxPooling1D(2) -> Conv1D(128, 10)
    -> GlobalMaxPooling1D -> Dense(1), with ReLU on the convolutions.

    Returns:
        A ``tf.keras.models.Model`` mapping the input to one output value.
    """
    keras_layers = tf.keras.layers

    inputs = keras_layers.Input(shape=(599, 301))
    features = keras_layers.Conv1D(32, 5, activation='relu')(inputs)
    features = keras_layers.MaxPooling1D(3)(features)
    features = keras_layers.Conv1D(64, 20, activation='relu')(features)
    features = keras_layers.MaxPooling1D(2)(features)
    features = keras_layers.Conv1D(128, 10, activation='relu')(features)
    features = keras_layers.GlobalMaxPooling1D()(features)
    output = keras_layers.Dense(1)(features)

    return tf.keras.models.Model(inputs, output)

In [None]:
cnn_model = get_CNN_model()

In [None]:
cnn_model.summary()

In [None]:
cnn_model.compile(loss='mse', 
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0015),
              metrics=["mse"])

In [None]:
cnn_model.fit(X_train, y_train, 
          epochs=5, 
#           callbacks=[check_point_callback, early_stop_callback],
#           validation_data=(X_test, y_test),
             )