This notebook is run with the `ficc_python` package on branch `mitas_increase_sequence_length`. The `example.py` file in the package shows the query that is run to get the data file.

In [1]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict
import os
import gc
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from datetime import datetime

import lightgbm as lgb

from pytorch_lightning import seed_everything

from ficc.utils.auxiliary_variables import VERY_LARGE_NUMBER, \
                                           IDENTIFIERS, \
                                           CATEGORICAL_FEATURES, \
                                           NON_CAT_FEATURES, \
                                           BINARY, \
                                           TRADE_HISTORY, \
                                           NUM_OF_DAYS_IN_YEAR
from ficc.utils.auxiliary_functions import flatten, \
                                           list_to_index_dict
from ficc.utils.diff_in_days import diff_in_days_two_dates
from ficc.utils.trade_dict_to_list import FEATURES_IN_HISTORY, \
                                          FEATURES_TO_INDEX_IN_HISTORY, \
                                          CATEGORICAL_FEATURES_IN_HISTORY, \
                                          quantity_diff
from ficc.utils.trade_dict_to_list_mappings import TRADE_TYPE_MAPPING, \
                                                   TRADE_TYPE_CROSS_PRODUCT_MAPPING, \
                                                   RATING_TO_INT_MAPPING
from ficc.utils.related_trade import append_recent_trade_data, get_appended_feature_name

import sys
sys.path.insert(0,'../')

from trade_history_model_mitas.data_prep import get_past_trade_columns, \
                                                feature_group_as_single_feature, \
                                                limit_history_to_k_trades, \
                                                combine_two_histories_sorted_by_seconds_ago, \
                                                remove_feature_from_trade_history, \
                                                convert_trade_type_encoding_to_actual, \
                                                add_reference_data_to_trade_history, \
                                                embed_with_arrays, \
                                                add_single_trade_from_history_as_reference_features, \
                                                is_sorted
from trade_history_model_mitas.models import MultipleRecurrentL1Loss, \
                                             NNL1LossEmbeddingsWithMultipleRecurrence

from yield_spread_model_mitas.data_prep import FEATURES_AND_NAN_REPLACEMENT_VALUES, \
                                               ADDITIONAL_CATEGORICAL_FEATURES, \
                                               get_datestring_from_filename, \
                                               remove_rows_with_feature_value, \
                                               replace_rating_with_standalone_rating, \
                                               add_past_trades_info, \
                                               reverse_order_of_trade_history, \
                                               check_additional_features, \
                                               replace_nan_for_features, \
                                               encode_and_get_encoders, \
                                               encode_with_encoders
from yield_spread_model_mitas.models import single_feature_model, \
                                            RecurrentL1Loss, \
                                            NNL1LossEmbeddings, \
                                            NNL1LossEmbeddingsWithRecurrence
from yield_spread_model_mitas.train import get_train_test_data_trade_datetime, \
                                           get_train_test_data_index, \
                                           is_gpu_available, \
                                           is_mps_available, \
                                           train
from yield_spread_model_mitas.tree_models import train_lightgbm_model, \
                                                 get_all_losses_for_single_dataset

from rating_model_mitas.data_prep import read_processed_file_pickle, \
                                         remove_fields_with_single_unique_value, \
                                         remove_rows_with_nan_value

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


We use zero padding instead of nonzero padding since this gives the best performance for the NN models (experiments are at the bottom of this notebook).

In [2]:
# Padding values used to fill in a feature when the corresponding trade does not exist.
# Any feature that is not listed explicitly pads with 0 (the `int` default factory).
# A default of 0 is acceptable for settlement_date_to_calc_date because we exclude bonds that have a calc date that is fewer than 400 days into the future, and so a true value of settlement date to calc date will never be close to 0.
DEFAULT_VALUES_NONZERO_PADDING = defaultdict(
    int,
    {
        # model should learn that a quantity diff that is very large, i.e., current quantity is much smaller than quantity being compared to, means that the trade is less meaningful to use for pricing since the trades are very different. Could also be -np.log10(VERY_LARGE_NUMBER) for the same reason
        'quantity_diff': np.log10(VERY_LARGE_NUMBER),
        # model should learn that if the trade being compared to is far back in the past, then it is less meaningful to pricing the current trade
        'seconds_ago': np.log10(VERY_LARGE_NUMBER),
    },
)

# every feature pads with 0
DEFAULT_VALUES_ZERO_PADDING = defaultdict(int)

# padding scheme that the rest of the notebook reads from
DEFAULT_VALUES = DEFAULT_VALUES_ZERO_PADDING

In [3]:
# maximum number of past trades in the history
NUM_TRADES_IN_TRADE_HISTORY = 32

In [4]:
TARGET = ['yield_spread']    # column that the models predict; kept as a list so it can be concatenated with the other lists of column names

In [5]:
# Columns that are needed to prepare the data but that are dropped before the data is given to a model.
DATA_PROCESSING_FEATURES = [
    # used to split the data into training and test sets
    'trade_datetime',
    # used (in conjunction with calc_date) to create the settlement_date_to_calc_date feature in past trades
    'settlement_date',
    # used (in conjunction with settlement_date) to create the settlement_date_to_calc_date feature in past trades
    'calc_date',
    # added in the past trades
    'calc_day_cat',
    # new feature created in this notebook to be used by related trades
    'days_to_calc_date',
    # 'coupon_type',    # used to group related trades; currently commented out since there is only a single value of 8 present in the data
]

In [6]:
%%time
# Location of the pickled, already processed trade data (relative to the directory this notebook is run from)
processed_file_pickle = '../../../../ficc/ml_models/sequence_predictors/data/processed_data_ficc_ycl_long_history_2022-10-08-00-00.pkl'
# presumably the date portion of the filename (here 2022-10-08-00-00) — TODO confirm against `get_datestring_from_filename`
processed_file_pickle_datestring = get_datestring_from_filename(processed_file_pickle)
# NOTE(review): unpickling executes arbitrary code, so only point this at files from a trusted source
trade_data = read_processed_file_pickle(processed_file_pickle)

START: Reading from processed file at ../../../../ficc/ml_models/sequence_predictors/data/processed_data_ficc_ycl_long_history_2022-10-08-00-00.pkl
END: Reading from processed file at ../../../../ficc/ml_models/sequence_predictors/data/processed_data_ficc_ycl_long_history_2022-10-08-00-00.pkl
CPU times: user 7.84 s, sys: 8.11 s, total: 16 s
Wall time: 18.6 s


Keep only the trades before October 8, 2022, to standardize with Charles and Developer.

In [7]:
# keep only the trades strictly before midnight at the start of October 8, 2022
trade_data = trade_data.loc[trade_data['trade_datetime'] < datetime(2022, 10, 8)]

Apply exclusions.

In [8]:
# number of trades after the date filter and before the exclusions below are applied
print(f'Total number of trades: {len(trade_data)}')

Total number of trades: 3360105


In [9]:
%%time
# Row-level exclusions, applied one after the other.
# NOTE(review): the thresholds are written as np.log10(...), which suggests that the `days_to_*` columns hold log10-transformed day counts — confirm against the data pipeline.
trade_data = (
    trade_data
    # days_to_call is either exactly 0 or above log10(400)
    .loc[lambda df: (df['days_to_call'] == 0) | (df['days_to_call'] > np.log10(400))]
    # days_to_refund is either exactly 0 or above log10(400)
    .loc[lambda df: (df['days_to_refund'] == 0) | (df['days_to_refund'] > np.log10(400))]
    # days_to_maturity is below log10(30000)
    .loc[lambda df: df['days_to_maturity'] < np.log10(30000)]
    # only bonds where `sinking` is False
    .loc[lambda df: df['sinking'] == False]
    # drop bonds incorporated in VI and GU
    .loc[lambda df: df['incorporated_state_code'] != 'VI']
    .loc[lambda df: df['incorporated_state_code'] != 'GU']
)
# exclusions that are currently disabled:
# trade_data = trade_data[(trade_data.coupon_type == 8)]
# trade_data = trade_data[trade_data.is_called == False]

# restructured bonds and high chance of default bonds are removed
trade_data = remove_rows_with_feature_value(
    trade_data,
    'purpose_sub_class',
    [6, 20, 22, 44, 57, 90],
)
# pre-refunded bonds and partially refunded bonds are removed
trade_data = remove_rows_with_feature_value(
    trade_data,
    'called_redemption_type',
    [18, 19],
)
(df["purpose_sub_class"] != 6) & (df["purpose_sub_class"] != 20) & (df["purpose_sub_class"] != 22) & (df["purpose_sub_class"] != 44) & (df["purpose_sub_class"] != 57) & (df["purpose_sub_class"] != 90)
11842 rows had purpose_sub_class in [6, 20, 22, 44, 57, 90] and were removed
(df["called_redemption_type"] != 18) & (df["called_redemption_type"] != 19)
11044 rows had called_redemption_type in [18, 19] and were removed
CPU times: user 6.97 s, sys: 5.5 s, total: 12.5 s
Wall time: 13.3 s


In [10]:
# presumably overwrites the `rating` column with the standalone rating where one is available — TODO confirm against `replace_rating_with_standalone_rating`
trade_data = replace_rating_with_standalone_rating(trade_data)

Add `ficc_treasury_spread` to the `NON_CAT_FEATURES`.

In [11]:
# `list.append` is not idempotent: the membership check keeps the list free of duplicates when this cell is run more than once
if 'ficc_treasury_spread' not in NON_CAT_FEATURES:
    NON_CAT_FEATURES.append('ficc_treasury_spread')

Add `days_to_calc_date` to `trade_data`.

In [12]:
# log10(1 + number of days between the settlement date and the calc date)
trade_data['days_to_calc_date'] = np.log10(1 + (trade_data['calc_date'] - trade_data['settlement_date']).dt.days)
# The failure message must select rows and a column with `.loc`; plain `trade_data[mask, column]` raises a TypeError, which would hide the actual assertion failure
assert not trade_data['days_to_calc_date'].isnull().values.any(), f'`days_to_calc_date` is null for the following RTRS control numbers: {trade_data.loc[trade_data["days_to_calc_date"].isnull().values, "rtrs_control_number"].values}'

In [None]:
ADDITIONAL_CATEGORICAL_FEATURES = check_additional_features(trade_data, ADDITIONAL_CATEGORICAL_FEATURES)

# fill in missing values, then drop the candidate feature columns that carry no information (a single unique value)
trade_data, _ = replace_nan_for_features(trade_data, FEATURES_AND_NAN_REPLACEMENT_VALUES, verbose=True)
trade_data = remove_fields_with_single_unique_value(trade_data, BINARY + CATEGORICAL_FEATURES + ADDITIONAL_CATEGORICAL_FEATURES + NON_CAT_FEATURES)

# Restrict each feature list to the columns that are still present.
# The lists are filtered in place of a set intersection because the iteration order of a set of strings changes
# between interpreter sessions (hash randomization), which would change the column order given to the models
# from run to run. `dict.fromkeys` removes duplicates while keeping the first occurrence.
all_features_set = set(trade_data.columns)
BINARY = [feature for feature in dict.fromkeys(BINARY) if feature in all_features_set]
CATEGORICAL_FEATURES = [feature for feature in dict.fromkeys(list(CATEGORICAL_FEATURES) + list(ADDITIONAL_CATEGORICAL_FEATURES)) if feature in all_features_set]
NON_CAT_FEATURES = [feature for feature in dict.fromkeys(NON_CAT_FEATURES) if feature in all_features_set]
PREDICTORS = BINARY + CATEGORICAL_FEATURES + NON_CAT_FEATURES

# keep only the columns that are used from here on
trade_data = trade_data[IDENTIFIERS + 
                        PREDICTORS + 
                        DATA_PROCESSING_FEATURES + 
                        TRADE_HISTORY + 
                        TARGET]

trade_data = remove_rows_with_nan_value(trade_data)

In [None]:
# Show the final feature lists; `sorted` returns a new list, so the lists themselves are left in their original order
print(f'Identifiers: {sorted(IDENTIFIERS)}')
print(f'Predictors: {sorted(PREDICTORS)}')
print(f'Binary features: {sorted(BINARY)}')
print(f'Categorical features: {sorted(CATEGORICAL_FEATURES)}')
print(f'Numerical features: {sorted(NON_CAT_FEATURES)}')

In [None]:
# predictors whose name does not start with 'last'
PREDICTORS_WITHOUT_LAST_TRADE_FEATURES = list(filter(lambda name: not name.startswith('last'), PREDICTORS))
print(f'The following features are in PREDICTORS but not in PREDICTORS_WITHOUT_LAST_TRADE_FEATURES: {set(PREDICTORS) - set(PREDICTORS_WITHOUT_LAST_TRADE_FEATURES)}')

In [None]:
# Ensure that the dataframe is sorted in descending order by `trade_datetime` (newest trade first); the oldest and newest trade are read by position (`.iloc[-1]` and `.iloc[0]`) further down
assert is_sorted(trade_data['trade_datetime'], ascending=False)

In [None]:
# the dataframe is sorted newest first, so the last row is the oldest trade and the first row is the newest trade
oldest_trade_datetime = trade_data['trade_datetime'].iloc[-1]
newest_trade_datetime = trade_data['trade_datetime'].iloc[0]

# the backslash continues the string literal, so the indentation of the continuation lines is part of the printed text
print(f'Oldest trade datetime: {oldest_trade_datetime}.\
    Newest trade datetime: {newest_trade_datetime}.\
    Gap: {newest_trade_datetime - oldest_trade_datetime}')
print(f'Total number of trades: {len(trade_data)}')

Create a dataset with only the reference data.

In [None]:
# trade datetime used to split the data into training and test sets: September 15 2022
DATE_TO_SPLIT = datetime(year=2022, month=9, day=15)

In [None]:
train_data, test_data = get_train_test_data_trade_datetime(trade_data, DATE_TO_SPLIT)
print(f'Number of trades for training: {len(train_data)}.\
    Number of trades for testing: {len(test_data)}')
assert len(train_data) != 0 and len(test_data) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'

# model inputs that still contain the trade history columns: everything except the data processing and identifier columns
train_data_with_trade_history, test_data_with_trade_history = (
    split.drop(columns=DATA_PROCESSING_FEATURES + IDENTIFIERS)
    for split in (train_data, test_data)
)
# reference data only: additionally remove the trade history columns
train_data_only_reference, test_data_only_reference = (
    split.drop(columns=TRADE_HISTORY)
    for split in (train_data_with_trade_history, test_data_with_trade_history)
)

Flatten the trade history. The flattened data is used in the LightGBM models.

In [None]:
%%time
# NOTE(review): the second argument is NUM_TRADES_IN_TRADE_HISTORY - 1; presumably `add_past_trades_info` expects the largest zero-based trade index — confirm
(
    trade_data_flattened_trade_history,
    additional_binary_features_from_past_trades,
    additional_noncat_features_from_past_trades,
    past_trade_feature_groups,
) = add_past_trades_info(
    trade_data,
    NUM_TRADES_IN_TRADE_HISTORY - 1,
    FEATURES_TO_INDEX_IN_HISTORY,
)
# single flat list with the column names of every past trade
past_trade_feature_groups_flattened = flatten(past_trade_feature_groups)
print(f'Each of the past trades are in the following feature groups: {past_trade_feature_groups}')

In [None]:
# flattened past trade columns + reference predictors + the columns needed for the split + the target
trade_data_history_and_reference_features = trade_data_flattened_trade_history[
    past_trade_feature_groups_flattened
    + PREDICTORS_WITHOUT_LAST_TRADE_FEATURES
    + DATA_PROCESSING_FEATURES
    + TARGET
]
(
    train_data_history_and_reference_features,
    test_data_history_and_reference_features,
) = get_train_test_data_trade_datetime(trade_data_history_and_reference_features, DATE_TO_SPLIT)
assert len(train_data_history_and_reference_features) != 0 and len(test_data_history_and_reference_features) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'

# the data processing columns were only needed to perform the split
train_data_history_and_reference_features, test_data_history_and_reference_features = (
    split.drop(columns=DATA_PROCESSING_FEATURES)
    for split in (train_data_history_and_reference_features, test_data_history_and_reference_features)
)
# history only: additionally remove the reference predictors
train_data_only_history, test_data_only_history = (
    split.drop(columns=PREDICTORS_WITHOUT_LAST_TRADE_FEATURES)
    for split in (train_data_history_and_reference_features, test_data_history_and_reference_features)
)

Decode the trade type from a 2-dimensional binary list to its original value for trades in the history. For example, a trade type of `[0, 1]` would be decoded to `S`.

In [None]:
TRADE_TYPE_NEW_COLUMN = 'trade_type'    # name given to the decoded trade type feature of a past trade

In [None]:
SAME_CUSIP_PREFIX = 'last_'    # prefix passed to the helpers that build or look up the column names of the same CUSIP trade history

In [None]:
(
    trade_data_history_and_reference_features_actual_trade_type,
    old_trade_type_columns,
    _,
) = convert_trade_type_encoding_to_actual(
    trade_data_history_and_reference_features,
    NUM_TRADES_IN_TRADE_HISTORY,
    TRADE_TYPE_NEW_COLUMN,
    SAME_CUSIP_PREFIX,
)
# free the dataframe with the old encoding right away to keep the memory usage down
del trade_data_history_and_reference_features
gc.collect()

# the encoded trade type columns have been replaced by the decoded one
trade_data_history_and_reference_features_actual_trade_type = (
    trade_data_history_and_reference_features_actual_trade_type.drop(columns=old_trade_type_columns)
)
(
    train_data_history_and_reference_features_actual_trade_type,
    test_data_history_and_reference_features_actual_trade_type,
) = get_train_test_data_trade_datetime(trade_data_history_and_reference_features_actual_trade_type, DATE_TO_SPLIT)
# the unsplit dataframe is no longer needed
del trade_data_history_and_reference_features_actual_trade_type
gc.collect()
assert len(train_data_history_and_reference_features_actual_trade_type) != 0 and len(test_data_history_and_reference_features_actual_trade_type) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'

# the data processing columns were only needed to perform the split
(
    train_data_history_and_reference_features_actual_trade_type,
    test_data_history_and_reference_features_actual_trade_type,
) = (
    split.drop(columns=DATA_PROCESSING_FEATURES)
    for split in (
        train_data_history_and_reference_features_actual_trade_type,
        test_data_history_and_reference_features_actual_trade_type,
    )
)
# history only: additionally remove the reference predictors
train_data_only_history_actual_trade_type, test_data_only_history_actual_trade_type = (
    split.drop(columns=PREDICTORS_WITHOUT_LAST_TRADE_FEATURES)
    for split in (
        train_data_history_and_reference_features_actual_trade_type,
        test_data_history_and_reference_features_actual_trade_type,
    )
)

Check that the above procedure removed just one feature for each past trade (`trade_type1` and `trade_type2` combine to just `trade_type`).

In [None]:
# two encoded columns became one decoded column for every past trade, so the number of columns must have dropped by exactly NUM_TRADES_IN_TRADE_HISTORY
assert len(test_data_only_history_actual_trade_type.columns) == len(test_data_only_history.columns) - NUM_TRADES_IN_TRADE_HISTORY, f'Before converting, the dataframe had {len(test_data_only_history.columns)} columns, and after converting, the dataframe has {len(test_data_only_history_actual_trade_type.columns)} columns'

# Last Yield Spread
Weakest baseline, where the output is just the yield spread of the most recent previous trade in the same CUSIP.

In [None]:
# baseline on all of the trades (training and test together); presumably reports the error of using `last_yield_spread` directly as the prediction — TODO confirm against `single_feature_model`
single_feature_model(trade_data, 'last_yield_spread')

In [None]:
# same baseline, restricted to the training set
single_feature_model(train_data_only_history, 'last_yield_spread')

In [None]:
# same baseline, restricted to the test set
single_feature_model(test_data_only_history, 'last_yield_spread')

# LightGBM (only reference data)
LightGBM baseline with just the reference data.

In [None]:
# LightGBM on the reference data only; the third argument names the categorical columns
lgb_model, lgb_losses = train_lightgbm_model(
    train_data_only_reference,
    test_data_only_reference,
    CATEGORICAL_FEATURES,
    wandb_project='mitas_trade_history',
)

In [None]:
# feature importances of the reference-data-only model, measured by gain
lgb.plot_importance(lgb_model, figsize=(20, 10), importance_type='gain')

# Feedforward NN (only reference data)

In [None]:
def make_data_filename(name):
    """Return the path of the pickle file inside `data/` that is used to save the data object called `name`."""
    return f'data/{name}.pkl'


# `exist_ok=True` makes the call a no-op when the directory is already there, without a separate existence check
os.makedirs('data/', exist_ok=True)

Perform encoding of categorical features.

In [None]:
%%time
# fit the encoders on the training data only; the same fitted encoders are then applied to every other dataset
train_data_only_reference_encoded, label_encoders = encode_and_get_encoders(train_data_only_reference, BINARY, CATEGORICAL_FEATURES)

# save the fitted encoders so that they can be reused outside of this notebook
label_encoders_filepath = make_data_filename('label_encoders')
with open(label_encoders_filepath, 'wb') as pickle_handle:
    pickle.dump(label_encoders, pickle_handle, protocol=4)    # protocol 4 allows for use in the VM; use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object


def encode_with_label_encoders(df, features_to_exclude=None):
    """Encode `df` with the encoders fitted on the training data.

    `features_to_exclude` is forwarded to `encode_with_encoders`; when it is omitted, a fresh empty
    list is passed on every call instead of one list object shared between all calls.
    """
    if features_to_exclude is None:
        features_to_exclude = []
    return encode_with_encoders(df, label_encoders, features_to_exclude)


test_data_only_reference_encoded = encode_with_label_encoders(test_data_only_reference)
train_data_with_trade_history_encoded = encode_with_label_encoders(train_data_with_trade_history)
test_data_with_trade_history_encoded = encode_with_label_encoders(test_data_with_trade_history)

Set up NN training.

In [None]:
def make_filename(name):
    """Return the path of the file inside `pt/` that is used to save the PyTorch model parameters called `name`."""
    return f'pt/{name}.pt'


# `exist_ok=True` makes the call a no-op when the directory is already there, without a separate existence check
os.makedirs('pt/', exist_ok=True)

In [None]:
BATCH_SIZE = 1000
# worker processes are only requested when an accelerator (GPU or MPS) is reported as available; otherwise 0
NUM_WORKERS = 8 if is_gpu_available() or is_mps_available() else 0
NUM_EPOCHS = 100

# fix the random seeds for reproducibility
SEED = 1
seed_everything(SEED, workers=True)

Use the values giving the highest accuracy (from a very limited hyperparameter search) on the reference data from `yield_spread_model_mitas/yield_spread_model.ipynb`.

In [None]:
# architecture hyperparameters; they are passed to the NN model constructor and are part of the run name
NUM_HIDDEN_LAYERS = 3
NUM_NODES_HIDDEN_LAYER = 600
EMBEDDINGS_POWER = 0.5    # passed to the model as `power`

In [None]:
# run name built from the hyperparameters; used for the saved model filename and as the wandb logging name
nn_name = f'embeddings_power={EMBEDDINGS_POWER}_{NUM_HIDDEN_LAYERS}_hidden_layers_{NUM_NODES_HIDDEN_LAYER}_nodes_per_layer_{NUM_EPOCHS}_epochs'

In [None]:
%%time
# feedforward NN with embeddings, trained on the encoded reference data only
train(
    NNL1LossEmbeddings(
        BATCH_SIZE,
        NUM_WORKERS,
        train_data_only_reference_encoded,
        test_data_only_reference_encoded,
        label_encoders,
        CATEGORICAL_FEATURES,
        NUM_NODES_HIDDEN_LAYER,
        NUM_HIDDEN_LAYERS,
        power=EMBEDDINGS_POWER,
    ),
    NUM_EPOCHS,
    model_filename=make_filename(nn_name),
    save=False,
    # setting either of the following two arguments to True may cause the kernel to crash
    print_losses_before_training=False,
    print_losses_after_training=False,
    wandb_logging_name=nn_name,
)

# LightGBM
Train LightGBM models that vary the number of trades in the history for the same CUSIP.
## Only trade history (no reference features)

In [None]:
def train_lightgbm_model_only_history(num_past_trades):
    """Train a LightGBM model that only uses the same CUSIP trade history.

    The columns for `num_past_trades` past trades are looked up with `get_past_trade_columns`
    (using the decoded trade type column) and, together with the target, selected from the
    history-only train and test dataframes.

    Returns the result of `train_lightgbm_model` unchanged; callers in this notebook unpack it as
    `(model, losses)`.
    """
    history_columns, history_categorical_features = get_past_trade_columns(
        num_past_trades,
        FEATURES_IN_HISTORY,
        SAME_CUSIP_PREFIX,
        trade_type_actual=True,
        trade_type_column=TRADE_TYPE_NEW_COLUMN,
        categorical_features_per_trade=CATEGORICAL_FEATURES_IN_HISTORY,
    )
    model_columns = history_columns + TARGET
    train_frame = train_data_only_history_actual_trade_type[model_columns]
    test_frame = test_data_only_history_actual_trade_type[model_columns]
    return train_lightgbm_model(
        train_frame,
        test_frame,
        history_categorical_features,
        wandb_project='mitas_trade_history',
    )

In [None]:
%%time
# maximum number of past trades is NUM_TRADES_IN_TRADE_HISTORY (32), since that is the number of past trades in the data pipeline
num_past_trades_in_history_candidates = list(range(1, NUM_TRADES_IN_TRADE_HISTORY + 1))
train_l1_losses = []
test_l1_losses = []
for num_past_trades in num_past_trades_in_history_candidates:
    lgb_model, lgb_losses = train_lightgbm_model_only_history(num_past_trades)
    if num_past_trades == 16:
        # keep the model with 16 past trades for the feature importance plot
        lgb_model_16, lgb_model_16_losses = lgb_model, lgb_losses
    # index 0 indicates l1 loss
    train_l1_losses.append(lgb_losses['Train'][0])
    test_l1_losses.append(lgb_losses['Test'][0])

In [None]:
# feature importances (by gain) of the history-only model with 16 past trades; skipped when the history holds fewer than 16 trades
if 16 <= NUM_TRADES_IN_TRADE_HISTORY:
    if 'lgb_model_16' not in locals(): lgb_model_16, lgb_model_16_losses = train_lightgbm_model_only_history(16)    # if model not created, then train the lightgbm model
    lgb.plot_importance(lgb_model_16, figsize=(20, 25), importance_type='gain')

In [None]:
# index 4 holds the loss for 5 past trades, since the candidates start at 1 past trade
print(f'Train error for 5 trades: {train_l1_losses[4]}')
print(f'Minimum value: {min(train_l1_losses)} for number of trades: {num_past_trades_in_history_candidates[np.argmin(train_l1_losses)]}')
plt.ylabel('Train L1 losses')
plt.xlabel('Number of trades in history')
plt.plot(num_past_trades_in_history_candidates, train_l1_losses, 'o')

In [None]:
# index 4 holds the loss for 5 past trades, since the candidates start at 1 past trade
print(f'Test error for 5 trades: {test_l1_losses[4]}')
print(f'Minimum value: {min(test_l1_losses)} for number of trades: {num_past_trades_in_history_candidates[np.argmin(test_l1_losses)]}')
plt.ylabel('Test L1 losses')
plt.xlabel('Number of trades in history')
plt.plot(num_past_trades_in_history_candidates, test_l1_losses, 'o')

## Reference features and trade history

In [None]:
def train_lightgbm_model_history_and_reference(num_past_trades):
    """Train a LightGBM model that uses the reference features and the same CUSIP trade history.

    The columns for `num_past_trades` past trades are looked up with `get_past_trade_columns`
    (using the decoded trade type column) and selected, after the reference predictors and before
    the target, from the train and test dataframes. The categorical features are the reference
    ones followed by the ones of the trade history.

    Returns the result of `train_lightgbm_model` unchanged; callers in this notebook unpack it as
    `(model, losses)`.
    """
    history_columns, history_categorical_features = get_past_trade_columns(
        num_past_trades,
        FEATURES_IN_HISTORY,
        SAME_CUSIP_PREFIX,
        trade_type_actual=True,
        trade_type_column=TRADE_TYPE_NEW_COLUMN,
        categorical_features_per_trade=CATEGORICAL_FEATURES_IN_HISTORY,
    )
    model_columns = PREDICTORS_WITHOUT_LAST_TRADE_FEATURES + history_columns + TARGET
    train_frame = train_data_history_and_reference_features_actual_trade_type[model_columns]
    test_frame = test_data_history_and_reference_features_actual_trade_type[model_columns]
    return train_lightgbm_model(
        train_frame,
        test_frame,
        CATEGORICAL_FEATURES + history_categorical_features,
        wandb_project='mitas_trade_history',
    )

In [None]:
%%time
# maximum number of past trades is NUM_TRADES_IN_TRADE_HISTORY (32), since that is the number of past trades in the data pipeline
num_past_trades_in_history_candidates = list(range(1, NUM_TRADES_IN_TRADE_HISTORY + 1))
train_l1_losses = []
test_l1_losses = []
for num_past_trades in num_past_trades_in_history_candidates:
    lgb_model, lgb_losses = train_lightgbm_model_history_and_reference(num_past_trades)
    if num_past_trades == 16:
        # keep the model with 16 past trades for the feature importance plot
        lgb_model_16, lgb_model_16_losses = lgb_model, lgb_losses
    # index 0 indicates l1 loss
    train_l1_losses.append(lgb_losses['Train'][0])
    test_l1_losses.append(lgb_losses['Test'][0])

In [None]:
# feature importances (by gain) of the model with reference features and 16 past trades; skipped when the history holds fewer than 16 trades
# NOTE(review): `lgb_model_16` is also assigned in the history-only section, so the existence check below is satisfied by that
# older model if the training loop of this section was not run — run the cells in order
if 16 <= NUM_TRADES_IN_TRADE_HISTORY:
    if 'lgb_model_16' not in locals(): lgb_model_16, lgb_model_16_losses = train_lightgbm_model_history_and_reference(16)    # if model not created, then train the lightgbm model
    print(f'Train error: {lgb_model_16_losses["Train"][0]}\tTest error: {lgb_model_16_losses["Test"][0]}')
    lgb.plot_importance(lgb_model_16, figsize=(20, 25), importance_type='gain')

In [None]:
# index 4 holds the loss for 5 past trades, since the candidates start at 1 past trade
print(f'Train error for 5 trades: {train_l1_losses[4]}')
print(f'Minimum value: {min(train_l1_losses)} for number of trades: {num_past_trades_in_history_candidates[np.argmin(train_l1_losses)]}')
plt.ylabel('Train L1 losses')
plt.xlabel('Number of trades in history')
plt.plot(num_past_trades_in_history_candidates, train_l1_losses, 'o')

In [None]:
# index 4 holds the loss for 5 past trades, since the candidates start at 1 past trade
print(f'Test error for 5 trades: {test_l1_losses[4]}')
print(f'Minimum value: {min(test_l1_losses)} for number of trades: {num_past_trades_in_history_candidates[np.argmin(test_l1_losses)]}')
plt.ylabel('Test L1 losses')
plt.xlabel('Number of trades in history')
plt.plot(num_past_trades_in_history_candidates, test_l1_losses, 'o')

Our current design looks at just the 5 previous trades, but this plot indicates that having more trades in the history increases predictive power.

# Add histories of related trades
For each trade, we find the `NUM_TRADES_IN_RELATED_TRADE_HISTORY` most recent related trades (up until the `trade_datetime` of the current trade) that are from different CUSIPs.

In [None]:
NUM_TRADES_IN_RELATED_TRADE_HISTORY = 1    # number of most recent related trades (from different CUSIPs) that are found for each trade

Use certain reference data columns in order to augment the same CUSIP history. These will be added when selecting the features to use in the LightGBM model.

In [None]:
CATEGORICAL_REFERENCE_FEATURES_TO_ADD = ['rating', 'incorporated_state_code', 'purpose_sub_class']    # choosing a few features from the most important features for the LightGBM model on just reference data
# Make sure that all CATEGORICAL_REFERENCE_FEATURES_TO_ADD are in the trade data as columns.
# The list is filtered in place of a set intersection because the iteration order of a set of strings changes between
# interpreter sessions (hash randomization), and the order of this list determines the order of the added features.
CATEGORICAL_REFERENCE_FEATURES_TO_ADD = [feature for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD if feature in trade_data.columns]
print(f'Including the following reference features for each related trade and the target trade: {CATEGORICAL_REFERENCE_FEATURES_TO_ADD}')

Add this same reference data to related trade history. However, to make this code faster, we first encode the reference data, and then decode after creating the past trade history. This creates a speedup because `append_recent_trade_data(...)` is significantly faster when working with numerical data as opposed to objects, due to how numpy handles `dtype='O'` versus `dtype='np.float_'`.

In [None]:
# boolean variable that determines whether trade history will contain categorical features that must be encoded before adding these features to the trade history
ENCODE_REFERENCE_FEATURES = False

In [None]:
# give every added reference feature that is not part of the history yet the next free index
for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
    if feature in FEATURES_TO_INDEX_IN_HISTORY:
        continue
    FEATURES_TO_INDEX_IN_HISTORY[feature] = len(FEATURES_TO_INDEX_IN_HISTORY)

# encoding is needed as soon as there is at least one reference feature to add
if len(CATEGORICAL_REFERENCE_FEATURES_TO_ADD) > 0:
    ENCODE_REFERENCE_FEATURES = True

In [None]:
related_trade_feature_prefix = 'related_last_'


def get_neighbor_feature(feature):
    """Return a function of (curr, neighbor) that reads `feature` from the neighbor, i.e., the related trade."""
    def read_from_neighbor(curr, neighbor):
        return neighbor[feature]
    return read_from_neighbor


# key: name of the feature; value: function of the current trade and the related trade that computes the feature.
# The insertion order of the dictionary determines the order of the features.
RELATED_TRADE_FEATURE_FUNCTIONS = {
    'yield_spread': get_neighbor_feature('yield_spread'),
    'treasury_spread': get_neighbor_feature('ficc_treasury_spread'),
    'quantity': get_neighbor_feature('quantity'),
    'quantity_diff': lambda curr, neighbor: quantity_diff(10 ** neighbor['quantity'] - 10 ** curr['quantity']),
    'trade_type1': lambda curr, neighbor: TRADE_TYPE_MAPPING[neighbor['trade_type']][0],
    'trade_type2': lambda curr, neighbor: TRADE_TYPE_MAPPING[neighbor['trade_type']][1],
    'seconds_ago': lambda curr, neighbor: np.log10(1 + (curr['trade_datetime'] - neighbor['trade_datetime']).total_seconds()),
    'settlement_date_to_calc_date': lambda curr, neighbor: np.log10(1 + diff_in_days_two_dates(neighbor['calc_date'], neighbor['settlement_date'], convention='exact')),
    'calc_day_cat': get_neighbor_feature('calc_day_cat'),
    'trade_type_past_latest': lambda curr, neighbor: TRADE_TYPE_CROSS_PRODUCT_MAPPING[neighbor['trade_type'] + curr['trade_type']],
    # 'rating_diff': lambda curr, neighbor: RATING_TO_INT_MAPPING[curr['rating']] - RATING_TO_INT_MAPPING[neighbor['rating']]
}

# column names of the features above, one group per related trade; insertion order of the dictionary is preserved for Python v3.7+
related_trades_features_wo_reference_features_groups = [
    [
        get_appended_feature_name(trade_idx, feature_name, related_trade_feature_prefix)
        for feature_name in RELATED_TRADE_FEATURE_FUNCTIONS
    ]
    for trade_idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY)
]
related_trades_features_wo_reference_features = flatten(related_trades_features_wo_reference_features_groups)

# used to track additional information about the related trades
additional_related_trade_functions = {
    # compare date only instead of entire datetime: https://stackoverflow.com/questions/3743222/how-do-i-convert-a-datetime-to-date
    'same_day': lambda curr, neighbor: int(neighbor['trade_datetime'].date() == curr['trade_datetime'].date()),
    'days_to_calc_date': get_neighbor_feature('days_to_calc_date'),
}

reference_features_to_add_functions = {
    feature_name: get_neighbor_feature(feature_name)
    for feature_name in CATEGORICAL_REFERENCE_FEATURES_TO_ADD
}
# combine the three dictionaries; for a repeated key the position of the first occurrence and the value of the last occurrence are kept
RELATED_TRADE_FEATURE_FUNCTIONS = {
    **RELATED_TRADE_FEATURE_FUNCTIONS,
    **additional_related_trade_functions,
    **reference_features_to_add_functions,
}

print(f'Related trades will have the following features: {RELATED_TRADE_FEATURE_FUNCTIONS.keys()}')

# column names of all of the features, one group per related trade; insertion order of the dictionary is preserved for Python v3.7+
related_trades_features_groups = [
    [
        get_appended_feature_name(trade_idx, feature_name, related_trade_feature_prefix)
        for feature_name in RELATED_TRADE_FEATURE_FUNCTIONS
    ]
    for trade_idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY)
]
related_trades_features = flatten(related_trades_features_groups)

In [None]:
# Maps feature name -> (function, default value).
#   function: takes the current trade and the related trade and returns the feature value
#   default value: filled in when that value does not exist
RELATED_TRADE_FEATURE_FUNCTIONS_AND_DEFAULT_VALUES = dict(
    (key, (function, DEFAULT_VALUES[key]))
    for key, function in RELATED_TRADE_FEATURE_FUNCTIONS.items()
)

See which features are in the same-CUSIP trade history and which are in the related trade history.

In [None]:
# Disabled ordering check (dictionary insertion order is preserved for Python v3.7+, so this would also compare the ordering of the keys):
# assert FEATURES_IN_HISTORY == [key for key in RELATED_TRADE_FEATURE_FUNCTIONS if key not in additional_related_trade_functions and key not in reference_features_to_add_functions]
print(f'Each trade in the same CUSIP trade history has the following features: {FEATURES_IN_HISTORY}',
      f'Each trade in the related trade history has the following features: {RELATED_TRADE_FEATURE_FUNCTIONS.keys()}',
      sep='\n')

Create "quantized features", which group together certain values of a feature so that they count as the same category when matching related trades. For example, `RATING_WITHOUT_PLUS_MINUS` removes the + and - from ratings, and so a bond with rating A+ will be related to a bond with rating A or A-.

Purpose class was added as a quantized feature based on a call with Desmond Dahill from Tegus on 09/27/2022.

In [None]:
%%time
epsilon = 1 / VERY_LARGE_NUMBER    # tiny positive number used to nudge bin edges


def _add_binned_feature(new_feature, source_feature, bin_edges, **cut_kwargs):
    """Bin `source_feature` with `pd.cut` and store the interval labels as strings in the column `new_feature`."""
    binned_values = pd.cut(trade_data_flattened_trade_history[source_feature], bin_edges, **cut_kwargs)
    trade_data_flattened_trade_history[new_feature] = binned_values.astype('string')
    print(f'Created {new_feature} feature')


def _add_top_values_feature(new_feature, source_feature, num_top_values):
    """Copy `source_feature` into `new_feature`, overwriting every value that is not among the `num_top_values` most frequent ones with -1. Returns the retained values."""
    trade_data_flattened_trade_history[new_feature] = trade_data_flattened_trade_history[source_feature]
    top_values = trade_data_flattened_trade_history[source_feature].value_counts().head(num_top_values).index.tolist()
    outside_top_values = ~trade_data_flattened_trade_history[source_feature].isin(top_values)
    trade_data_flattened_trade_history.loc[outside_top_values, new_feature] = -1    # arbitrary numerical value that is invalid for the source feature
    print(f'Created {new_feature} feature')
    return top_values


# --- rating: strip the +/- modifier and merge the sparsely populated ratings ---
RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED = 'rating_without_+-_b_nr_combined'
trade_data_flattened_trade_history[RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED] = trade_data_flattened_trade_history['rating'].transform(lambda rating: rating.rstrip('+-'))    # remove + and - from right side of string
# group BBB, BB, B, and NR together since each have a very small number of trades
b_ratings = trade_data_flattened_trade_history[RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED].isin(['B', 'BB', 'BBB', 'NR'])
trade_data_flattened_trade_history.loc[b_ratings, RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED] = 'B'
print(f'Created {RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED} feature')

# --- days to maturity: bin edges are log10 of a number of days (presumably because the column is stored on a log10 scale; confirm) ---
DAYS_TO_MATURITY_CATEGORICAL = 'days_to_maturity_categorical'
num_of_days_bins_maturity = list(map(np.log10, [epsilon, NUM_OF_DAYS_IN_YEAR * 2, NUM_OF_DAYS_IN_YEAR * 5, NUM_OF_DAYS_IN_YEAR * 10, VERY_LARGE_NUMBER]))    # 2 years, 5 years, 10 years; arbitrarily chosen
_add_binned_feature(DAYS_TO_MATURITY_CATEGORICAL, 'days_to_maturity', num_of_days_bins_maturity)

# --- days to call: same construction with fewer edges ---
DAYS_TO_CALL_CATEGORICAL = 'days_to_call_categorical'
num_of_days_bins_call = list(map(np.log10, [epsilon, NUM_OF_DAYS_IN_YEAR * 2, NUM_OF_DAYS_IN_YEAR * 5, VERY_LARGE_NUMBER]))    # 2 years, 5 years; arbitrarily chosen
_add_binned_feature(DAYS_TO_CALL_CATEGORICAL, 'days_to_call', num_of_days_bins_call)

# --- coupon: left-closed bins, so a coupon equal to an edge goes into the bin that starts at that edge ---
COUPON_CATEGORICAL = 'coupon_categorical'
coupon_bins = [0, 3, 4, 4.5, 5.0 + epsilon, VERY_LARGE_NUMBER]   # 0 - 2.99, 3 - 3.99, 4 - 4.49, 4.5 - 5, and everything above 5; from discussion with a team member
_add_binned_feature(COUPON_CATEGORICAL, 'coupon', coupon_bins, right=False)

COUPON_CATEGORICAL_SUDHAR = 'coupon_categorical_sudhar'
coupon_bins = [0, 3, 4, 4.5, 5, 5.25, 5.5, 6, VERY_LARGE_NUMBER]    # from Sudhar's paper: Kolm, Purushothaman. 2021. Systematic Pricing and Trading of Municipal Bonds; note that this rebinds `coupon_bins`
_add_binned_feature(COUPON_CATEGORICAL_SUDHAR, 'coupon', coupon_bins, right=False)

# COUPON_TOP_VALUES = 'coupon_top_values'
# trade_data_flattened_trade_history[COUPON_TOP_VALUES] = trade_data_flattened_trade_history['coupon']
# top4_coupon_values = trade_data_flattened_trade_history['coupon'].value_counts().head(4).index.tolist()    # select the top 4 coupon values based on frequency in the data, which are: 5.0, 4.0, 3.0, 2.0 comprising about 90% of the data
# trade_data_flattened_trade_history.loc[~trade_data_flattened_trade_history['coupon'].isin(top4_coupon_values), COUPON_TOP_VALUES] = -1    # arbitrary numerical value that is invalid as a coupon value
# print(f'Created {COUPON_TOP_VALUES} feature')

# --- purpose class: keep the top 6 purpose class values based on frequency in the data, which are: 37 (school district), 51 (various purpose), 50 (utility), 46 (tax revenue), 9 (education), 48 (transportation) comprising about 80% of the data ---
PURPOSE_CLASS_TOP_VALUES = 'purpose_class_top_values'
top6_purpose_class_values = _add_top_values_feature(PURPOSE_CLASS_TOP_VALUES, 'purpose_class', 6)

# --- muni security type: keep the top 2 muni security type values based on frequency in the data, which are: 8 (revenue), 5 (unlimited g.o.) comprising about 80% of the data ---
# (the variable keeps its original `top6_` name although it holds only the top 2 values)
MUNI_SECURITY_TYPE_TOP_VALUES = 'muni_security_type_top_values'
top6_muni_security_type_values = _add_top_values_feature(MUNI_SECURITY_TYPE_TOP_VALUES, 'muni_security_type', 2)

# --- trade date: the calendar date of the trade, stored as a string ---
TRADE_DATETIME_DAY = 'trade_datetime_day'
trade_data_flattened_trade_history[TRADE_DATETIME_DAY] = trade_data_flattened_trade_history['trade_datetime'].transform(lambda trade_datetime: trade_datetime.date()).astype('string')    # remove timestamp from datetime
print(f'Created {TRADE_DATETIME_DAY} feature')

# --- quantity: right-closed bins (the `pd.cut` default) ---
QUANTITY_CATEGORICAL = 'quantity_categorical'
quantity_bins = [0, 5, 6, 7, VERY_LARGE_NUMBER]    # 0 - 100k, 100k - 1m, 1m - 10m, 10m+
_add_binned_feature(QUANTITY_CATEGORICAL, 'quantity', quantity_bins)

In [None]:
# Every helper column created for matching related trades; these columns are dropped again once the related trades have been appended.
quantized_features = [
    RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED,
    DAYS_TO_MATURITY_CATEGORICAL,
    DAYS_TO_CALL_CATEGORICAL,
    COUPON_CATEGORICAL,
    COUPON_CATEGORICAL_SUDHAR,
    PURPOSE_CLASS_TOP_VALUES,
    MUNI_SECURITY_TYPE_TOP_VALUES,
    # COUPON_TOP_VALUES,    # disabled together with the code that creates it
    TRADE_DATETIME_DAY,
    QUANTITY_CATEGORICAL,
]

Make sure that each category (for each quantized feature) has a reasonable number of trades.

In [None]:
# one bar chart per quantized feature showing how many trades fall into each category
for feature in quantized_features:
    category_counts = trade_data_flattened_trade_history[feature].value_counts()
    category_counts.plot.bar(title=feature, figsize=(20, 10))
    plt.show()

In [None]:
def quantity_greater_than_100k(row):
    """Return whether the trade's `quantity` (compared on a log10 scale) is at least log10(100,000)."""
    return row['quantity'] >= np.log10(1e5)


def quantity_greater_than_1m(row):
    """Return whether the trade's `quantity` (compared on a log10 scale) is at least log10(1,000,000)."""
    return row['quantity'] >= np.log10(1e6)


def trade_type_is_interdealer(row):
    """Return whether the trade's `trade_type` is 'D'."""
    return row['trade_type'] == 'D'

This link has the below definitions and results: https://docs.google.com/document/d/1rQeB3lM_iEyv9q-rseQPmb8n8nv1ay5UtVdSNH0K0QU/edit?usp=sharing.

In [None]:
# Candidate definitions of a "related" trade.
#   keyword: name of criteria
#   value:   (categories to match, filtering conditions)
# Only the uncommented entries are built and evaluated; uncomment an entry to add it to the experiment.
related_trades_criterion = dict(
    # sudhar1=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], []),
    # sudhar1_100k=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_100k]),
    # sudhar1_1m=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_1m]),
    sudhar2=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, 'trade_type'], []),
    # sudhar2_100k=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, 'trade_type'], [quantity_greater_than_100k]),
    # sudhar2_1m=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, 'trade_type'], [quantity_greater_than_1m]),
    # sudhar3=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [trade_type_is_interdealer]),
    # sudhar3_100k=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_100k, trade_type_is_interdealer]),
    # sudhar3_1m=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_1m, trade_type_is_interdealer]),
    # sudhar4=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, MUNI_SECURITY_TYPE_TOP_VALUES], []),
    # sudhar4_100k=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, MUNI_SECURITY_TYPE_TOP_VALUES], [quantity_greater_than_100k]),
    # sudhar4_1m=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, MUNI_SECURITY_TYPE_TOP_VALUES], [quantity_greater_than_1m]),
    # sudhar5=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], []),
    # sudhar5_100k=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_100k]),
    # sudhar5_1m=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_1m]),
    # sudhar6=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], []),
    # sudhar6_100k=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_100k]),
    # sudhar6_1m=([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_1m]),
    # mitas1=([TRADE_DATETIME_DAY, 'trade_type'], []),
    # mitas1_100k=([TRADE_DATETIME_DAY, 'trade_type'], [quantity_greater_than_100k]),
    # mitas1_1m=([TRADE_DATETIME_DAY, 'trade_type'], [quantity_greater_than_1m]),
    # desmond=(['incorporated_state_code', RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL, COUPON_CATEGORICAL, PURPOSE_CLASS_TOP_VALUES], []),
    # desmond_100k=(['incorporated_state_code', RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL, COUPON_CATEGORICAL, PURPOSE_CLASS_TOP_VALUES], [quantity_greater_than_100k]),
    # desmond_1m=(['incorporated_state_code', RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL, COUPON_CATEGORICAL, PURPOSE_CLASS_TOP_VALUES], [quantity_greater_than_1m]),
    # yellow=(['trade_type', DAYS_TO_MATURITY_CATEGORICAL, QUANTITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL], []),
    # yellow_100k=(['trade_type', DAYS_TO_MATURITY_CATEGORICAL, QUANTITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL], [quantity_greater_than_100k]),
    # yellow_1m=(['trade_type', DAYS_TO_MATURITY_CATEGORICAL, QUANTITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL], [quantity_greater_than_1m]),
    # yellow_lite=(['trade_type', DAYS_TO_MATURITY_CATEGORICAL], []),
    # yellow_lite_100k=(['trade_type', DAYS_TO_MATURITY_CATEGORICAL], [quantity_greater_than_100k]),
    # yellow_lite_1m=(['trade_type', DAYS_TO_MATURITY_CATEGORICAL], [quantity_greater_than_1m]),
)

In [None]:
# Single-feature and filter-only criteria, merged on top of the criteria already in `related_trades_criterion`.
# key: name of criteria, value: (categories to match, filtering conditions)
# Dictionary unpacking is used for the merge; an entry listed here replaces an existing entry with the same key.
related_trades_criterion = {**related_trades_criterion, 
                            # 'NONE': ([], []), 
                            # 'trade_type': (['trade_type'], []), 
                            # 'incorporated_state_code': (['incorporated_state_code'], []), 
                            # 'days_to_maturity_categorical': ([DAYS_TO_MATURITY_CATEGORICAL], []), 
                            # 'quantity_categorical': ([QUANTITY_CATEGORICAL], []), 
                            # 'coupon_categorical': ([COUPON_CATEGORICAL], []), 
                            # 'trade_datetime_day': ([TRADE_DATETIME_DAY], []), 
                            # 'rating_without_plus_minus_B_NR_combined': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED], []), 
                            # 'days_to_call': ([DAYS_TO_CALL_CATEGORICAL], []), 
                            # 'purpose_class_top_values': ([PURPOSE_CLASS_TOP_VALUES], []), 
                            # 'muni_security_type_top_values': ([MUNI_SECURITY_TYPE_TOP_VALUES], []), 
                            # '100k': ([], [quantity_greater_than_100k]), 
                            # '1m': ([], [quantity_greater_than_1m]), 
                            # 'dd': ([], [trade_type_is_interdealer]), 
                            # 'rating': (['rating'], []), 
                            # 'purpose_class': (['purpose_class'], []), 
                            # 'coupon_categorical_sudhar': ([COUPON_CATEGORICAL_SUDHAR], [])
                            }

Add related trades to the trade data.

In [None]:
%%time
# Frame used to look up related trades: label-encoded (except `trade_type`) when ENCODE_REFERENCE_FEATURES is set, otherwise the unencoded frame itself.
if ENCODE_REFERENCE_FEATURES:
    df_encoded = encode_with_label_encoders(trade_data_flattened_trade_history, features_to_exclude=['trade_type'])
else:
    df_encoded = trade_data_flattened_trade_history

In [None]:
# maps the name of each related trades criterion to the dataset with the related trades appended
trade_data_flattened_trade_history_and_related_trades = {}
# trade_data_flattened_trade_history = None    # uncomment this line when running LightGBM experiments for data files already created in order to reduce memory overhead

In [None]:
%%time
# Arguments to `append_recent_trade_data` that are identical for every criterion.
shared_append_kwargs = dict(feature_prefix=related_trade_feature_prefix, 
                            return_df=True, 
                            multiprocessing=True, 
                            df_for_related_trades=df_encoded)

# For every criterion: load the dataset from its pickle file when one exists; otherwise build it (unless it is already in memory) and save it.
for name, (categories_to_match, filtering_conditions) in tqdm(related_trades_criterion.items()):
    filename = f'trade_data_flattened_trade_history_and_related_trades_{name}'
    filepath = make_data_filename(filename)

    if os.path.exists(filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
        print(f'Loading dataset for {name} from pickle file {filepath}')
        # NOTE: unpickling executes arbitrary code, so only load files this notebook wrote itself
        trade_data_flattened_trade_history_and_related_trades[name] = pd.read_pickle(filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
        continue

    if name in trade_data_flattened_trade_history_and_related_trades:
        continue    # already built in this session

    print(f'Creating dataset for {name} and saving it to {filepath}')
    trade_data_flattened_trade_history_and_related_trades[name] = append_recent_trade_data(trade_data_flattened_trade_history, 
                                                                                           NUM_TRADES_IN_RELATED_TRADE_HISTORY, 
                                                                                           RELATED_TRADE_FEATURE_FUNCTIONS_AND_DEFAULT_VALUES, 
                                                                                           categories=categories_to_match, 
                                                                                           filtering_conditions=filtering_conditions, 
                                                                                           **shared_append_kwargs).drop(columns=quantized_features)    # drop the quantized features from the final dataframe
    trade_data_flattened_trade_history_and_related_trades[name].to_pickle(filepath, protocol=4)    # protocol 4 allows for use in the VM: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html

In [None]:
# Drop the quantized features from the final dataframe. `errors='ignore'` keeps this cell safe to re-run:
# on a second execution the columns are already gone and a plain `drop` would raise a KeyError.
trade_data_flattened_trade_history = trade_data_flattened_trade_history.drop(columns=quantized_features, errors='ignore')

Make sure that each group has a reasonable number of trades (otherwise finding related trades will be too difficult for certain trades). Note that if a group has a count of 1, then that trade has no previous related trades according to this definition of *related*. Further, note that if a group has a count of 2, then only one of those trades has a single past related trade, and the other one has none; the one with no previous related trade is the oldest trade in the group. We should loosen or tighten the definition of *related* so that almost all trades have at least one previous related trade.

In [None]:
FEATURE_TO_DETECT_NO_PAST_TRADES = 'seconds_ago'    # arbitrarily chosen, but needs to be a feature that does not naturally have occurrences of its default value
# For each dataset, count the trades whose related trade slot at a few selected positions still holds the default value,
# i.e., trades with fewer than (position + 1) past related trades.
for name, df in trade_data_flattened_trade_history_and_related_trades.items():
    print(f'{name}')
    num_total_trades = len(df)    # use the size of the dataset being inspected as the denominator instead of an unrelated global dataframe
    for past_trade_idx in (0, 1, 15, 31):    # range(2):
        if past_trade_idx >= NUM_TRADES_IN_RELATED_TRADE_HISTORY:
            continue    # this position does not exist in the related trade history
        feature_name = get_appended_feature_name(past_trade_idx, FEATURE_TO_DETECT_NO_PAST_TRADES, related_trade_feature_prefix)
        num_trades = (df[feature_name] == DEFAULT_VALUES[FEATURE_TO_DETECT_NO_PAST_TRADES]).sum()
        print(f'Number of trades with fewer than {past_trade_idx + 1} past related trades: {num_trades}. Percentage of total trades: {round(num_trades / num_total_trades * 100, 3)} %')

Decode the added reference features, if they are encoded.

In [None]:
%%time
# Undo the label encoding on the reference features that were appended for each related trade.
if ENCODE_REFERENCE_FEATURES:
    print('Decoding the reference features.')
    for df in tqdm(trade_data_flattened_trade_history_and_related_trades.values()):
        for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
            encoder = label_encoders[feature]
            related_feature_names = [get_appended_feature_name(past_trade_idx, feature, related_trade_feature_prefix) 
                                     for past_trade_idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY)]
            for feature_name in related_feature_names:
                encoded_values = df[feature_name].to_numpy(dtype=int)    # must set to dtype=int since label encoder encodes to integers
                df[feature_name] = encoder.inverse_transform(encoded_values)    # inverse transform the encoded categorical feature column
                df[feature_name] = df[feature_name].astype('category')    # change dtype to `categorical` to use in LightGBM model

Make sure that reference features have dtype categorical. 

In [None]:
# Cast every reference feature that is not yet categorical; columns that already are categorical are left untouched.
for df in tqdm(trade_data_flattened_trade_history_and_related_trades.values()):
    for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
        if df[feature].dtype.name == 'category':    # check dtype of a column: https://stackoverflow.com/questions/26924904/check-if-dataframe-column-is-categorical
            continue
        df[feature] = df[feature].astype('category')

## Choosing a related trades criteria

Choose a value for the number of trades in the same-CUSIP trade history that is large enough to capture the predictive power of additional past trades for the same CUSIP, yet small enough to keep experiments fast.

In [None]:
MAX_NUM_TRADES_IN_TRADE_HISTORY_OPT = 16    # upper limit on the same CUSIP history length used in these experiments
NUM_TRADES_IN_TRADE_HISTORY_OPT = min(NUM_TRADES_IN_TRADE_HISTORY, MAX_NUM_TRADES_IN_TRADE_HISTORY_OPT)
print(f'NUM_TRADES_IN_TRADE_HISTORY_OPT: {NUM_TRADES_IN_TRADE_HISTORY_OPT}')

Choose a value for the number of trades in the related trade history that is large enough to capture the predictive power of additional past related trades, yet small enough to keep experiments fast.

In [None]:
MAX_NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT = 32    # upper limit on the related trade history length used in these experiments
NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT = min(NUM_TRADES_IN_RELATED_TRADE_HISTORY, MAX_NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT)
print(f'NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT: {NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT}')

In [None]:
# keyword arguments shared by the same CUSIP call and the related trades call
shared_past_trade_columns_kwargs = dict(trade_type_actual=True, trade_type_column=TRADE_TYPE_NEW_COLUMN)

# columns (and the categorical subset) for the same CUSIP trade history
past_trades_columns_opt, all_categorical_features_in_trade_history = get_past_trade_columns(
    NUM_TRADES_IN_TRADE_HISTORY_OPT, 
    FEATURES_IN_HISTORY, 
    SAME_CUSIP_PREFIX, 
    categorical_features_per_trade=CATEGORICAL_FEATURES_IN_HISTORY, 
    **shared_past_trade_columns_kwargs)

# columns (and the categorical subset) for the related trade history; the added reference features are also treated as categorical
past_related_trades_columns_opt, all_categorical_features_in_trade_history_related = get_past_trade_columns(
    NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT, 
    FEATURES_IN_HISTORY, 
    related_trade_feature_prefix, 
    categorical_features_per_trade=CATEGORICAL_FEATURES_IN_HISTORY + CATEGORICAL_REFERENCE_FEATURES_TO_ADD, 
    **shared_past_trade_columns_kwargs)

In [None]:
# maps criterion name -> losses recorded for it; start empty and pick up the results of previous runs if a losses file exists
related_trades_criterion_losses = {}
related_trades_criterion_losses_filepath = make_data_filename('related_trades_criterion_losses')
if os.path.exists(related_trades_criterion_losses_filepath):
    print(f'Loading losses from {related_trades_criterion_losses_filepath}')
    # NOTE: unpickling executes arbitrary code, so only load files this notebook wrote itself
    with open(related_trades_criterion_losses_filepath, 'rb') as pickle_handle:    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
        related_trades_criterion_losses = pickle.load(pickle_handle)
    print(f'Already have loss results for: {list(related_trades_criterion_losses.keys())}')

Create column sets for different purposes.

The difference between `related_trades_features` and `past_related_trades_columns_opt` is that the latter has `related_last_trade_type` instead of `related_last_trade_type1` and `related_last_trade_type2`.

Difference between `past_trade_feature_groups_flattened` and `past_trades_columns_opt` is that the former has all the `<num>last_trade_type1` and `<num>last_trade_type2`.

In [None]:
# De-duplicate with `dict.fromkeys` instead of `set`: the iteration order of a set of strings changes between
# Python processes (hash randomization), which would make the column order, and therefore the LightGBM input, differ
# from one kernel restart to the next. `dict.fromkeys` keeps the first occurrence of each name in its original position.
target_trade_features = list(dict.fromkeys(PREDICTORS_WITHOUT_LAST_TRADE_FEATURES + CATEGORICAL_REFERENCE_FEATURES_TO_ADD)) + TARGET

# all columns needed to build the dataframe, including the columns only used for data processing
columns_to_select_to_create_dataframe = target_trade_features + past_trade_feature_groups_flattened + related_trades_features + DATA_PROCESSING_FEATURES
assert len(columns_to_select_to_create_dataframe) == len(set(columns_to_select_to_create_dataframe))    # checks that there are no intersection between the groups of features
# columns fed to the LightGBM model
columns_to_select_for_lightgbm_model = target_trade_features + past_trades_columns_opt + past_related_trades_columns_opt
assert len(columns_to_select_for_lightgbm_model) == len(set(columns_to_select_for_lightgbm_model))    # checks that there are no intersection between the groups of features

# same order-preserving de-duplication as above
target_trade_categorical_features = list(dict.fromkeys(CATEGORICAL_FEATURES + CATEGORICAL_REFERENCE_FEATURES_TO_ADD))

categorical_features_for_lightgbm_model = target_trade_categorical_features + all_categorical_features_in_trade_history + all_categorical_features_in_trade_history_related
assert len(categorical_features_for_lightgbm_model) == len(set(categorical_features_for_lightgbm_model))    # checks that there are no intersection between the groups of features

print(f'Features used for LightGBM model: {columns_to_select_for_lightgbm_model}')
print(f'Categorical features used for LightGBM model: {categorical_features_for_lightgbm_model}')

In [None]:
# Train one LightGBM model per related trades criterion that has no recorded losses yet, and persist the losses after every model.
for name, df in tqdm(trade_data_flattened_trade_history_and_related_trades.items()):
    if name in related_trades_criterion_losses:
        continue    # losses for this criterion were already computed

    # convert trade_type1 and trade_type2 to trade_type with S, P, D: first for same CUSIP trades, then for related trades
    trade_data_predictors_history_related_trades_actual_trade_type = df[columns_to_select_to_create_dataframe]
    for history_length, history_prefix in ((NUM_TRADES_IN_TRADE_HISTORY, SAME_CUSIP_PREFIX), 
                                           (NUM_TRADES_IN_RELATED_TRADE_HISTORY, related_trade_feature_prefix)):
        trade_data_predictors_history_related_trades_actual_trade_type, _, _ = convert_trade_type_encoding_to_actual(trade_data_predictors_history_related_trades_actual_trade_type, 
                                                                                                                     history_length, 
                                                                                                                     TRADE_TYPE_NEW_COLUMN, 
                                                                                                                     history_prefix)

    train_data_predictors_history_related_trades_actual_trade_type, \
        test_data_predictors_history_related_trades_actual_trade_type = get_train_test_data_trade_datetime(trade_data_predictors_history_related_trades_actual_trade_type, DATE_TO_SPLIT)
    # release the unsplit frame before training
    del trade_data_predictors_history_related_trades_actual_trade_type
    gc.collect()
    assert len(train_data_predictors_history_related_trades_actual_trade_type) != 0 and len(test_data_predictors_history_related_trades_actual_trade_type) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'

    print(f'Training the LightGBM model for {name}')
    _, lgb_losses = train_lightgbm_model(train_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model], 
                                         test_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model], 
                                         categorical_features_for_lightgbm_model, 
                                         wandb_project='mitas_trade_history')
    # store the first train loss and the first test loss
    related_trades_criterion_losses[name] = lgb_losses['Train'][0], lgb_losses['Test'][0]
    with open(related_trades_criterion_losses_filepath, 'wb') as pickle_handle:    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
        pickle.dump(related_trades_criterion_losses, pickle_handle, protocol=4)



Choose the best definition for related trades, where *best* refers to the definition with the lowest test error.

In [None]:
# report every recorded result, including results loaded from earlier runs
for name, (train_loss, test_loss) in related_trades_criterion_losses.items():
    print(f'{name}\t\tTrain error: {train_loss}\tTest error: {test_loss}')

# The losses file can contain results for criteria that are currently disabled. Their datasets are not loaded in this
# session, so selecting one of them would fail when the dataset is looked up. Only rank the currently enabled criteria.
candidate_losses = {name: losses for name, losses in related_trades_criterion_losses.items() if name in related_trades_criterion}
assert candidate_losses, 'No loss results are available for any of the currently enabled related trades criteria.'
related_trades_criterion_ascending_order_of_test_loss = sorted(candidate_losses, key=lambda name: candidate_losses[name][1])    # sort by minimum test error (which is represented by index 1)
related_trades_criterion_opt = related_trades_criterion_ascending_order_of_test_loss[0]    # optimal name is the one with the minimum test error

From here, we will only have one dataset; the one with the appended trades coming from `related_trades_criterion_opt`.

In [None]:
# From here on, `trade_data_flattened_trade_history_and_related_trades` is a single dataframe instead of a dictionary of dataframes.
# The guard keeps this cell safe to re-run: on a second execution the name already refers to the selected dataframe,
# and indexing it with the criterion name would raise a KeyError.
if isinstance(trade_data_flattened_trade_history_and_related_trades, dict):
    trade_data_flattened_trade_history_and_related_trades = trade_data_flattened_trade_history_and_related_trades[related_trades_criterion_opt]

# datasets that keep the original trade type columns
trade_data_predictors_history_related_trades = trade_data_flattened_trade_history_and_related_trades[columns_to_select_to_create_dataframe]
train_data_predictors_history_related_trades, test_data_predictors_history_related_trades = get_train_test_data_trade_datetime(trade_data_predictors_history_related_trades, DATE_TO_SPLIT)
train_data_predictors_history_related_trades = train_data_predictors_history_related_trades.drop(columns=DATA_PROCESSING_FEATURES)
test_data_predictors_history_related_trades = test_data_predictors_history_related_trades.drop(columns=DATA_PROCESSING_FEATURES)

# convert the trade type encoding to the actual trade type: first for the same CUSIP trades, then for the related trades
trade_data_predictors_history_related_trades_actual_trade_type, old_trade_type_columns, _ = convert_trade_type_encoding_to_actual(trade_data_predictors_history_related_trades, 
                                                                                                                                  NUM_TRADES_IN_TRADE_HISTORY, 
                                                                                                                                  TRADE_TYPE_NEW_COLUMN, 
                                                                                                                                  SAME_CUSIP_PREFIX)
trade_data_predictors_history_related_trades_actual_trade_type, old_trade_type_columns_related, _ = convert_trade_type_encoding_to_actual(trade_data_predictors_history_related_trades_actual_trade_type, 
                                                                                                                                          NUM_TRADES_IN_RELATED_TRADE_HISTORY, 
                                                                                                                                          TRADE_TYPE_NEW_COLUMN, 
                                                                                                                                          related_trade_feature_prefix)

# remove the columns that were replaced by the conversion, then split into train and test
trade_data_predictors_history_related_trades_actual_trade_type = trade_data_predictors_history_related_trades_actual_trade_type.drop(columns=old_trade_type_columns + old_trade_type_columns_related)
train_data_predictors_history_related_trades_actual_trade_type, \
    test_data_predictors_history_related_trades_actual_trade_type = get_train_test_data_trade_datetime(trade_data_predictors_history_related_trades_actual_trade_type, DATE_TO_SPLIT)

train_data_predictors_history_related_trades_actual_trade_type = train_data_predictors_history_related_trades_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)
test_data_predictors_history_related_trades_actual_trade_type = test_data_predictors_history_related_trades_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)

# the data processing columns are dropped from the unsplit frames only now because the train/test split above is given the frames that still contain them
trade_data_predictors_history_related_trades = trade_data_predictors_history_related_trades.drop(columns=DATA_PROCESSING_FEATURES)
trade_data_predictors_history_related_trades_actual_trade_type = trade_data_predictors_history_related_trades_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)

## Determining which trades benefit most from a single past related trade

Train the LightGBM model.

In [None]:
# restrict the train and test frames to the columns used by the LightGBM model
train_data_lightgbm, test_data_lightgbm = (
    split_data[columns_to_select_for_lightgbm_model]
    for split_data in (train_data_predictors_history_related_trades_actual_trade_type, 
                       test_data_predictors_history_related_trades_actual_trade_type)
)

In [None]:
# Train the model that uses the related trade information.
# Fixed the misspelled `train_data_lightbgm`, which raised a NameError; the training frame is `train_data_lightgbm`.
lgb_model, lgb_losses = train_lightgbm_model(train_data_lightgbm, 
                                             test_data_lightgbm, 
                                             categorical_features_for_lightgbm_model, 
                                             wandb_project='mitas_trade_history')

Train the LightGBM model without the related trades information.

In [None]:
# test frame without any of the related trade columns
test_data_lightgbm_wo_related_trades = test_data_lightgbm.drop(past_related_trades_columns_opt, axis='columns')

In [None]:
# The related trade columns are dropped from the data, so they must also be removed from the categorical feature list;
# LightGBM rejects categorical feature names that are not present in the dataset.
dropped_related_trades_columns = set(past_related_trades_columns_opt)
categorical_features_wo_related_trades = [feature for feature in categorical_features_for_lightgbm_model if feature not in dropped_related_trades_columns]

lgb_model_wo_related_trades, lgb_wo_related_trades_losses = train_lightgbm_model(train_data_lightgbm.drop(columns=past_related_trades_columns_opt), 
                                                                                 test_data_lightgbm_wo_related_trades, 
                                                                                 categorical_features_wo_related_trades, 
                                                                                 wandb_project='mitas_trade_history')

Create function to test different conditions on the data.

In [None]:
def compare_model_with_and_wo_related_trade_info(condition):
    """Print the test MAE of the two LightGBM models on the subset of test trades selected by `condition`.

    `condition` is used both as a mask on `test_data_lightgbm` / `test_data_lightgbm_wo_related_trades`
    and via `.sum()` to count the selected rows, so it is expected to be a boolean Series aligned with
    the test data. The models and test frames are read from the notebook's global scope.
    """
    num_selected = condition.sum()
    percent_selected = round(num_selected / len(test_data_lightgbm) * 100, 3)
    print(f'Number of trades in test data with condition: {num_selected} ({percent_selected} %)')

    # the first entry of the returned losses is reported as the MAE
    mae_with_related_trades = get_all_losses_for_single_dataset(lgb_model, test_data_lightgbm[condition], verbose=False)[0]
    print(f'MAE when using related trade info: {mae_with_related_trades}')

    mae_wo_related_trades = get_all_losses_for_single_dataset(lgb_model_wo_related_trades, test_data_lightgbm_wo_related_trades[condition], verbose=False)[0]
    print(f'MAE when not using related trade info: {mae_wo_related_trades}')

Get the MAE on the portion of the test dataset where the most recent same CUSIP trade was more than one week ago.

In [None]:
# Boolean mask over the test set: True where `last_seconds_ago` exceeds one week
# (7 days * 24 hours / day * 60 mins / hour * 60 sec / min).
one_week_ago = test_data_lightgbm['last_seconds_ago'].gt(7 * 24 * 60 * 60)

In [None]:
# Test trades where `last_seconds_ago` is more than one week.
compare_model_with_and_wo_related_trade_info(one_week_ago)

In [None]:
# Complement: test trades where `last_seconds_ago` is at most one week.
compare_model_with_and_wo_related_trade_info(~one_week_ago)

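Get the MAE on the portion of the test dataset where the rating of the trade is lower than BBB+. Note that the mask below selects every trade whose rating is not one of AAA through BBB+, so trades with a missing or otherwise unlisted rating are included as well.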

In [None]:
# Ratings of BBB+ and better; everything else (including values that are not in this list at all) counts as "low".
ratings_bbb_plus_or_better = ['AAA', 'AA+', 'AA', 'AA-', 'A+', 'A', 'A-', 'BBB+']
low_rating = ~test_data_lightgbm['rating'].isin(ratings_bbb_plus_or_better)

In [None]:
# Test trades whose rating is not in the AAA..BBB+ list.
compare_model_with_and_wo_related_trade_info(low_rating)

In [None]:
# Complement: test trades rated BBB+ or better.
compare_model_with_and_wo_related_trade_info(~low_rating)

## Varying the number of past related trades that the model uses

In [None]:
def train_lightgbm_model_history_and_reference_and_related_trades(num_past_related_trades):
    """Train a LightGBM model that uses the first `num_past_related_trades` past related trades.

    The model is trained on the target trade features, the selected same-CUSIP history columns
    (`past_trades_columns_opt`) and the related-trade columns for `num_past_related_trades` trades.
    The train / test frames are the notebook-level
    `train_data_predictors_history_related_trades_actual_trade_type` and
    `test_data_predictors_history_related_trades_actual_trade_type`.

    Returns whatever `train_lightgbm_model` returns (unpacked by callers as `(model, losses)`).
    """
    past_related_trades_columns, all_categorical_features_in_trade_history_related = get_past_trade_columns(num_past_related_trades, 
                                                                                                            FEATURES_IN_HISTORY, 
                                                                                                            related_trade_feature_prefix, 
                                                                                                            trade_type_actual=True, 
                                                                                                            trade_type_column=TRADE_TYPE_NEW_COLUMN, 
                                                                                                            categorical_features_per_trade=CATEGORICAL_FEATURES_IN_HISTORY + CATEGORICAL_REFERENCE_FEATURES_TO_ADD)
    columns_to_select = target_trade_features + past_trades_columns_opt + past_related_trades_columns
    # a duplicate column name means two feature groups overlap, which would select the same column twice
    assert len(columns_to_select) == len(set(columns_to_select)), 'The groups of features must not intersect'
    return train_lightgbm_model(train_data_predictors_history_related_trades_actual_trade_type[columns_to_select], 
                                test_data_predictors_history_related_trades_actual_trade_type[columns_to_select], 
                                target_trade_categorical_features + all_categorical_features_in_trade_history + all_categorical_features_in_trade_history_related, 
                                wandb_project='mitas_trade_history')

In [None]:
# Sweep over 0..NUM_TRADES_IN_RELATED_TRADE_HISTORY past related trades and train one model per setting.
# `related_lgb_models[k]` is therefore the model trained with k past related trades.
# NOTE: this loop rebinds the notebook-level `lgb_model` / `lgb_losses`; `compare_model_with_and_wo_related_trade_info`
# reads `lgb_model` at call time, so re-running a comparison cell after this one uses the last model of the sweep.
num_past_related_trades_in_history_candidates = list(range(NUM_TRADES_IN_RELATED_TRADE_HISTORY + 1))
related_lgb_models = []
train_l1_losses_related = []
test_l1_losses_related = []
for num_past_trades in num_past_related_trades_in_history_candidates:
    lgb_model, lgb_losses = train_lightgbm_model_history_and_reference_and_related_trades(num_past_trades)
    if num_past_trades == 32:
        # keep the 32-trade model around for the feature-importance plot
        lgb_model_opt_32, lgb_model_opt_32_losses = lgb_model, lgb_losses
    related_lgb_models.append(lgb_model)
    # index 0 indicates l1 loss
    train_l1_losses_related.append(lgb_losses['Train'][0])
    test_l1_losses_related.append(lgb_losses['Test'][0])

In [None]:
# Feature importance (by gain) of the model trained with 32 past related trades, when that many are available.
if NUM_TRADES_IN_RELATED_TRADE_HISTORY >= 32:
    if 'lgb_model_opt_32' not in locals():
        # if model not created, then train the lightgbm model
        lgb_model_opt_32, lgb_model_opt_32_losses = train_lightgbm_model_history_and_reference_and_related_trades(32)
    print(f'Train error: {lgb_model_opt_32_losses["Train"][0]}\tTest error: {lgb_model_opt_32_losses["Test"][0]}')
    lgb.plot_importance(lgb_model_opt_32, figsize=(20, 25), importance_type='gain')

In [None]:
# Train L1 loss as a function of the number of past related trades given to the model.
train_l1_losses_related_argmin = np.argmin(train_l1_losses_related)
print(f'Train error for 0 trades: {train_l1_losses_related[0]}')
print(f'Minimum value: {min(train_l1_losses_related)} for number of trades: {num_past_related_trades_in_history_candidates[train_l1_losses_related_argmin]}')
plt.xlabel('Number of related trades in history')
plt.ylabel('Train L1 losses')
plt.plot(num_past_related_trades_in_history_candidates, train_l1_losses_related, 'o')

In [None]:
# Test L1 loss as a function of the number of past related trades given to the model.
test_l1_losses_related_argmin = np.argmin(test_l1_losses_related)
print(f'Test error for 0 trades: {test_l1_losses_related[0]}')
print(f'Minimum value: {min(test_l1_losses_related)} for number of trades: {num_past_related_trades_in_history_candidates[test_l1_losses_related_argmin]}')
plt.xlabel('Number of related trades in history')
plt.ylabel('Test L1 losses')
plt.plot(num_past_related_trades_in_history_candidates, test_l1_losses_related, 'o')

Split the universe into two categories: (1) those with the most recent past related trade occurring on the same day, and (2) those with no past related trades occurring on the same day. 

**Hypothesis**: target trades whose most recent past related trade occurs on the same day have the lowest test error with just a single past related trade, whereas target trades whose most recent past related trade occurs on a previous day (or further back) have decreasing test error as the number of past related trades is increased.

**Conclusion**: the hypothesis could not be verified since there are so few trades in the second group; worth revisiting this experiment at another time if the definition of *related* changes to be something more restrictive.

In [None]:
# Index labels of the trades whose `same_day` flag (for the related-trade prefix) is 1.0 and 0.0 respectively.
# Rows where the flag is neither value (e.g. missing) end up in neither group.
same_day_flag = trade_data_flattened_trade_history_and_related_trades[related_trade_feature_prefix + 'same_day']
indices_last_related_trade_same_day = trade_data_flattened_trade_history_and_related_trades.index[(same_day_flag == 1.0).to_numpy()]
indices_last_related_trade_diff_day = trade_data_flattened_trade_history_and_related_trades.index[(same_day_flag == 0.0).to_numpy()]

In [None]:
# Split the test set into the "same day" and "different day" groups by intersecting index labels.
# index intersection: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Index.intersection.html
test_indices = test_data_predictors_history_related_trades_actual_trade_type.index
test_indices_same_day = test_indices.intersection(indices_last_related_trade_same_day)
test_indices_diff_day = test_indices.intersection(indices_last_related_trade_diff_day)
test_data_predictors_history_related_trades_actual_trade_type_last_related_trade_same_day = test_data_predictors_history_related_trades_actual_trade_type.loc[test_indices_same_day]
test_data_predictors_history_related_trades_actual_trade_type_last_related_trade_diff_day = test_data_predictors_history_related_trades_actual_trade_type.loc[test_indices_diff_day]
print(f'Number of trades in test dataset. Same day: {len(test_data_predictors_history_related_trades_actual_trade_type_last_related_trade_same_day)}\tDifferent day: {len(test_data_predictors_history_related_trades_actual_trade_type_last_related_trade_diff_day)}')

In [None]:
def get_last_related_trade_losses(dataset):
    """Evaluate every model in `related_lgb_models` on `dataset` and return the list of their L1 losses.

    The position of a model in `related_lgb_models` is used as the number of past related trades it was
    trained with, which determines the columns selected from `dataset` for that model.
    The i-th entry of the returned list is the first loss reported by
    `get_all_losses_for_single_dataset` for the i-th model.
    """
    losses = []
    # wrap the list (not the enumerate generator) so that tqdm knows the total and can show a real progress bar
    for num_past_trades, model in enumerate(tqdm(related_lgb_models)):
        past_related_trades_columns, _ = get_past_trade_columns(num_past_trades, 
                                                                FEATURES_IN_HISTORY, 
                                                                related_trade_feature_prefix, 
                                                                trade_type_actual=True, 
                                                                trade_type_column=TRADE_TYPE_NEW_COLUMN)
        columns_to_select = target_trade_features + past_trades_columns_opt + past_related_trades_columns
        losses.append(get_all_losses_for_single_dataset(model, dataset[columns_to_select], verbose=False)[0])
    return losses

In [None]:
# Per-model test losses (one entry per model in `related_lgb_models`) for each of the two groups of test trades.
test_l1_losses_last_related_trade_same_day = get_last_related_trade_losses(test_data_predictors_history_related_trades_actual_trade_type_last_related_trade_same_day)
test_l1_losses_last_related_trade_diff_day = get_last_related_trade_losses(test_data_predictors_history_related_trades_actual_trade_type_last_related_trade_diff_day)

In [None]:
# Test L1 loss vs. number of past related trades, restricted to the "same day" group.
same_day_argmin = np.argmin(test_l1_losses_last_related_trade_same_day)
print(f'Test error for 0 trades: {test_l1_losses_last_related_trade_same_day[0]}')
print(f'Minimum value: {min(test_l1_losses_last_related_trade_same_day)} for number of trades: {num_past_related_trades_in_history_candidates[same_day_argmin]}')
plt.xlabel('Number of related trades in history')
plt.ylabel('Test L1 losses')
plt.plot(num_past_related_trades_in_history_candidates, test_l1_losses_last_related_trade_same_day, 'o')

In [None]:
# Test L1 loss vs. number of past related trades, restricted to the "different day" group.
diff_day_argmin = np.argmin(test_l1_losses_last_related_trade_diff_day)
print(f'Test error for 0 trades: {test_l1_losses_last_related_trade_diff_day[0]}')
print(f'Minimum value: {min(test_l1_losses_last_related_trade_diff_day)} for number of trades: {num_past_related_trades_in_history_candidates[diff_day_argmin]}')
plt.xlabel('Number of related trades in history')
plt.ylabel('Test L1 losses')
plt.plot(num_past_related_trades_in_history_candidates, test_l1_losses_last_related_trade_diff_day, 'o')

# Embedding reference data in the related past trades

In [None]:
# Encode the reference-only train / test frames with `encode_with_label_encoders`;
# the encoded frames are the inputs used to train the per-feature embeddings below.
train_data_only_reference_encoded = encode_with_label_encoders(train_data_only_reference)
test_data_only_reference_encoded = encode_with_label_encoders(test_data_only_reference)

In [None]:
# Base name used for the embedding models' filenames and W&B logging name.
embeddings_name = f'embeddings_{NUM_EPOCHS}_epochs'

In [None]:
# feature name -> embedding weight matrix (numpy array); filled from the cache file or by training in the next cell
embeddings_arrays = dict()

In [None]:
# Load the per-feature embedding matrices from the cache file if it exists; otherwise train one single-feature
# embedding model per categorical reference feature and cache the resulting weight matrices.
embeddings_arrays_filepath = make_data_filename('embeddings_arrays')
if os.path.exists(embeddings_arrays_filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
    print(f'Loading embeddings_arrays from {embeddings_arrays_filepath}')
    # NOTE: pickle executes arbitrary code on load; only load cache files written by this notebook
    with open(embeddings_arrays_filepath, 'rb') as pickle_handle: embeddings_arrays = pickle.load(pickle_handle)    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
else:
    for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
        if feature not in embeddings_arrays:    # skip features already embedded (e.g. when this cell is re-run after an interruption)
            print(f'Creating embeddings for feature: {feature}')
            model, _ = train(NNL1LossEmbeddings(BATCH_SIZE, 
                                                NUM_WORKERS, 
                                                train_data_only_reference_encoded[[feature] + TARGET],    # just the current feature and labels
                                                test_data_only_reference_encoded[[feature] + TARGET],    # just the current feature and labels
                                                label_encoders, 
                                                [feature], 
                                                power=EMBEDDINGS_POWER), 
                             NUM_EPOCHS, 
                             model_filename=make_filename(f'{embeddings_name}_{feature}'), 
                             save=False, 
                             print_losses_before_training=False,    # setting this to True may cause the kernel to crash
                             print_losses_after_training=False,    # setting this to True may cause the kernel to crash
                             wandb_logging_name=embeddings_name, 
                             wandb_project='mitas_trade_history')
            embedding = list(model.embeddings)[0]    # get the embedding from the model; since there is only one feature, we select it
            # embedding is a matrix where each row corresponds to a different possible value; `.cpu()` is a no-op on CPU
            # and is required before `.numpy()` if the weights live on an accelerator; convert the tensor to numpy: https://stackoverflow.com/questions/49768306/pytorch-tensor-to-numpy-array
            embeddings_arrays[feature] = embedding.weight.detach().cpu().numpy()
    with open(embeddings_arrays_filepath, 'wb') as pickle_handle: pickle.dump(embeddings_arrays, pickle_handle, protocol=4)    # protocol 4 allows for use in the VM; use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
print(f'Keys in embeddings_arrays: {list(embeddings_arrays.keys())}')

# RNN (LSTM)

In [None]:
# Hyperparameters handed to `RecurrentL1Loss` / `MultipleRecurrentL1Loss` by the caller functions in this section.
RECURRENT_ARCHITECTURE = 'lstm'
NUM_RECURRENT_LAYERS = 3
RECURRENT_HIDDEN_SIZE = 64

Create column name for trade history of related trades.

In [None]:
# Single-element list (so it can be concatenated with other column lists such as TRADE_HISTORY and TARGET).
TRADE_HISTORY_RELATED = ['trade_history_related']

In [None]:
# Name encoding the RNN hyperparameters; used for model filenames and as the W&B logging name.
rnn_name = f'{RECURRENT_ARCHITECTURE}_{NUM_RECURRENT_LAYERS}_layers_{RECURRENT_HIDDEN_SIZE}_hidden_size_{NUM_EPOCHS}_epochs'

Combine trades from the same CUSIP trade history with the related CUSIP trade history. 

In [None]:
# Column name (as a single-element list) for the merged same-CUSIP + related trade history.
COMBINED_TRADE_HISTORY = ['combined_trade_history']

The function below is used to combine the `TRADE_HISTORY` and `TRADE_HISTORY_RELATED` into a single sequence of trades sorted by trade_datetime.

In [None]:
def combine_two_histories_sorted_by_seconds_ago_caller(data):
    """Call `combine_two_histories_sorted_by_seconds_ago` on `data` with this notebook's column names.

    The two history columns are `TRADE_HISTORY` and `TRADE_HISTORY_RELATED`, and
    `COMBINED_TRADE_HISTORY[0]` is passed as the name for the combined history.
    """
    history_columns = TRADE_HISTORY + TRADE_HISTORY_RELATED
    combined_column_name = COMBINED_TRADE_HISTORY[0]
    return combine_two_histories_sorted_by_seconds_ago(data, history_columns, combined_column_name, FEATURES_TO_INDEX_IN_HISTORY)

The functions below are training functions which serve as *callers* for (1) when the data has no related trades, (2) when the model uses the same RNN to process both the same CUSIP past trades and the related past trades as a single sequence, and (3) when the model uses different RNNs to process the same CUSIP past trades and the related past trades, respectively.

In [None]:
def no_related_trades_caller(train_data_both_histories, test_data_both_histories):
    """Train a `RecurrentL1Loss` model on the same-CUSIP trade history only.

    Only the `TRADE_HISTORY` and `TARGET` columns of the given frames are passed to the model,
    so the related-trade history is ignored. Returns the result of `train(...)`.
    """
    columns_for_model = TRADE_HISTORY + TARGET    # just trade history from same cusip and labels
    recurrent_model = RecurrentL1Loss(BATCH_SIZE,
                                      NUM_WORKERS,
                                      train_data_both_histories[columns_for_model],
                                      test_data_both_histories[columns_for_model],
                                      NUM_RECURRENT_LAYERS,
                                      RECURRENT_HIDDEN_SIZE,
                                      RECURRENT_ARCHITECTURE)
    return train(recurrent_model,
                 NUM_EPOCHS,
                 model_filename=make_filename(rnn_name),
                 save=False,
                 print_losses_before_training=False,    # setting this to True may cause the kernel to crash
                 print_losses_after_training=False,    # setting this to True may cause the kernel to crash
                 wandb_logging_name=rnn_name,
                 wandb_project='mitas_trade_history')

In [None]:
def same_rnn_caller(train_data_both_histories, test_data_both_histories, print_losses_after_training=False):
    """Train a single `RecurrentL1Loss` model on the combined (same-CUSIP + related) trade history.

    Both frames are first passed through `combine_two_histories_sorted_by_seconds_ago_caller`, and only
    the `COMBINED_TRADE_HISTORY` and `TARGET` columns of the result are given to the model.
    `print_losses_after_training` is forwarded to `train(...)`. Returns the result of `train(...)`.
    """
    columns_for_model = COMBINED_TRADE_HISTORY + TARGET    # just combined trade history, and labels
    train_data_combined = combine_two_histories_sorted_by_seconds_ago_caller(train_data_both_histories)[columns_for_model]
    test_data_combined = combine_two_histories_sorted_by_seconds_ago_caller(test_data_both_histories)[columns_for_model]
    recurrent_model = RecurrentL1Loss(BATCH_SIZE,
                                      NUM_WORKERS,
                                      train_data_combined,
                                      test_data_combined,
                                      NUM_RECURRENT_LAYERS,
                                      RECURRENT_HIDDEN_SIZE,
                                      RECURRENT_ARCHITECTURE,
                                      COMBINED_TRADE_HISTORY)
    return train(recurrent_model,
                 NUM_EPOCHS,
                 model_filename=make_filename(rnn_name),
                 save=False,
                 print_losses_before_training=False,    # setting this to True may cause the kernel to crash
                 print_losses_after_training=print_losses_after_training,    # setting this to True may cause the kernel to crash
                 wandb_logging_name=rnn_name,
                 wandb_project='mitas_trade_history')

In [None]:
def different_rnn_caller(train_data_both_histories, test_data_both_histories):
    """Train a `MultipleRecurrentL1Loss` model that receives the two histories as separate columns.

    The model is given the `TRADE_HISTORY`, `TRADE_HISTORY_RELATED` and `TARGET` columns, and the two
    history column names as its last argument. Returns the result of `train(...)`.
    """
    history_columns = TRADE_HISTORY + TRADE_HISTORY_RELATED
    columns_for_model = history_columns + TARGET
    multiple_recurrent_model = MultipleRecurrentL1Loss(BATCH_SIZE,
                                                       NUM_WORKERS,
                                                       train_data_both_histories[columns_for_model],
                                                       test_data_both_histories[columns_for_model],
                                                       NUM_RECURRENT_LAYERS,
                                                       RECURRENT_HIDDEN_SIZE,
                                                       RECURRENT_ARCHITECTURE,
                                                       history_columns)
    # NOTE(review): the model file is prefixed with 'multiple_' but the W&B logging name is the plain `rnn_name`,
    # i.e. the same name the single-RNN callers log under -- confirm this is intended.
    return train(multiple_recurrent_model,
                 NUM_EPOCHS,
                 model_filename=make_filename('multiple_' + rnn_name),
                 save=False,
                 print_losses_before_training=False,    # setting this to True may cause the kernel to crash
                 print_losses_after_training=False,    # setting this to True may cause the kernel to crash
                 wandb_logging_name=rnn_name,
                 wandb_project='mitas_trade_history')

## Representing reference data in trade history
It is not obvious how we add the reference data to the related history, since the data must be numerical in order to use it in the RNN. More thoughts: https://docs.google.com/document/d/12TqR2Axt1u0J4qECpGcgZZRiY1L6Q0z2jGkKm_gRAoU/edit?usp=sharing.

In [None]:
%%time
# Trade histories without any reference data: the same-CUSIP history is taken as-is from `trade_data`, and the
# related-trade history is built from the related-trade features that exclude the reference features.
no_reference_data_filepath = make_data_filename('no_reference_data')
if os.path.exists(no_reference_data_filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
    print(f'Loading dataset from pickle file {no_reference_data_filepath}')
    no_reference_data = pd.read_pickle(no_reference_data_filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
else:
    no_reference_data = pd.DataFrame(index=trade_data.index)    # preserving the original index https://stackoverflow.com/questions/18176933/create-an-empty-data-frame-with-index-from-another-data-frame
    no_reference_data[TRADE_HISTORY[0]] = trade_data[TRADE_HISTORY[0]]
    no_reference_data[TRADE_HISTORY_RELATED[0]] = feature_group_as_single_feature(trade_data_flattened_trade_history_and_related_trades, related_trades_features_wo_reference_features, NUM_TRADES_IN_RELATED_TRADE_HISTORY)
    # only write the cache when it was just computed; re-writing a frame that was just loaded is wasted I/O
    no_reference_data.to_pickle(no_reference_data_filepath, protocol=4)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html

Reference data represented as encoded features. First, the features are encoded. Second, the reference data is added to same CUSIP trade history. Obviously, this creates a lot of duplicate features, but needs to be done in order to interleave the same CUSIP trade history with the related trades trade history. Finally, the trade history columns are created.

In [None]:
%%time
# Trade histories with label-encoded reference data. Three artifacts are produced together and cached:
# the encoded trade data, the flattened frame with encoded related-trade reference features, and the
# resulting two-history frame.
encoded_reference_data_filepath = make_data_filename('encoded_reference_data')
trade_data_encoded_filepath = make_data_filename('trade_data_encoded')
trade_data_flattened_trade_history_and_related_trades_encoded_filepath = make_data_filename('trade_data_flattened_trade_history_and_related_trades_encoded')
# the cache is only usable when every one of the three files is present; otherwise recompute all of them
encoded_cache_filepaths = [encoded_reference_data_filepath, 
                           trade_data_encoded_filepath, 
                           trade_data_flattened_trade_history_and_related_trades_encoded_filepath]
if all(os.path.exists(filepath) for filepath in encoded_cache_filepaths):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
    print(f'Loading dataset from pickle file {encoded_reference_data_filepath}')
    encoded_reference_data = pd.read_pickle(encoded_reference_data_filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
    trade_data_encoded = pd.read_pickle(trade_data_encoded_filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
    trade_data_flattened_trade_history_and_related_trades_encoded = pd.read_pickle(trade_data_flattened_trade_history_and_related_trades_encoded_filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
else:
    trade_data_encoded = encode_with_label_encoders(trade_data)

    encoded_reference_data = pd.DataFrame(index=trade_data.index)    # preserving the original index https://stackoverflow.com/questions/18176933/create-an-empty-data-frame-with-index-from-another-data-frame
    encoded_reference_data[TRADE_HISTORY[0]] = add_reference_data_to_trade_history(trade_data_encoded, CATEGORICAL_REFERENCE_FEATURES_TO_ADD, TRADE_HISTORY)

    print('Encoding the reference features')
    trade_data_flattened_trade_history_and_related_trades_encoded = trade_data_flattened_trade_history_and_related_trades.copy()
    for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
        encoder = label_encoders[feature]
        for past_trade_idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY):
            feature_name = get_appended_feature_name(past_trade_idx, feature, related_trade_feature_prefix)
            trade_data_flattened_trade_history_and_related_trades_encoded[feature_name] = encoder.transform(trade_data_flattened_trade_history_and_related_trades[feature_name])    # transform the categorical feature column to its encoding

    encoded_reference_data[TRADE_HISTORY_RELATED[0]] = feature_group_as_single_feature(trade_data_flattened_trade_history_and_related_trades_encoded, related_trades_features, NUM_TRADES_IN_RELATED_TRADE_HISTORY)

    # only write the caches when they were just computed; re-writing frames that were just loaded is wasted I/O
    trade_data_encoded.to_pickle(trade_data_encoded_filepath, protocol=4)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
    trade_data_flattened_trade_history_and_related_trades_encoded.to_pickle(trade_data_flattened_trade_history_and_related_trades_encoded_filepath, protocol=4)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
    encoded_reference_data.to_pickle(encoded_reference_data_filepath, protocol=4)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html

Reference data represented as one-hot encoded features.

In [None]:
# One-hot encode `trade_data` and keep the per-feature encoders; `one_hot_encoders[feature]` is reused in the next
# cell to transform the related-trade reference columns.
trade_data_one_hot_encoded, one_hot_encoders = encode_and_get_encoders(trade_data, BINARY, CATEGORICAL_FEATURES, 'one_hot')

In [None]:
# Trade histories with one-hot encoded reference data, loaded from the cache file when it exists.
one_hot_encoded_reference_data_filepath = make_data_filename('one_hot_encoded_reference_data')
if os.path.exists(one_hot_encoded_reference_data_filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
    print(f'Loading dataset from pickle file {one_hot_encoded_reference_data_filepath}')
    one_hot_encoded_reference_data = pd.read_pickle(one_hot_encoded_reference_data_filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
else:
    one_hot_encoded_reference_data = pd.DataFrame(index=trade_data.index)    # preserving the original index https://stackoverflow.com/questions/18176933/create-an-empty-data-frame-with-index-from-another-data-frame
    one_hot_encoded_reference_data[TRADE_HISTORY[0]] = add_reference_data_to_trade_history(trade_data_one_hot_encoded, CATEGORICAL_REFERENCE_FEATURES_TO_ADD, TRADE_HISTORY)

    print('One hot encoding the reference features')
    trade_data_flattened_trade_history_and_related_trades_one_hot_encoded = trade_data_flattened_trade_history_and_related_trades.copy()
    for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
        encoder = one_hot_encoders[feature]
        for past_trade_idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY):
            feature_name = get_appended_feature_name(past_trade_idx, feature, related_trade_feature_prefix)
            transformed_values = encoder.transform(trade_data_flattened_trade_history_and_related_trades_one_hot_encoded[feature_name].to_numpy(dtype=str).reshape(-1, 1))    # need to reshape to avoid this error: `ValueError: Expected 2D array, got 1D array instead...Reshape your data either using array.reshape(-1, 1) if your data has a single feature`; casting to `str` to avoid TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
            trade_data_flattened_trade_history_and_related_trades_one_hot_encoded[feature_name] = list(transformed_values)    # transform the categorical feature column to its encoding

    one_hot_encoded_reference_data[TRADE_HISTORY_RELATED[0]] = feature_group_as_single_feature(trade_data_flattened_trade_history_and_related_trades_one_hot_encoded, related_trades_features, NUM_TRADES_IN_RELATED_TRADE_HISTORY, flatten_each_row=True, multiprocessing=True)
    # only write the cache when it was just computed; re-writing a frame that was just loaded is wasted I/O
    one_hot_encoded_reference_data.to_pickle(one_hot_encoded_reference_data_filepath, protocol=4)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html

Reference data represented as embedded features. Note: embedding occurs on encoded feature values.

In [None]:
def embed_with_embeddings_arrays(df):
    """Apply `embed_with_arrays` to `df` using the notebook-level `embeddings_arrays`."""
    return embed_with_arrays(df, embeddings_arrays)

In [None]:
# Trade histories with embedded reference data, loaded from the cache file when it exists.
# The embedding is applied on top of the label-encoded values computed in the encoded-reference-data cell.
embedded_reference_data_filepath = make_data_filename('embedded_reference_data')
if os.path.exists(embedded_reference_data_filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
    print(f'Loading dataset from pickle file {embedded_reference_data_filepath}')
    embedded_reference_data = pd.read_pickle(embedded_reference_data_filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
else:
    trade_data_embedded = embed_with_embeddings_arrays(trade_data_encoded)    # embedding occurs on the encoded feature values

    embedded_reference_data = pd.DataFrame(index=trade_data.index)    # preserving the original index https://stackoverflow.com/questions/18176933/create-an-empty-data-frame-with-index-from-another-data-frame
    embedded_reference_data[TRADE_HISTORY[0]] = add_reference_data_to_trade_history(trade_data_embedded, CATEGORICAL_REFERENCE_FEATURES_TO_ADD, TRADE_HISTORY, True)

    print('Embedding the encoded reference features')
    trade_data_flattened_trade_history_and_related_trades_embedded = trade_data_flattened_trade_history_and_related_trades_encoded.copy()
    for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
        embeddings_array = embeddings_arrays[feature]
        # the lookup only depends on the feature, so build it once instead of once per past trade
        # (presumably maps each encoded value to its row of the embedding matrix -- see `list_to_index_dict`)
        encoding_to_embedding = list_to_index_dict(embeddings_array)
        for past_trade_idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY):
            feature_name = get_appended_feature_name(past_trade_idx, feature, related_trade_feature_prefix)
            trade_data_flattened_trade_history_and_related_trades_embedded[feature_name] = trade_data_flattened_trade_history_and_related_trades_encoded[feature_name].map(encoding_to_embedding)    # `.map(...)` is the fastest way to do this: https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict-preserve-nans

    embedded_reference_data[TRADE_HISTORY_RELATED[0]] = feature_group_as_single_feature(trade_data_flattened_trade_history_and_related_trades_embedded, related_trades_features, NUM_TRADES_IN_RELATED_TRADE_HISTORY, flatten_each_row=True, multiprocessing=True)
    # only write the cache when it was just computed; re-writing a frame that was just loaded is wasted I/O
    embedded_reference_data.to_pickle(embedded_reference_data_filepath, protocol=4)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html

Determine the best reference data representation.

In [None]:
# Candidate representations of the reference data, keyed by the name under which their test loss is recorded.
# The one-hot representation is left out of the comparison (entry kept commented out so it can be re-enabled).
trade_history_data = {'no_reference_data': no_reference_data, 
                      'encoded_reference_data': encoded_reference_data, 
                    #   'one_hot_encoded_reference_data': one_hot_encoded_reference_data, 
                      'embedded_reference_data': embedded_reference_data
                      }

In [None]:
# representation name -> test L1 loss; filled from the cache file or by training in the experiment cell below
reference_data_representation_losses = dict()

Choosing a small number of past trades to make experiments fast.

In [None]:
def limit_history_8_same_cusip_16_related_caller(trade_data):
    """Call `limit_history_to_k_trades` with 8 trades for the same-CUSIP history and 16 for the related history."""
    num_trades_per_history = {TRADE_HISTORY[0]: 8, TRADE_HISTORY_RELATED[0]: 16}
    return limit_history_to_k_trades(trade_data, num_trades_per_history)

In [None]:
%%time
# Train the single-RNN model once per reference-data representation and record the test L1 loss.
# Losses are checkpointed to disk after every representation. A checkpoint that exists but is incomplete
# (e.g. after an interrupted run) is loaded and then completed, instead of being mistaken for a finished run.
reference_data_representation_losses_filepath = make_data_filename('reference_data_representation_losses')
if os.path.exists(reference_data_representation_losses_filepath):
    print(f'Loading losses from {reference_data_representation_losses_filepath}')
    # NOTE: pickle executes arbitrary code on load; only load cache files written by this notebook
    with open(reference_data_representation_losses_filepath, 'rb') as pickle_handle: reference_data_representation_losses = pickle.load(pickle_handle)    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object

representations_without_losses = [name for name in trade_history_data if name not in reference_data_representation_losses]
if representations_without_losses:
    train_indices, test_indices = train_data_only_reference.index, test_data_only_reference.index

    for name in representations_without_losses:
        trade_data_both_histories = trade_history_data[name]
        trade_data_both_histories[TARGET[0]] = trade_data[TARGET[0]]    # note: adds the target column to the frame stored in `trade_history_data` in place
        train_data_both_histories, test_data_both_histories = get_train_test_data_index(trade_data_both_histories, train_indices, test_indices)
        assert len(train_data_both_histories) != 0 and len(test_data_both_histories) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'
        train_data_both_histories = reverse_order_of_trade_history(train_data_both_histories, TRADE_HISTORY + TRADE_HISTORY_RELATED)
        test_data_both_histories = reverse_order_of_trade_history(test_data_both_histories, TRADE_HISTORY + TRADE_HISTORY_RELATED)
        # the losses are only returned when `print_losses_after_training` is True, hence the positional True
        # (presumably -- see `train`); only the test L1 loss is kept
        _, (_, _, (test_l1_loss, _)) = same_rnn_caller(limit_history_8_same_cusip_16_related_caller(train_data_both_histories), 
                                                    limit_history_8_same_cusip_16_related_caller(test_data_both_histories), 
                                                    True)
        reference_data_representation_losses[name] = test_l1_loss
        # checkpoint after every representation so that an interrupted run does not lose finished work
        with open(reference_data_representation_losses_filepath, 'wb') as pickle_handle: pickle.dump(reference_data_representation_losses, pickle_handle, protocol=4)    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object

In [None]:
# Report the test loss recorded for every reference-data representation.
for name, test_loss in reference_data_representation_losses.items():
    print(f'{name}\t\tTest error: {test_loss}')
# Sort the representation names by test error, ascending. The sort key looks each name up in
# `reference_data_representation_losses` itself, so the ordering reflects the losses printed above.
reference_data_representation_losses_ascending_order = sorted(reference_data_representation_losses, key=reference_data_representation_losses.get)    # sort by minimum test error
reference_data_representation_opt = reference_data_representation_losses_ascending_order[0]    # optimal name is the one with the minimum test error

**Conclusions** 

The following parameter choices were used for the LSTM: NUM_RECURRENT_LAYERS=3, RECURRENT_HIDDEN_SIZE=64

|  | No reference data | Reference data encoded | Reference data embedded |
| --- | --- | --- | --- |
| Train loss | 7.520 | 7.527 | 7.511 |
| Test loss | 8.157 | 8.135 | 8.158 |

- There is almost no difference between the three settings
- The lack of difference is surprising, and may suggest that (1) better feature engineering is required, such as differences between the target trade and the related trade, and (2) overly complex representations of the reference data may be unnecessary

Modify the trade data dataframe to keep only the data from the best representation.

In [None]:
# Pin the chosen representation explicitly; this replaces whatever value `reference_data_representation_opt` held before this cell.
# NOTE(review): presumably picked from the test losses in the table above — confirm it still matches after re-running the experiments.
reference_data_representation_opt = 'encoded_reference_data'

In [None]:
# Overwrite both trade history columns of `trade_data` with those of the chosen representation.
# The columns are selected by name on the right-hand side because the experiment cell above may have added the
# target column to the frames stored in `trade_history_data`; without the selection, the number of columns could
# differ from the number of keys being assigned.
trade_data[TRADE_HISTORY + TRADE_HISTORY_RELATED] = trade_history_data[reference_data_representation_opt][TRADE_HISTORY + TRADE_HISTORY_RELATED]

Check that trade history columns have only numerical values.

In [None]:
%%time
# Sanity check: each trade history column must hold only numerical values.
for trade_history_column in TRADE_HISTORY + TRADE_HISTORY_RELATED:
    # `np.stack(...)` converts the numpy array from a numpy array of numpy arrays to a single 3d numpy array, so that a single dtype describes the whole column
    trade_history_column_dtype = np.stack(trade_data[trade_history_column].to_numpy()).dtype
    # asserts that the dtype of the trade history array is a numerical type (the failure message reports the offending dtype): https://stackoverflow.com/questions/29518923/numpy-asarray-how-to-check-up-that-its-result-dtype-is-numeric
    assert np.issubdtype(trade_history_column_dtype, np.number), f'trade history column dtype: {trade_history_column_dtype}'

Reverse the trade history to have it in the correct order to be passed into the RNN.

In [None]:
# Split `trade_data` into train and test sets using `DATE_TO_SPLIT`.
train_data_reference_and_both_histories, test_data_reference_and_both_histories = get_train_test_data_trade_datetime(trade_data, DATE_TO_SPLIT)
assert len(train_data_reference_and_both_histories) != 0 and len(test_data_reference_and_both_histories) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'
# Reverse both trade history columns of each split (see the markdown above: this is the order expected by the RNN).
train_data_reference_and_both_histories = reverse_order_of_trade_history(train_data_reference_and_both_histories, TRADE_HISTORY + TRADE_HISTORY_RELATED)
test_data_reference_and_both_histories = reverse_order_of_trade_history(test_data_reference_and_both_histories, TRADE_HISTORY + TRADE_HISTORY_RELATED)
# Drop the data-processing and identifier columns from both splits.
train_data_reference_and_both_histories = train_data_reference_and_both_histories.drop(columns=DATA_PROCESSING_FEATURES + IDENTIFIERS)
test_data_reference_and_both_histories = test_data_reference_and_both_histories.drop(columns=DATA_PROCESSING_FEATURES + IDENTIFIERS)
# History-only views: just the two trade history columns plus the target (no reference data).
train_data_both_histories = train_data_reference_and_both_histories[TRADE_HISTORY + TRADE_HISTORY_RELATED + TARGET]
test_data_both_histories = test_data_reference_and_both_histories[TRADE_HISTORY + TRADE_HISTORY_RELATED + TARGET]

## 5 previous trades same CUSIP (baseline)

### Without `settlement_date_to_calc_date` and `quantity_diff` features in `trade_history`
This is the original baseline that we are working to beat.

In [None]:
# Limit the `TRADE_HISTORY[0]` column of the train and test splits to 5 trades.
train_data_both_histories_samecusip5, test_data_both_histories_samecusip5 = (
    limit_history_to_k_trades(split_data, {TRADE_HISTORY[0]: 5})
    for split_data in (train_data_both_histories, test_data_both_histories)
)

In [None]:
def remove_new_features_from_trade_history_caller(data, trade_history_columns):
    """Call `remove_feature_from_trade_history` to remove the `settlement_date_to_calc_date` and `quantity_diff` features from `trade_history_columns` of `data`."""
    features_to_remove = ['settlement_date_to_calc_date', 'quantity_diff']
    return remove_feature_from_trade_history(data, trade_history_columns, features_to_remove, FEATURES_TO_INDEX_IN_HISTORY)

In [None]:
%%time
# Original baseline: strip the `settlement_date_to_calc_date` and `quantity_diff` features from the `TRADE_HISTORY` columns of both splits (5 same CUSIP trades) before passing them to `no_related_trades_caller`.
no_related_trades_caller(remove_new_features_from_trade_history_caller(train_data_both_histories_samecusip5, TRADE_HISTORY), 
                         remove_new_features_from_trade_history_caller(test_data_both_histories_samecusip5, TRADE_HISTORY))

### With `settlement_date_to_calc_date` and `quantity_diff` features in `trade_history`

In [None]:
%%time
# Same 5-trade splits as the baseline above, but with all features left in the trade history.
no_related_trades_caller(train_data_both_histories_samecusip5, test_data_both_histories_samecusip5)

## 16 previous trades same CUSIP

In [None]:
# Limit the `TRADE_HISTORY[0]` column of the train and test splits to 16 trades.
train_data_both_histories_samecusip16, test_data_both_histories_samecusip16 = (
    limit_history_to_k_trades(split_data, {TRADE_HISTORY[0]: 16})
    for split_data in (train_data_both_histories, test_data_both_histories)
)

### No related trades

In [None]:
%%time
# 16 same CUSIP trades, passed to the helper for the no-related-trades setting.
no_related_trades_caller(train_data_both_histories_samecusip16, test_data_both_histories_samecusip16)

### 32 related trades (same RNN)

In [None]:
# Starting from the 16-trade same CUSIP splits, additionally limit the `TRADE_HISTORY_RELATED[0]` column to 32 trades.
train_data_both_histories_samecusip16_related32, test_data_both_histories_samecusip16_related32 = (
    limit_history_to_k_trades(split_data, {TRADE_HISTORY_RELATED[0]: 32})
    for split_data in (train_data_both_histories_samecusip16, test_data_both_histories_samecusip16)
)

In [None]:
%%time
# 16 same CUSIP trades and 32 related trades, passed to the same-RNN helper.
same_rnn_caller(train_data_both_histories_samecusip16_related32, test_data_both_histories_samecusip16_related32)

### 32 related trades (different RNNs)

In [None]:
%%time
# 16 same CUSIP trades and 32 related trades, passed to the different-RNN helper.
different_rnn_caller(train_data_both_histories_samecusip16_related32, test_data_both_histories_samecusip16_related32)

### 64 related trades (same RNN)

In [None]:
%%time
# no limit was applied to the related trade history of these splits, so this assumes that the number of related trades is 64 by default
same_rnn_caller(train_data_both_histories_samecusip16, test_data_both_histories_samecusip16)

### 64 related trades (different RNNs)

In [None]:
%%time
# no limit was applied to the related trade history of these splits, so this assumes that the number of related trades is 64 by default
different_rnn_caller(train_data_both_histories_samecusip16, test_data_both_histories_samecusip16)

## 32 previous trades same CUSIP

### No related trades

In [None]:
%%time
# the unlimited splits are used here, so this assumes that the number of same cusip past trades is 32 by default
no_related_trades_caller(train_data_both_histories, test_data_both_histories)

### 32 related trades (same RNN)

In [None]:
# Limit only the `TRADE_HISTORY_RELATED[0]` column to 32 trades; the same CUSIP history is left untouched.
train_data_both_histories_related32, test_data_both_histories_related32 = (
    limit_history_to_k_trades(split_data, {TRADE_HISTORY_RELATED[0]: 32})
    for split_data in (train_data_both_histories, test_data_both_histories)
)

In [None]:
%%time
# only the related trade history was limited (to 32), so this assumes that the number of same cusip past trades is 32 by default
same_rnn_caller(train_data_both_histories_related32, test_data_both_histories_related32)

### 32 related trades (different RNN)

In [None]:
%%time
# only the related trade history was limited (to 32), so this assumes that the number of same cusip past trades is 32 by default
different_rnn_caller(train_data_both_histories_related32, test_data_both_histories_related32)

### 64 related trades (same RNN)

In [None]:
%%time
# neither history is limited here: assumes that the number of same cusip past trades is 32 by default and the number of related trades is 64 by default
same_rnn_caller(train_data_both_histories, test_data_both_histories)

### 64 related trades (different RNN)

In [None]:
%%time
# neither history is limited here: assumes that the number of same cusip past trades is 32 by default and the number of related trades is 64 by default
different_rnn_caller(train_data_both_histories, test_data_both_histories)

The below table shows the results of different experiments above, where each cell has two values: (1) training loss, (2) test loss.

|  | No related trades | 32 related trades (same RNN) | 64 related trades (same RNN) | 32 related trades (different RNN) | 64 related trades (different RNN) |
| --- | --- | --- | --- | --- | --- |
| **5 same CUSIP trades** | 7.78582, 8.76154 | TODO | TODO | TODO | TODO |
| **16 same CUSIP trades** | 7.30065, 8.65277 | 7.44454, 8.49455 | 7.62937, 8.48275 | 7.13232, 8.65048 | 7.3365, 8.64571 |
| **32 same CUSIP trades** | 7.18217, 8.55768 | 6.92592, 8.40626 | 7.4078, 8.47732 | 7.14426, 8.56969 | 6.77395, 8.57151 |

**Conclusions**
- increasing the number of same CUSIP trades increases accuracy (on average about 0.1 bps improvement)
- the same-RNN architecture performs better than the different-RNN architecture (on average about 0.15 bps lower test error); the higher test error combined with the generally lower training error of the different-RNN architecture suggests that it is overfitting
- difference between 32 related trades and 64 related trades is negligible (all differences within 0.07 bps)

From here on, we use `NUM_TRADES_IN_TRADE_HISTORY_OPT` and `NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT` to see the impact of having longer histories, but also to make experiments fast.

In [None]:
def limit_history_to_opt_trades_caller(data):
    """Limit both trade histories of `data` to the chosen ("opt") numbers of trades.

    The `TRADE_HISTORY[0]` column is limited to `NUM_TRADES_IN_TRADE_HISTORY_OPT` trades and the
    `TRADE_HISTORY_RELATED[0]` column to `NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT` trades, as described
    in the markdown above. Returns whatever `limit_history_to_k_trades` returns.
    """
    # the related history uses the `_OPT` constant as well, rather than the full related history length
    return limit_history_to_k_trades(data, {TRADE_HISTORY[0]: NUM_TRADES_IN_TRADE_HISTORY_OPT, 
                                            TRADE_HISTORY_RELATED[0]: NUM_TRADES_IN_RELATED_TRADE_HISTORY_OPT})

# Different padding schemes

In [None]:
# Load the nonzero-padding version of the processed dataset.
# NOTE(review): the path is hard-coded relative to the notebook's working directory — adjust it if the data lives elsewhere.
processed_file_nonzero_padding_pickle = '../../../../ficc/ml_models/sequence_predictors/data/processed_data_ficc_ycl_long_history_nonzero_padding_2021-12-31-23-59.pkl'
processed_file_nonzero_padding_pickle_datestring = get_datestring_from_filename(processed_file_nonzero_padding_pickle)
# the datestring taken from this filename must equal `processed_file_pickle_datestring`, otherwise the two padding schemes would be compared on different files
assert processed_file_pickle_datestring == processed_file_nonzero_padding_pickle_datestring, 'Different files are being used'
trade_data_nonzero_padding = read_processed_file_pickle(processed_file_nonzero_padding_pickle)

Apply exclusions (identical to first few cells of notebook).

In [None]:
# Apply the exclusions to the nonzero-padding dataset; each statement keeps only the rows that satisfy its condition.
# NOTE(review): the day thresholds are compared against `np.log10(...)` values, so these columns are presumably stored on a log10 scale — TODO confirm.
# NOTE: this cell rebinds the global lists `ADDITIONAL_CATEGORICAL_FEATURES`, `BINARY`, `CATEGORICAL_FEATURES`, `NON_CAT_FEATURES` and `PREDICTORS`.
trade_data_nonzero_padding = trade_data_nonzero_padding[(trade_data_nonzero_padding.days_to_call == 0) | (trade_data_nonzero_padding.days_to_call > np.log10(400))]
trade_data_nonzero_padding = trade_data_nonzero_padding[(trade_data_nonzero_padding.days_to_refund == 0) | (trade_data_nonzero_padding.days_to_refund > np.log10(400))]
trade_data_nonzero_padding = trade_data_nonzero_padding[trade_data_nonzero_padding.days_to_maturity < np.log10(30000)]
trade_data_nonzero_padding = trade_data_nonzero_padding[trade_data_nonzero_padding.sinking == False]
trade_data_nonzero_padding = trade_data_nonzero_padding[trade_data_nonzero_padding.incorporated_state_code != 'VI']
trade_data_nonzero_padding = trade_data_nonzero_padding[trade_data_nonzero_padding.incorporated_state_code != 'GU']
trade_data_nonzero_padding = trade_data_nonzero_padding[(trade_data_nonzero_padding.coupon_type == 8)]
trade_data_nonzero_padding = trade_data_nonzero_padding[trade_data_nonzero_padding.is_called == False]

# restructured bonds and high chance of default bonds are removed
trade_data_nonzero_padding = remove_rows_with_feature_value(trade_data_nonzero_padding, 'purpose_sub_class', [6, 20, 21, 22, 44, 57, 90, 106])
# pre-refunded bonds and partially refunded bonds are removed
trade_data_nonzero_padding = remove_rows_with_feature_value(trade_data_nonzero_padding, 'called_redemption_type', [18, 19])


trade_data_nonzero_padding = replace_rating_with_standalone_rating(trade_data_nonzero_padding)


ADDITIONAL_CATEGORICAL_FEATURES = check_additional_features(trade_data_nonzero_padding, ADDITIONAL_CATEGORICAL_FEATURES)

# restrict each feature list to the columns that are actually present in the dataframe
all_features_set = set(trade_data_nonzero_padding.columns)
BINARY = list(set(BINARY).intersection(all_features_set))
CATEGORICAL_FEATURES = list((set(CATEGORICAL_FEATURES) | set(ADDITIONAL_CATEGORICAL_FEATURES)).intersection(all_features_set))
NON_CAT_FEATURES = list(set(NON_CAT_FEATURES).intersection(all_features_set))

# keep only the identifier, feature, data-processing, trade history and target columns
trade_data_nonzero_padding = trade_data_nonzero_padding[IDENTIFIERS + 
                        BINARY + 
                        CATEGORICAL_FEATURES + 
                        NON_CAT_FEATURES + 
                        DATA_PROCESSING_FEATURES + 
                        TRADE_HISTORY + 
                        TARGET]

trade_data_nonzero_padding, _ = replace_nan_for_features(trade_data_nonzero_padding, FEATURES_AND_NAN_REPLACEMENT_VALUES, verbose=True)
trade_data_nonzero_padding = remove_fields_with_single_unique_value(trade_data_nonzero_padding)

# the previous step may have removed columns, so the feature lists are intersected with the remaining columns once more
all_features_set = set(trade_data_nonzero_padding.columns)
BINARY = list(set(BINARY).intersection(all_features_set))
CATEGORICAL_FEATURES = list(set(CATEGORICAL_FEATURES).intersection(all_features_set))
NON_CAT_FEATURES = list(set(NON_CAT_FEATURES).intersection(all_features_set))
PREDICTORS = BINARY + CATEGORICAL_FEATURES + NON_CAT_FEATURES

trade_data_nonzero_padding = remove_rows_with_nan_value(trade_data_nonzero_padding)


# sort by trade_datetime (most recent first) since order can be changed when reading pickle file into m1 since it loads by chunks
trade_data_nonzero_padding = trade_data_nonzero_padding.sort_values(by='trade_datetime', ascending=False)

Append related trades.

In [None]:
%%time 
# Create derived categorical features on `trade_data_nonzero_padding`: binned, top-value and string-valued versions of existing columns.
RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED = 'rating_without_+-_b_nr_combined'
trade_data_nonzero_padding[RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED] = trade_data_nonzero_padding['rating'].transform(lambda rating: str.rstrip(rating, '+-'))    # remove + and - from right side of string
# group BBB, BB, B, and NR together since each have a very small number of trades
b_ratings = trade_data_nonzero_padding[RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED].isin(['B', 'BB', 'BBB', 'NR'])
trade_data_nonzero_padding.loc[b_ratings, RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED] = 'B'
print(f'Created {RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED} feature')

DAYS_TO_MATURITY_CATEGORICAL = 'days_to_maturity_categorical'
trade_data_nonzero_padding[DAYS_TO_MATURITY_CATEGORICAL] = pd.cut(trade_data_nonzero_padding['days_to_maturity'], num_of_days_bins_maturity).astype('string')
print(f'Created {DAYS_TO_MATURITY_CATEGORICAL} feature')

DAYS_TO_CALL_CATEGORICAL = 'days_to_call_categorical'
trade_data_nonzero_padding[DAYS_TO_CALL_CATEGORICAL] = pd.cut(trade_data_nonzero_padding['days_to_call'], num_of_days_bins_call).astype('string')
print(f'Created {DAYS_TO_CALL_CATEGORICAL} feature')

COUPON_CATEGORICAL = 'coupon_categorical'
coupon_bins = [0, 3, 4, 4.5, 5.0 + epsilon, VERY_LARGE_NUMBER]   # 0 - 2.99, 3 - 3.99, 4 - 4.49, 4.5 - 5; from discussion with a team member
trade_data_nonzero_padding[COUPON_CATEGORICAL] = pd.cut(trade_data_nonzero_padding['coupon'], coupon_bins, right=False).astype('string')    # `right=False` makes each bin include its left edge and exclude its right edge
print(f'Created {COUPON_CATEGORICAL} feature')

COUPON_CATEGORICAL_SUDHAR = 'coupon_categorical_sudhar'
coupon_bins = [0, 3, 4, 4.5, 5, 5.25, 5.5, 6, VERY_LARGE_NUMBER]    # from Sudhar's paper: Kolm, Purushothaman. 2021. Systematic Pricing and Trading of Municipal Bonds
trade_data_nonzero_padding[COUPON_CATEGORICAL_SUDHAR] = pd.cut(trade_data_nonzero_padding['coupon'], coupon_bins, right=False).astype('string')
print(f'Created {COUPON_CATEGORICAL_SUDHAR} feature')

# COUPON_TOP_VALUES = 'coupon_top_values'
# trade_data_nonzero_padding[COUPON_TOP_VALUES] = trade_data_nonzero_padding['coupon']
# top4_coupon_values = trade_data_nonzero_padding['coupon'].value_counts().head(4).index.tolist()    # select the top 4 coupon values based on frequency in the data, which are: 5.0, 4.0, 3.0, 2.0 comprising about 90% of the data
# trade_data_nonzero_padding.loc[~trade_data_nonzero_padding['coupon'].isin(top4_coupon_values), COUPON_TOP_VALUES] = -1    # arbitrary numerical value that is invalid as a coupon value
# print(f'Created {COUPON_TOP_VALUES} feature')

PURPOSE_CLASS_TOP_VALUES = 'purpose_class_top_values'
trade_data_nonzero_padding[PURPOSE_CLASS_TOP_VALUES] = trade_data_nonzero_padding['purpose_class']
trade_data_nonzero_padding.loc[~trade_data_nonzero_padding['purpose_class'].isin(top6_purpose_class_values), PURPOSE_CLASS_TOP_VALUES] = -1    # arbitrary numerical value that is invalid as a purpose_class value
print(f'Created {PURPOSE_CLASS_TOP_VALUES} feature')

MUNI_SECURITY_TYPE_TOP_VALUES = 'muni_security_type_top_values'
trade_data_nonzero_padding[MUNI_SECURITY_TYPE_TOP_VALUES] = trade_data_nonzero_padding['muni_security_type']
trade_data_nonzero_padding.loc[~trade_data_nonzero_padding['muni_security_type'].isin(top6_muni_security_type_values), MUNI_SECURITY_TYPE_TOP_VALUES] = -1    # arbitrary numerical value that is invalid as a muni_security_type value
print(f'Created {MUNI_SECURITY_TYPE_TOP_VALUES} feature')

TRADE_DATETIME_DAY = 'trade_datetime_day'
# NOTE: the lambda parameter is named `datetime`, which shadows the imported `datetime` class inside the lambda only
trade_data_nonzero_padding[TRADE_DATETIME_DAY] = trade_data_nonzero_padding['trade_datetime'].transform(lambda datetime: datetime.date()).astype('string')    # remove timestamp from datetime
print(f'Created {TRADE_DATETIME_DAY} feature')

QUANTITY_CATEGORICAL = 'quantity_categorical'
trade_data_nonzero_padding[QUANTITY_CATEGORICAL] = pd.cut(trade_data_nonzero_padding['quantity'], quantity_bins).astype('string')
print(f'Created {QUANTITY_CATEGORICAL} feature')

In [None]:
# Dictionary layout: feature name -> (function, default value). The function takes the current trade and the related trade;
# the default value (looked up in `DEFAULT_VALUES_NONZERO_PADDING`) is filled in when that value does not exist.
RELATED_TRADE_FEATURE_FUNCTIONS_AND_NONZERO_PADDING_DEFAULT_VALUES = dict(
    (feature_name, (feature_function, DEFAULT_VALUES_NONZERO_PADDING[feature_name]))
    for feature_name, feature_function in RELATED_TRADE_FEATURE_FUNCTIONS.items()
)

In [None]:
%%time
# Build the nonzero-padding dataset with related-trade features appended, or load it from the cached pickle if one exists.
categories_to_match_opt, filtering_conditions_opt = related_trades_criterion[related_trades_criterion_opt]
# NOTE(review): the dataframe passed as `df_for_related_trades` is built from `trade_data_flattened_trade_history`, not from `trade_data_nonzero_padding` — confirm that this is intended.
# NOTE(review): `df_encoded` is computed even when the cached pickle below is loaded, in which case this cell does not use it.
df_encoded = encode_with_label_encoders(trade_data_flattened_trade_history, features_to_exclude=['trade_type']) if ENCODE_REFERENCE_FEATURES else trade_data_flattened_trade_history
filepath = make_data_filename('trade_data_nonzero_padding_flattened_trade_history_and_related_trades')
if os.path.exists(filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
    print(f'Loading dataset from pickle file {filepath}')
    trade_data_nonzero_padding_related_trades = pd.read_pickle(filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
else:
    print(f'Creating dataset and saving it to {filepath}')
    trade_data_nonzero_padding_related_trades = append_recent_trade_data(trade_data_nonzero_padding, 
                                                                         NUM_TRADES_IN_RELATED_TRADE_HISTORY, 
                                                                         RELATED_TRADE_FEATURE_FUNCTIONS_AND_NONZERO_PADDING_DEFAULT_VALUES, 
                                                                         feature_prefix=related_trade_feature_prefix, 
                                                                         categories=categories_to_match_opt, 
                                                                         filtering_conditions=filtering_conditions_opt, 
                                                                         return_df=True, 
                                                                         multiprocessing=True, 
                                                                         df_for_related_trades=df_encoded).drop(columns=quantized_features)    # drop the quantized features from the final dataframe
    trade_data_nonzero_padding_related_trades.to_pickle(filepath, protocol=4)    # protocol 4 allows for use in the VM: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
# NOTE(review): this drop is not idempotent — re-running the cell presumably raises a KeyError because the columns are already gone; TODO confirm.
trade_data_nonzero_padding = trade_data_nonzero_padding.drop(columns=quantized_features)    # drop the quantized features from the final dataframe

Prep dataframe for training same RNN model.

In [None]:
# Replace the `TRADE_HISTORY[0]` column with the result of `add_reference_data_to_trade_history`, computed on the label-encoded dataframe.
# NOTE(review): the cell overwrites its own input column, so re-running it presumably adds the reference data a second time — TODO confirm.
trade_data_nonzero_padding[TRADE_HISTORY[0]] = add_reference_data_to_trade_history(encode_with_label_encoders(trade_data_nonzero_padding), 
                                                                                   CATEGORICAL_REFERENCE_FEATURES_TO_ADD, 
                                                                                   TRADE_HISTORY)

In [None]:
# Store the related-trade features as a single `TRADE_HISTORY_RELATED[0]` column, built by `feature_group_as_single_feature`.
trade_data_nonzero_padding[TRADE_HISTORY_RELATED[0]] = feature_group_as_single_feature(trade_data_nonzero_padding_related_trades, 
                                                                                       related_trades_features, 
                                                                                       NUM_TRADES_IN_RELATED_TRADE_HISTORY)

# Split into train and test sets using `DATE_TO_SPLIT` (the same steps as for the zero-padding data earlier in the notebook).
train_data_nonzero_padding_reference_and_both_histories, \
    test_data_nonzero_padding_reference_and_both_histories = get_train_test_data_trade_datetime(trade_data_nonzero_padding, DATE_TO_SPLIT)
assert len(train_data_nonzero_padding_reference_and_both_histories) != 0 and len(test_data_nonzero_padding_reference_and_both_histories) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'
# Reverse both trade history columns of each split.
train_data_nonzero_padding_reference_and_both_histories = reverse_order_of_trade_history(train_data_nonzero_padding_reference_and_both_histories, 
                                                                                         TRADE_HISTORY + TRADE_HISTORY_RELATED)
test_data_nonzero_padding_reference_and_both_histories = reverse_order_of_trade_history(test_data_nonzero_padding_reference_and_both_histories, 
                                                                                        TRADE_HISTORY + TRADE_HISTORY_RELATED)
# Drop the data-processing and identifier columns from both splits.
train_data_nonzero_padding_reference_and_both_histories = train_data_nonzero_padding_reference_and_both_histories.drop(columns=DATA_PROCESSING_FEATURES + IDENTIFIERS)
test_data_nonzero_padding_reference_and_both_histories = test_data_nonzero_padding_reference_and_both_histories.drop(columns=DATA_PROCESSING_FEATURES + IDENTIFIERS)
# History-only views: just the two trade history columns plus the target.
train_data_nonzero_padding_both_histories = train_data_nonzero_padding_reference_and_both_histories[TRADE_HISTORY + TRADE_HISTORY_RELATED + TARGET]
test_data_nonzero_padding_both_histories = test_data_nonzero_padding_reference_and_both_histories[TRADE_HISTORY + TRADE_HISTORY_RELATED + TARGET]

Train same RNN model over different seeds to reduce the effect of randomness.

In [None]:
# Train the same-RNN model with zero padding and with nonzero padding, once per seed, and record each test L1 loss.
# Finished seeds are cached in a pickle file so that an interrupted run resumes with the first missing seed.
NUM_SEEDS = 5
losses_dict_filepath = make_data_filename('padding_losses_dict')
losses_dict = {'zero_padding': [], 'nonzero_padding': []}

if os.path.exists(losses_dict_filepath):
    print(f'Loading losses from {losses_dict_filepath}')
    # NOTE: unpickling can execute arbitrary code; only load cache files that were written by this notebook
    with open(losses_dict_filepath, 'rb') as pickle_handle:    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
        losses_dict = pickle.load(pickle_handle)

# these names refer to the lists stored inside `losses_dict`, so appending to them also updates the dict that gets pickled
test_l1_losses, test_l1_losses_nonzero_padding = losses_dict['zero_padding'], losses_dict['nonzero_padding']
assert len(test_l1_losses) == len(test_l1_losses_nonzero_padding), 'The cached zero padding and nonzero padding losses cover a different number of seeds'

if len(test_l1_losses) < NUM_SEEDS:
    train_data_both_histories_8_16 = limit_history_8_same_cusip_16_related_caller(train_data_both_histories)
    test_data_both_histories_8_16 = limit_history_8_same_cusip_16_related_caller(test_data_both_histories)
    train_data_nonzero_padding_both_histories_8_16 = limit_history_8_same_cusip_16_related_caller(train_data_nonzero_padding_both_histories)
    test_data_nonzero_padding_both_histories_8_16 = limit_history_8_same_cusip_16_related_caller(test_data_nonzero_padding_both_histories)
    for seed in range(len(test_l1_losses), NUM_SEEDS):    # start at the first seed that has no cached result
        # NOTE(review): the seed is set once per iteration, so the nonzero padding run starts from the random state left behind by the zero padding run
        seed_everything(seed, workers=True)

        _, test_l1_loss = same_rnn_caller(train_data_both_histories_8_16, test_data_both_histories_8_16)
        test_l1_losses.append(test_l1_loss.item())    # get value from a single element tensor: https://stackoverflow.com/questions/57727372/how-do-i-get-the-value-of-a-tensor-in-pytorch

        _, test_l1_loss = same_rnn_caller(train_data_nonzero_padding_both_histories_8_16, test_data_nonzero_padding_both_histories_8_16)
        test_l1_losses_nonzero_padding.append(test_l1_loss.item())    # get value from a single element tensor: https://stackoverflow.com/questions/57727372/how-do-i-get-the-value-of-a-tensor-in-pytorch
        # save only after both runs of a seed have finished so that the two cached lists always have the same length
        with open(losses_dict_filepath, 'wb') as pickle_handle:    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
            pickle.dump(losses_dict, pickle_handle, protocol=4)

In [None]:
# Report the average test loss per padding scheme. Each label names the list that is actually printed, and each
# average divides by the number of recorded losses so that it stays correct if a cached file holds a different number of seeds than `NUM_SEEDS`.
print(f'Average loss (zero padding): {sum(test_l1_losses) / len(test_l1_losses)}\t\tAll losses: {test_l1_losses}')
print(f'Average loss (nonzero padding): {sum(test_l1_losses_nonzero_padding) / len(test_l1_losses_nonzero_padding)}\t\tAll losses: {test_l1_losses_nonzero_padding}')

Reset seed back to default `SEED` value.

In [None]:
# The loop above reseeded with 0..NUM_SEEDS-1; restore the notebook-wide `SEED` for the experiments that follow.
seed_everything(SEED, workers=True)

**Conclusions** 

To make experiments faster, the following parameter choices were used for the LSTM: NUM_RECURRENT_LAYERS=2, RECURRENT_HIDDEN_SIZE=16

|  | Seed 0 | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Average |
| --- | --- | --- | --- | --- | --- | --- |
| Zero padding | 8.63817024230957 | 8.734763145446777 | 8.767729759216309 | 8.626927375793457 | 8.581864356994629 | 8.669890975952148 |
| Nonzero padding | 8.457047462463379 | 8.560364723205566 | 8.584809303283691 | 8.534771919250488 | 8.473348617553711 | 8.522068405151368 |

- Nonzero padding outperforms zero padding by 0.15 basis points on average

# Feedforward Network for Reference Data with RNNs for Trade History

Try different models while incorporating the reference data:
- Model (a): feedforward network with just reference data
- Model (b): feedforward network with reference data and RNN on same cusip trade history
- Model (c): feedforward network with reference data and RNN on same cusip trade history and reference data for last related trade
- Model (d): feedforward network with reference data and reference data for last related trade and interleaved RNN on same cusip and related trade histories
- Model (e): feedforward network with reference data and reference data for last related trade and different RNN on same cusip and related trade histories

In [None]:
%%time
# Limit both trade histories of the train and test splits via `limit_history_to_opt_trades_caller`.
# NOTE: both names are overwritten with the limited data, so the unlimited splits are no longer available under these names after this cell.
train_data_reference_and_both_histories = limit_history_to_opt_trades_caller(train_data_reference_and_both_histories)
test_data_reference_and_both_histories = limit_history_to_opt_trades_caller(test_data_reference_and_both_histories)

In [None]:
%%time
# Create label-encoded copies of the limited train and test splits via `encode_with_label_encoders`.
train_data_reference_and_both_histories_encoded = encode_with_label_encoders(train_data_reference_and_both_histories)
test_data_reference_and_both_histories_encoded = encode_with_label_encoders(test_data_reference_and_both_histories)

In [None]:
# Start with no recorded losses; if an earlier run left a cache file behind, continue from the results stored in it.
feedforward_losses = {}
feedforward_losses_filepath = make_data_filename('feedforward_losses')
if os.path.exists(feedforward_losses_filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
    print(f'Loading results so far from pickle file {feedforward_losses_filepath}')
    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
    with open(feedforward_losses_filepath, 'rb') as pickle_handle:
        feedforward_losses = pickle.load(pickle_handle)

In [None]:
def train_and_store_model_loss(model, experiment_name):
    '''Train `model` for `NUM_EPOCHS` via `train`, record the returned test loss in the global 
    `feedforward_losses` under `experiment_name`, and re-pickle the whole dictionary to 
    `feedforward_losses_filepath`. The global `nn_name` is appended to `experiment_name` to form 
    the name used for the saved model file and for `wandb_logging_name`.'''
    model_name = '_'.join((experiment_name, nn_name))
    # `train` returns a pair of which only the second item (the test loss) is kept
    _, test_loss = train(
        model,
        NUM_EPOCHS,
        model_filename=make_filename(model_name),
        save=True,
        print_losses_before_training=False,    # True may crash the kernel
        print_losses_after_training=False,    # True may crash the kernel
        wandb_logging_name=model_name,
    )
    feedforward_losses[experiment_name] = test_loss
    # persist after every experiment; protocol 4 keeps the file readable in the VM
    with open(feedforward_losses_filepath, 'wb') as pickle_handle:
        pickle.dump(feedforward_losses, pickle_handle, protocol=4)

In [None]:
%%time
# Model (a): feedforward network on the reference data only. Skipped when a test loss for this 
# experiment name is already stored in `feedforward_losses`.
experiment_name = 'reference_data'
if experiment_name not in feedforward_losses:
    model = NNL1LossEmbeddings(
        BATCH_SIZE,
        NUM_WORKERS,
        train_data_only_reference_encoded,
        test_data_only_reference_encoded,
        label_encoders,
        CATEGORICAL_FEATURES,
        NUM_NODES_HIDDEN_LAYER,
        NUM_HIDDEN_LAYERS,
        power=EMBEDDINGS_POWER,
    )
    train_and_store_model_loss(model, experiment_name)

In [None]:
%%time
# Model (b): feedforward network on the reference data plus a recurrent network on the trade 
# history. Skipped when a test loss for this experiment name is already stored.
experiment_name = 'reference_data_same_cusip_rnn'
if experiment_name not in feedforward_losses:
    model = NNL1LossEmbeddingsWithRecurrence(
        BATCH_SIZE,
        NUM_WORKERS,
        train_data_with_trade_history_encoded,
        test_data_with_trade_history_encoded,
        label_encoders,
        CATEGORICAL_FEATURES,
        NUM_NODES_HIDDEN_LAYER,
        NUM_HIDDEN_LAYERS,
        NUM_RECURRENT_LAYERS,
        RECURRENT_HIDDEN_SIZE,
        recurrent_architecture=RECURRENT_ARCHITECTURE,
        power=EMBEDDINGS_POWER,
    )
    train_and_store_model_loss(model, experiment_name)

## Using a single past related trade

In [None]:
%%time
# Build (or restore from disk) four encoded datasets in which the features of a single trade from 
# the related trade history are added as reference features:
#   *_both_histories_*      keeps both trade history columns
#   *_same_cusip_history_*  is the same dataset without the related trade history column
train_data_reference_both_histories_single_related_trade_encoded_filepath = make_data_filename('train_data_reference_both_histories_single_related_trade_encoded')
test_data_reference_both_histories_single_related_trade_encoded_filepath = make_data_filename('test_data_reference_both_histories_single_related_trade_encoded')
train_data_reference_same_cusip_history_single_related_trade_encoded_filepath = make_data_filename('train_data_reference_same_cusip_history_single_related_trade_encoded')
test_data_reference_same_cusip_history_single_related_trade_encoded_filepath = make_data_filename('test_data_reference_same_cusip_history_single_related_trade_encoded')

# Use the cache only when ALL four files are present; checking a single file would raise a 
# FileNotFoundError on a partially written or partially deleted cache instead of rebuilding it.
if all(map(os.path.exists, (train_data_reference_both_histories_single_related_trade_encoded_filepath, 
                            test_data_reference_both_histories_single_related_trade_encoded_filepath, 
                            train_data_reference_same_cusip_history_single_related_trade_encoded_filepath, 
                            test_data_reference_same_cusip_history_single_related_trade_encoded_filepath))):
    print(f'Loading dataset from pickle file {test_data_reference_same_cusip_history_single_related_trade_encoded_filepath}')
    # NOTE: unpickling can execute arbitrary code; only load files written by this notebook
    train_data_reference_both_histories_single_related_trade_encoded = pd.read_pickle(train_data_reference_both_histories_single_related_trade_encoded_filepath)
    test_data_reference_both_histories_single_related_trade_encoded = pd.read_pickle(test_data_reference_both_histories_single_related_trade_encoded_filepath)
    train_data_reference_same_cusip_history_single_related_trade_encoded = pd.read_pickle(train_data_reference_same_cusip_history_single_related_trade_encoded_filepath)
    test_data_reference_same_cusip_history_single_related_trade_encoded = pd.read_pickle(test_data_reference_same_cusip_history_single_related_trade_encoded_filepath)
else:
    train_data_reference_both_histories_single_related_trade_encoded = add_single_trade_from_history_as_reference_features(train_data_reference_and_both_histories_encoded, 
                                                                                                                           TRADE_HISTORY_RELATED, 
                                                                                                                           FEATURES_IN_HISTORY, 
                                                                                                                           prefix=related_trade_feature_prefix, 
                                                                                                                           datetime_ascending=True)
    # the same-cusip-only variant is the dataset above minus the related trade history column
    train_data_reference_same_cusip_history_single_related_trade_encoded = train_data_reference_both_histories_single_related_trade_encoded.drop(columns=TRADE_HISTORY_RELATED[0])
    test_data_reference_both_histories_single_related_trade_encoded = add_single_trade_from_history_as_reference_features(test_data_reference_and_both_histories_encoded, 
                                                                                                                          TRADE_HISTORY_RELATED, 
                                                                                                                          FEATURES_IN_HISTORY, 
                                                                                                                          prefix=related_trade_feature_prefix, 
                                                                                                                          datetime_ascending=True)
    test_data_reference_same_cusip_history_single_related_trade_encoded = test_data_reference_both_histories_single_related_trade_encoded.drop(columns=TRADE_HISTORY_RELATED[0])

    # protocol 4 is used for every file, matching the other pickles written by this notebook: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
    train_data_reference_both_histories_single_related_trade_encoded.to_pickle(train_data_reference_both_histories_single_related_trade_encoded_filepath, protocol=4)
    test_data_reference_both_histories_single_related_trade_encoded.to_pickle(test_data_reference_both_histories_single_related_trade_encoded_filepath, protocol=4)
    train_data_reference_same_cusip_history_single_related_trade_encoded.to_pickle(train_data_reference_same_cusip_history_single_related_trade_encoded_filepath, protocol=4)
    test_data_reference_same_cusip_history_single_related_trade_encoded.to_pickle(test_data_reference_same_cusip_history_single_related_trade_encoded_filepath, protocol=4)

In [None]:
# Every categorical feature added for the single related trade reuses the label encoder of the 
# original feature it mirrors; the new column names are also collected so they can be appended 
# to `CATEGORICAL_FEATURES` when building the models.
label_encoders_for_single_related_trade = {}
categorical_features_for_single_related_trade = []
single_related_trade_categorical_columns = get_past_trade_columns(1, CATEGORICAL_REFERENCE_FEATURES_TO_ADD, related_trade_feature_prefix)[0]    # index 0 holds just the column names
for new_categorical_feature, original_categorical_feature in zip(single_related_trade_categorical_columns, CATEGORICAL_REFERENCE_FEATURES_TO_ADD):
    label_encoders_for_single_related_trade[new_categorical_feature] = label_encoders[original_categorical_feature]
    categorical_features_for_single_related_trade.append(new_categorical_feature)

# dictionary union operator requires Python v3.9+: https://stackoverflow.com/questions/38987/how-do-i-merge-two-dictionaries-in-a-single-expression
label_encoders_and_label_encoders_for_single_related_trade = label_encoders | label_encoders_for_single_related_trade

In [None]:
%%time
# Model (c): reference data, the features of a single related trade added as reference features, 
# and a recurrent network on the remaining trade history column. Skipped when a test loss for 
# this experiment name is already stored.
experiment_name = 'reference_data_same_cusip_rnn_single_related_trade'
if experiment_name not in feedforward_losses:
    model = NNL1LossEmbeddingsWithRecurrence(
        BATCH_SIZE,
        NUM_WORKERS,
        train_data_reference_same_cusip_history_single_related_trade_encoded,
        test_data_reference_same_cusip_history_single_related_trade_encoded,
        label_encoders_and_label_encoders_for_single_related_trade,
        CATEGORICAL_FEATURES + categorical_features_for_single_related_trade,
        NUM_NODES_HIDDEN_LAYER,
        NUM_HIDDEN_LAYERS,
        NUM_RECURRENT_LAYERS,
        RECURRENT_HIDDEN_SIZE,
        recurrent_architecture=RECURRENT_ARCHITECTURE,
        power=EMBEDDINGS_POWER,
    )
    train_and_store_model_loss(model, experiment_name)

In [None]:
%%time
# Model (d): one recurrent network on the combined history column. The two history columns are 
# merged by `combine_two_histories_sorted_by_seconds_ago_caller` and then dropped, so the model 
# reads only `COMBINED_TRADE_HISTORY`. Skipped when a test loss for this experiment name is 
# already stored.
experiment_name = 'reference_data_single_related_trade_both_histories_interleaved_rnn'
if experiment_name not in feedforward_losses:
    model = NNL1LossEmbeddingsWithRecurrence(
        BATCH_SIZE,
        NUM_WORKERS,
        combine_two_histories_sorted_by_seconds_ago_caller(train_data_reference_both_histories_single_related_trade_encoded).drop(columns=TRADE_HISTORY + TRADE_HISTORY_RELATED),
        combine_two_histories_sorted_by_seconds_ago_caller(test_data_reference_both_histories_single_related_trade_encoded).drop(columns=TRADE_HISTORY + TRADE_HISTORY_RELATED),
        label_encoders_and_label_encoders_for_single_related_trade,
        CATEGORICAL_FEATURES + categorical_features_for_single_related_trade,
        NUM_NODES_HIDDEN_LAYER,
        NUM_HIDDEN_LAYERS,
        NUM_RECURRENT_LAYERS,
        RECURRENT_HIDDEN_SIZE,
        recurrent_architecture=RECURRENT_ARCHITECTURE,
        trade_history_column=COMBINED_TRADE_HISTORY,
        power=EMBEDDINGS_POWER,
    )
    train_and_store_model_loss(model, experiment_name)

In [None]:
%%time
# Model (e): both history columns are handed to `NNL1LossEmbeddingsWithMultipleRecurrence` via 
# `trade_history_columns` instead of being merged into one column. Skipped when a test loss for 
# this experiment name is already stored.
experiment_name = 'reference_data_single_related_trade_both_histories_different_rnn'
if experiment_name not in feedforward_losses:
    model = NNL1LossEmbeddingsWithMultipleRecurrence(
        BATCH_SIZE,
        NUM_WORKERS,
        train_data_reference_both_histories_single_related_trade_encoded,
        test_data_reference_both_histories_single_related_trade_encoded,
        label_encoders_and_label_encoders_for_single_related_trade,
        CATEGORICAL_FEATURES + categorical_features_for_single_related_trade,
        NUM_NODES_HIDDEN_LAYER,
        NUM_HIDDEN_LAYERS,
        NUM_RECURRENT_LAYERS,
        RECURRENT_HIDDEN_SIZE,
        recurrent_architecture=RECURRENT_ARCHITECTURE,
        trade_history_columns=TRADE_HISTORY + TRADE_HISTORY_RELATED,
        power=EMBEDDINGS_POWER,
    )
    train_and_store_model_loss(model, experiment_name)

In [None]:
# Report the stored test loss of every experiment, then rank the experiment names by it.
for name in feedforward_losses:
    test_loss = feedforward_losses[name]
    print(f'{name}\t\tTest error: {test_loss}')
# sorting the dictionary's keys by their stored value puts the smallest test error first
feedforward_losses_ascending_order = sorted(feedforward_losses, key=feedforward_losses.get)
# the optimal experiment is the first name in the ranking
opt = feedforward_losses_ascending_order[0]

**Conclusions** 

The following parameter choices were used for the LSTM: NUM_RECURRENT_LAYERS=3, RECURRENT_HIDDEN_SIZE=64

|  | Model (a) | Model (b) | Model (c) | Model (d) | Model (e) |
| --- | --- | --- | --- | --- | --- |
| Train loss | 7.469 | 6.055 | 5.550 | 6.100 | 5.217 |
| Test loss | 8.791 | 7.576 | 7.650 | 7.838 | 7.598 |

- Model (b) gives the lowest test loss, followed closely by model (e); this is particularly disappointing since the greater capacity of models (c), (d), and (e) is not leading to lower test loss
- Model (e) has the lowest train loss, meaning that it has the greatest capacity (as expected)