In [1]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict
import os
import gc
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from datetime import datetime

import lightgbm as lgb

from ficc.utils.auxiliary_variables import VERY_LARGE_NUMBER, \
                                           IDENTIFIERS, \
                                           CATEGORICAL_FEATURES, \
                                           NON_CAT_FEATURES, \
                                           BINARY, \
                                           TRADE_HISTORY, \
                                           NUM_OF_DAYS_IN_YEAR
from ficc.utils.auxiliary_functions import flatten
from ficc.utils.diff_in_days import diff_in_days_two_dates
from ficc.utils.trade_dict_to_list import FEATURES_IN_HISTORY, \
                                          FEATURES_TO_INDEX_IN_HISTORY, \
                                          CATEGORICAL_FEATURES_IN_HISTORY, \
                                          quantity_diff
from ficc.utils.trade_dict_to_list_mappings import TRADE_TYPE_MAPPING, \
                                                   TRADE_TYPE_CROSS_PRODUCT_MAPPING, \
                                                   RATING_TO_INT_MAPPING
from ficc.utils.related_trade import append_recent_trade_data, \
                                     get_appended_feature_name

import sys
sys.path.insert(0,'../')

from trade_history_model_mitas.data_prep import get_past_trade_columns, \
                                                convert_trade_type_encoding_to_actual, \
                                                is_sorted

from yield_spread_model_mitas.data_prep import FEATURES_AND_NAN_REPLACEMENT_VALUES, \
                                               ADDITIONAL_CATEGORICAL_FEATURES, \
                                               get_datestring_from_filename, \
                                               remove_rows_with_feature_value, \
                                               replace_rating_with_standalone_rating, \
                                               add_past_trades_info, \
                                               check_additional_features, \
                                               replace_nan_for_features, \
                                               encode_and_get_encoders, \
                                               encode_with_encoders
from yield_spread_model_mitas.models import single_feature_model
from yield_spread_model_mitas.train import get_train_test_data_trade_datetime
from yield_spread_model_mitas.tree_models import train_lightgbm_model, \
                                                 get_predictions_for_single_dataset, \
                                                 convert_columns_with_dtype_object_to_category

from rating_model_mitas.data_prep import read_processed_file_pickle, \
                                         remove_fields_with_single_unique_value, \
                                         remove_rows_with_nan_value

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Use zero padding.

In [2]:
# Zero padding: looking up any feature name that has no explicit entry yields 0.
# Both names are bound to the same dictionary object.
DEFAULT_VALUES = DEFAULT_VALUES_ZERO_PADDING = defaultdict(int)

In [3]:
# Number of past trades from the trade history to use for each trade; passed to the helpers that flatten the history and decode the trade type
NUM_TRADES_IN_TRADE_HISTORY = 1

In [4]:
# Name of the target column; kept in a list so that it can be concatenated with the other lists of column names
TARGET = ['yield_spread']

In [5]:
# Columns needed while preparing the data; they are dropped from the train / test frames before modelling.
DATA_PROCESSING_FEATURES = [
    # used to split the data into training and test sets
    'trade_datetime',
    # used (in conjunction with calc_date) to create the settlement_date_to_calc_date feature in past trades
    'settlement_date',
    # used (in conjunction with settlement_date) to create the settlement_date_to_calc_date feature in past trades
    'calc_date',
    # added in the past trades
    'calc_day_cat',
    # 'coupon_type' (used to group related trades) is left out since only a single value of 8 is present in the data
]

In [6]:
%%time
# NOTE(review): hard-coded relative path; it only resolves when the notebook is run from its own directory
processed_file_pickle = '../../../../ficc/ml_models/sequence_predictors/data/processed_data_ficc_ycl_long_history_2022-10-08-00-00.pkl'
# presumably the date portion of the filename (here 2022-10-08-00-00) -- confirm against `get_datestring_from_filename`
processed_file_pickle_datestring = get_datestring_from_filename(processed_file_pickle)
# load the processed trades; the captured output shows the helper logging START / END messages
trade_data = read_processed_file_pickle(processed_file_pickle)

START: Reading from processed file at ../../../../ficc/ml_models/sequence_predictors/data/processed_data_ficc_ycl_long_history_2022-10-08-00-00.pkl
END: Reading from processed file at ../../../../ficc/ml_models/sequence_predictors/data/processed_data_ficc_ycl_long_history_2022-10-08-00-00.pkl
CPU times: user 7.65 s, sys: 8.48 s, total: 16.1 s
Wall time: 18.6 s


Keep all trades before October 8, to standardize with Charles and Developer.

In [None]:
# Keep only the trades that occurred strictly before 2022-10-08 00:00.
trade_data = trade_data.loc[trade_data['trade_datetime'] < datetime(2022, 10, 8)]

Apply exclusions.

In [None]:
# number of trades before the exclusions are applied
print(f'Total number of trades: {len(trade_data)}')

In [None]:
%%time
# The `days_to_*` columns are compared against log10 thresholds, so they are presumably stored on a log10 scale -- TODO confirm.
_min_log_days = np.log10(400)
_max_log_days_to_maturity = np.log10(30000)

# Every condition is evaluated row by row, so combining them into one mask keeps exactly the rows
# that applying the filters one after another would keep.
_rows_to_keep = (
    ((trade_data['days_to_call'] == 0) | (trade_data['days_to_call'] > _min_log_days))
    & ((trade_data['days_to_refund'] == 0) | (trade_data['days_to_refund'] > _min_log_days))
    & (trade_data['days_to_maturity'] < _max_log_days_to_maturity)
    & (trade_data['sinking'] == False)
    & (trade_data['incorporated_state_code'] != 'VI')
    & (trade_data['incorporated_state_code'] != 'GU')
)
trade_data = trade_data[_rows_to_keep]
# filters that are currently disabled:
#   trade_data = trade_data[(trade_data.coupon_type == 8)]
#   trade_data = trade_data[trade_data.is_called == False]

# restructured bonds and bonds with a high chance of default are removed
trade_data = remove_rows_with_feature_value(trade_data, 'purpose_sub_class', [6, 20, 22, 44, 57, 90])
# pre-refunded bonds and partially refunded bonds are removed
trade_data = remove_rows_with_feature_value(trade_data, 'called_redemption_type', [18, 19])

In [None]:
# presumably overwrites `rating` with the standalone rating of each bond -- confirm against `replace_rating_with_standalone_rating`
trade_data = replace_rating_with_standalone_rating(trade_data)

Add `ficc_treasury_spread` to `NON_CAT_FEATURES`.

In [None]:
# Register the treasury spread as a numerical feature.
# `NON_CAT_FEATURES` is the list object imported from `ficc.utils.auxiliary_variables`, so the append
# mutates shared state; the membership guard keeps this cell idempotent when it is run more than once.
if 'ficc_treasury_spread' not in NON_CAT_FEATURES:
    NON_CAT_FEATURES.append('ficc_treasury_spread')

In [None]:
def _ordered_subset(feature_names, available_columns):
    """Return the unique entries of `feature_names` that are in `available_columns`, in first-seen order.

    Unlike `list(set(feature_names) & available_columns)`, the result does not depend on set iteration
    order, which for strings changes between interpreter sessions because of hash randomization. The
    feature (and therefore column) order is thus the same on every run.
    """
    return [name for name in dict.fromkeys(feature_names) if name in available_columns]


ADDITIONAL_CATEGORICAL_FEATURES = check_additional_features(trade_data, ADDITIONAL_CATEGORICAL_FEATURES)

trade_data, _ = replace_nan_for_features(trade_data, FEATURES_AND_NAN_REPLACEMENT_VALUES, verbose=True)
# presumably drops the listed columns that hold a single unique value -- confirm against the helper
trade_data = remove_fields_with_single_unique_value(trade_data, BINARY + CATEGORICAL_FEATURES + ADDITIONAL_CATEGORICAL_FEATURES + NON_CAT_FEATURES)

# restrict every feature group to the columns that are still present in the data
all_features_set = set(trade_data.columns)
BINARY = _ordered_subset(BINARY, all_features_set)
CATEGORICAL_FEATURES = _ordered_subset(CATEGORICAL_FEATURES + ADDITIONAL_CATEGORICAL_FEATURES, all_features_set)
NON_CAT_FEATURES = _ordered_subset(NON_CAT_FEATURES, all_features_set)
PREDICTORS = BINARY + CATEGORICAL_FEATURES + NON_CAT_FEATURES

# keep only the columns that are used from here on
trade_data = trade_data[IDENTIFIERS +
                        PREDICTORS +
                        DATA_PROCESSING_FEATURES +
                        TRADE_HISTORY +
                        TARGET]

trade_data = remove_rows_with_nan_value(trade_data)

In [None]:
# Print every feature group in alphabetical order.
for _label, _feature_names in (('Identifiers', IDENTIFIERS),
                               ('Predictors', PREDICTORS),
                               ('Binary features', BINARY),
                               ('Categorical features', CATEGORICAL_FEATURES),
                               ('Numerical features', NON_CAT_FEATURES)):
    print(f'{_label}: {sorted(_feature_names)}')

Convert `CATEGORICAL_FEATURES` with dtype object to dtype category to be used in the LightGBM model.

In [None]:
# object-dtype categorical columns are converted to the pandas `category` dtype so that LightGBM can consume them
trade_data = convert_columns_with_dtype_object_to_category(trade_data, CATEGORICAL_FEATURES)

In [None]:
# Set aside every predictor whose name starts with 'last'; the remaining predictors are used
# together with the flattened trade history columns further down.
PREDICTORS_WITHOUT_LAST_TRADE_FEATURES = list(filter(lambda predictor: not predictor.startswith('last'), PREDICTORS))
_last_trade_predictors = set(PREDICTORS) - set(PREDICTORS_WITHOUT_LAST_TRADE_FEATURES)
print(f'The following features are in PREDICTORS but not in PREDICTORS_WITHOUT_LAST_TRADE_FEATURES: {_last_trade_predictors}')

In [None]:
# Ensure that the dataframe is sorted in descending order by `trade_datetime`.
# Later cells rely on this: the first row is read as the newest trade, the last row as the oldest,
# and the test / train errors are written back to `trade_data` by position.
assert is_sorted(trade_data['trade_datetime'], ascending=False)

In [None]:
# The frame is sorted by `trade_datetime` in descending order, so the newest trade is first and the oldest is last.
_trade_datetimes = trade_data['trade_datetime']
newest_trade_datetime = _trade_datetimes.iloc[0]
oldest_trade_datetime = _trade_datetimes.iloc[-1]

print(f'Oldest trade datetime: {oldest_trade_datetime}.'
      f'    Newest trade datetime: {newest_trade_datetime}.'
      f'    Gap: {newest_trade_datetime - oldest_trade_datetime}')
print(f'Total number of trades: {len(trade_data)}')

Create a dataset with only the reference data.

In [None]:
# datetime passed to `get_train_test_data_trade_datetime` to split the trades into training and test sets
DATE_TO_SPLIT = datetime(2022, 9, 15)    # September 15 2022

In [None]:
# Split on `DATE_TO_SPLIT`, then derive the frames used for modelling.
train_data, test_data = get_train_test_data_trade_datetime(trade_data, DATE_TO_SPLIT)
print(f'Number of trades for training: {len(train_data)}.'
      f'    Number of trades for testing: {len(test_data)}')
assert min(len(train_data), len(test_data)) > 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'

# identifiers and data-processing columns are not model inputs
_columns_not_used_for_modelling = DATA_PROCESSING_FEATURES + IDENTIFIERS
train_data_with_trade_history = train_data.drop(columns=_columns_not_used_for_modelling)
test_data_with_trade_history = test_data.drop(columns=_columns_not_used_for_modelling)

# the reference-only datasets additionally drop the trade history columns
train_data_only_reference = train_data_with_trade_history.drop(columns=TRADE_HISTORY)
test_data_only_reference = test_data_with_trade_history.drop(columns=TRADE_HISTORY)

Flatten the trade history. The flattened data is used in the LightGBM models.

In [None]:
%%time
# Flatten the trade history into individual columns. The helper returns the flattened frame, the names of
# the binary and non-categorical features it added, and the feature names grouped per past trade.
# NOTE(review): the helper receives `NUM_TRADES_IN_TRADE_HISTORY - 1`; presumably it expects the index of
# the last past trade to use rather than a count -- confirm against `add_past_trades_info`.
trade_data_flattened_trade_history, \
    additional_binary_features_from_past_trades, \
    additional_noncat_features_from_past_trades, \
    past_trade_feature_groups = add_past_trades_info(trade_data, NUM_TRADES_IN_TRADE_HISTORY - 1, FEATURES_TO_INDEX_IN_HISTORY)
# single flat list of all the past trade feature names
past_trade_feature_groups_flattened = flatten(past_trade_feature_groups)
print(f'Each of the past trades are in the following feature groups: {past_trade_feature_groups}')

Decode the trade type from a 2-dimensional binary list to its original value for trades in the history. For example, a trade type of `[0, 1]` would be decoded to `S`.

In [None]:
# name given to the decoded trade type; passed to the helpers that decode the trade type and list the past trade columns
TRADE_TYPE_NEW_COLUMN = 'trade_type'

In [None]:
# prefix of the feature names that come from past trades of the same CUSIP
SAME_CUSIP_PREFIX = 'last_'

In [None]:
# Keep the flattened past trade columns, the reference predictors, the data-processing columns and the target.
trade_data_history_and_reference_features = trade_data_flattened_trade_history[
    past_trade_feature_groups_flattened
    + PREDICTORS_WITHOUT_LAST_TRADE_FEATURES
    + DATA_PROCESSING_FEATURES
    + TARGET
]

# Decode the trade type of every past trade into a single column, then drop the encoded columns.
(trade_data_history_and_reference_features_actual_trade_type,
 old_trade_type_columns,
 _) = convert_trade_type_encoding_to_actual(
    trade_data_history_and_reference_features,
    NUM_TRADES_IN_TRADE_HISTORY,
    TRADE_TYPE_NEW_COLUMN,
    SAME_CUSIP_PREFIX,
)
trade_data_history_and_reference_features_actual_trade_type = trade_data_history_and_reference_features_actual_trade_type.drop(columns=old_trade_type_columns)

# check that the above procedure removed just one feature for each past trade (`trade_type1` and `trade_type2` combine to just `trade_type`)
_num_columns_before = len(trade_data_history_and_reference_features.columns)
_num_columns_after = len(trade_data_history_and_reference_features_actual_trade_type.columns)
assert _num_columns_after == _num_columns_before - NUM_TRADES_IN_TRADE_HISTORY, f'Before converting, the dataframe had {_num_columns_before} columns, and after converting, the dataframe has {_num_columns_after} columns'

# free the intermediate frame before splitting
del trade_data_history_and_reference_features
gc.collect()

(train_data_history_and_reference_features_actual_trade_type,
 test_data_history_and_reference_features_actual_trade_type) = get_train_test_data_trade_datetime(
    trade_data_history_and_reference_features_actual_trade_type,
    DATE_TO_SPLIT,
)
del trade_data_history_and_reference_features_actual_trade_type
gc.collect()
assert (len(train_data_history_and_reference_features_actual_trade_type) != 0
        and len(test_data_history_and_reference_features_actual_trade_type) != 0), 'Either train or test data is empty. Consider checking how the train test split is being performed.'

# the data-processing columns were only needed for the decoding and the split above
train_data_history_and_reference_features_actual_trade_type = train_data_history_and_reference_features_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)
test_data_history_and_reference_features_actual_trade_type = test_data_history_and_reference_features_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)

# Last Yield Spread
Weakest baseline, where the output is just the yield spread of the most recent previous trade in the same CUSIP.

In [None]:
# baseline that uses the single column `last_yield_spread`
single_feature_model(trade_data, 'last_yield_spread')

# LightGBM

In [None]:
# Names of the same-CUSIP past trade columns (with the decoded trade type) and which of them are categorical.
past_trade_columns, all_categorical_features_in_trade_history = get_past_trade_columns(
    NUM_TRADES_IN_TRADE_HISTORY,
    FEATURES_IN_HISTORY,
    SAME_CUSIP_PREFIX,
    trade_type_actual=True,
    trade_type_column=TRADE_TYPE_NEW_COLUMN,
    categorical_features_per_trade=CATEGORICAL_FEATURES_IN_HISTORY,
)
columns_to_select_for_lightgbm_model = PREDICTORS_WITHOUT_LAST_TRADE_FEATURES + past_trade_columns + TARGET
categorical_features_for_lightgbm_model = CATEGORICAL_FEATURES + all_categorical_features_in_trade_history

# checks that there are no intersection between the groups of features
for _feature_names in (columns_to_select_for_lightgbm_model, categorical_features_for_lightgbm_model):
    assert len(_feature_names) == len(set(_feature_names))

print(f'Features used for LightGBM model: {columns_to_select_for_lightgbm_model}')
print(f'Categorical features used for LightGBM model: {categorical_features_for_lightgbm_model}')

In [None]:
# Train on the selected columns; the categorical feature list is the one validated for duplicates
# when the LightGBM columns were selected (CATEGORICAL_FEATURES + all_categorical_features_in_trade_history).
_lgb_train_frame = train_data_history_and_reference_features_actual_trade_type[columns_to_select_for_lightgbm_model]
_lgb_test_frame = test_data_history_and_reference_features_actual_trade_type[columns_to_select_for_lightgbm_model]
lgb_model, lgb_losses = train_lightgbm_model(
    _lgb_train_frame,
    _lgb_test_frame,
    categorical_features_for_lightgbm_model,
    wandb_project='mitas_trade_history',
)

In [None]:
# feature importance of the trained model, measured by gain
lgb.plot_importance(lgb_model, figsize=(20, 10), importance_type='gain')

Add error of the LightGBM model into the dataset.

In [None]:
def get_errors(lgb_model, dataset: pd.core.frame.DataFrame):
    """Return the signed error (prediction minus actual target) of `lgb_model` for each row of `dataset`.

    The columns in the notebook-level `columns_to_select_for_lightgbm_model` (which includes the target
    column) are handed to `get_predictions_for_single_dataset`; the target is read from `TARGET[0]`.
    """
    model_inputs = dataset[columns_to_select_for_lightgbm_model]
    predicted_values = get_predictions_for_single_dataset(lgb_model, model_inputs)
    actual_values = dataset[TARGET[0]]
    return predicted_values - actual_values

In [None]:
# Compute the errors for both splits first; `get_errors` only reads the model columns and the target,
# so adding the `lgb_error` column afterwards does not affect either computation.
lgb_train_errors = get_errors(lgb_model, train_data_history_and_reference_features_actual_trade_type)
lgb_test_errors = get_errors(lgb_model, test_data_history_and_reference_features_actual_trade_type)

train_data_history_and_reference_features_actual_trade_type['lgb_error'] = lgb_train_errors
test_data_history_and_reference_features_actual_trade_type['lgb_error'] = lgb_test_errors

Add `lgb_error` to `trade_data` so that it persists in `CATEGORICAL_REFERENCE_FEATURES_TO_ADD`, and `trade_data_flattened_trade_history` to use for future experiments.

In [None]:
# test_error comes before train_error since `trade_data` is sorted in DESCENDING order of `trade_datetime`.
# The errors are flattened and joined into one array and then assigned by POSITION, so the two splits
# together must cover every row of both frames in their existing order.
all_errors = np.concatenate((np.ravel(lgb_test_errors), np.ravel(lgb_train_errors)))
for _frame in (trade_data, trade_data_flattened_trade_history):
    _frame['lgb_error'] = all_errors

Sanity check: make sure the distribution of `lgb_error` is centered tightly at 0.

In [None]:
# Histogram of the signed errors together with the two extremes.
plt.hist(all_errors, bins='auto')
_largest_positive_error = np.max(all_errors)
_largest_negative_error = np.min(all_errors)
print(f'Largest positive error: {_largest_positive_error}')
print(f'Largest negative error: {_largest_negative_error}')

# Add histories of related trades

In [None]:
def make_data_filename(name):
    """Return the path of the pickle file called `name` inside the local `data/` directory.

    In this notebook it is used to build the path under which the label encoders are saved.
    """
    return f'data/{name}.pkl'


# `exist_ok=True` makes the creation idempotent and removes the gap between checking for the directory and creating it
os.makedirs('data/', exist_ok=True)

Perform encoding of categorical features.

In [None]:
%%time
# Fit the encoders on the training reference data only, and keep them so that every other dataset is encoded identically.
train_data_only_reference_encoded, label_encoders = encode_and_get_encoders(train_data_only_reference, BINARY, CATEGORICAL_FEATURES)

# Persist the encoders so that they can be reused outside of this notebook.
label_encoders_filepath = make_data_filename('label_encoders')
with open(label_encoders_filepath, 'wb') as pickle_handle:
    # protocol 4 allows for use in the VM; template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
    pickle.dump(label_encoders, pickle_handle, protocol=4)


def encode_with_label_encoders(df, features_to_exclude=None):
    """Encode `df` with the fitted `label_encoders`, passing `features_to_exclude` through to `encode_with_encoders`.

    `features_to_exclude` defaults to a fresh empty list on every call, so that a list mutated by one call
    can never leak into the next one (which a shared `[]` default would allow).
    """
    if features_to_exclude is None:
        features_to_exclude = []
    return encode_with_encoders(df, label_encoders, features_to_exclude)


test_data_only_reference_encoded = encode_with_label_encoders(test_data_only_reference)
train_data_with_trade_history_encoded = encode_with_label_encoders(train_data_with_trade_history)
test_data_with_trade_history_encoded = encode_with_label_encoders(test_data_with_trade_history)

For each trade, we find the `NUM_TRADES_IN_RELATED_TRADE_HISTORY` most recent related trades (up until the `trade_datetime` of the current trade) that are from different CUSIPs.

In [None]:
# number of most recent related trades (from different CUSIPs) to attach to each trade
NUM_TRADES_IN_RELATED_TRADE_HISTORY = 1

Add certain reference data columns to the data in order to augment the same CUSIP history. These will be added when selecting the features to use in the LightGBM model.

In [None]:
# A few of the most important features for the LightGBM model trained on just the reference data.
# (The error of the LightGBM model is not part of this list.)
CATEGORICAL_REFERENCE_FEATURES_TO_ADD = ['rating', 'incorporated_state_code', 'purpose_sub_class']
# Keep only the features that exist as columns in the trade data. The list order is preserved (a set
# intersection would make it vary between kernel sessions) because it determines the index that each
# feature is given when it is registered in FEATURES_TO_INDEX_IN_HISTORY.
CATEGORICAL_REFERENCE_FEATURES_TO_ADD = [feature for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD if feature in trade_data.columns]
print(f'Including the following reference features for each related trade and the target trade: {CATEGORICAL_REFERENCE_FEATURES_TO_ADD}')

Add this same reference data to related trade history. However, to make this code faster, we first encode the reference data, and then decode after creating the past trade history. This creates a speedup because `append_recent_trade_data(...)` is significantly faster when working with numerical data as opposed to objects, due to how numpy handles `dtype='O'` versus `dtype='np.float_'`.

In [None]:
# Boolean variable that determines whether trade history will contain categorical features that must be encoded before adding these features to the trade history.
# It is switched to True when at least one categorical reference feature is registered.
ENCODE_REFERENCE_FEATURES = False    # boolean variable that determines whether trade history will contain categorical features that must be encoded before adding these features to the trade history

In [None]:
# Give every reference feature that is not yet registered the next free index.
# NOTE: this mutates the `FEATURES_TO_INDEX_IN_HISTORY` mapping imported from `ficc.utils.trade_dict_to_list`.
for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
    FEATURES_TO_INDEX_IN_HISTORY.setdefault(feature, len(FEATURES_TO_INDEX_IN_HISTORY))

# encoding is needed as soon as there is at least one categorical reference feature
if len(CATEGORICAL_REFERENCE_FEATURES_TO_ADD) > 0:
    ENCODE_REFERENCE_FEATURES = True

In [None]:
related_trade_feature_prefix = 'related_last_'


def get_neighbor_feature(feature):
    """Return a function of (current trade, related trade) that reads `feature` from the related trade."""
    def read_from_neighbor(curr, neighbor):
        return neighbor[feature]
    return read_from_neighbor


# Every value is a function of the current trade (`curr`) and the related trade (`neighbor`).
# The insertion order of the keys defines the order of the related trade features.
RELATED_TRADE_FEATURE_FUNCTIONS = {
    'yield_spread': get_neighbor_feature('yield_spread'),
    'treasury_spread': get_neighbor_feature('ficc_treasury_spread'),
    'quantity': get_neighbor_feature('quantity'),
    # quantities are raised to the power of 10 before the difference is taken
    'quantity_diff': lambda curr, neighbor: quantity_diff(10 ** neighbor['quantity'] - 10 ** curr['quantity']),
    'trade_type1': lambda curr, neighbor: TRADE_TYPE_MAPPING[neighbor['trade_type']][0],
    'trade_type2': lambda curr, neighbor: TRADE_TYPE_MAPPING[neighbor['trade_type']][1],
    'seconds_ago': lambda curr, neighbor: np.log10(1 + (curr['trade_datetime'] - neighbor['trade_datetime']).total_seconds()),
    'settlement_date_to_calc_date': lambda curr, neighbor: np.log10(1 + diff_in_days_two_dates(neighbor['calc_date'], neighbor['settlement_date'], convention='exact')),
    'calc_day_cat': get_neighbor_feature('calc_day_cat'),
    'trade_type_past_latest': lambda curr, neighbor: TRADE_TYPE_CROSS_PRODUCT_MAPPING[neighbor['trade_type'] + curr['trade_type']],
    # disabled: 'rating_diff': lambda curr, neighbor: RATING_TO_INT_MAPPING[curr['rating']] - RATING_TO_INT_MAPPING[neighbor['rating']]
}

# used to track additional information about the related trades
additional_related_trade_functions = {
    # compare date only instead of entire datetime: https://stackoverflow.com/questions/3743222/how-do-i-convert-a-datetime-to-date
    'same_day': lambda curr, neighbor: int(neighbor['trade_datetime'].date() == curr['trade_datetime'].date()),
    'lgb_error': get_neighbor_feature('lgb_error'),
}

reference_features_to_add_functions = {}
for _reference_feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
    reference_features_to_add_functions[_reference_feature] = get_neighbor_feature(_reference_feature)

# merge the three dictionaries; later dictionaries win on duplicate keys and the first insertion position of a key is kept
RELATED_TRADE_FEATURE_FUNCTIONS = {
    **RELATED_TRADE_FEATURE_FUNCTIONS,
    **additional_related_trade_functions,
    **reference_features_to_add_functions,
}

print(f'Related trades will have the following features: {RELATED_TRADE_FEATURE_FUNCTIONS.keys()}')

# one group of feature names per related trade; insertion order of the dictionary is preserved for Python v3.7+
related_trades_features_groups = [
    [
        get_appended_feature_name(idx, feature, related_trade_feature_prefix)
        for feature in RELATED_TRADE_FEATURE_FUNCTIONS
    ]
    for idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY)
]
related_trades_features = flatten(related_trades_features_groups)

In [None]:
# Maps the name of each related trade feature to a two-item tuple:
#   1. the function of the current trade and the related trade that computes the feature
#   2. the default value to be filled in if that value does not exist
RELATED_TRADE_FEATURE_FUNCTIONS_AND_DEFAULT_VALUES = {}
for _feature_name, _feature_function in RELATED_TRADE_FEATURE_FUNCTIONS.items():
    RELATED_TRADE_FEATURE_FUNCTIONS_AND_DEFAULT_VALUES[_feature_name] = (_feature_function, DEFAULT_VALUES[_feature_name])

See which features are in the same CUSIP trade history and in the related trade history.

In [None]:
# Disabled check that the related trade features (without the additional and reference features) match the same CUSIP history features, including their order:
# assert FEATURES_IN_HISTORY == [key for key in RELATED_TRADE_FEATURE_FUNCTIONS if key not in additional_related_trade_functions and key not in reference_features_to_add_functions]    # insertion order of the dictionary is preserved for Python v3.7+ so this will check if the ordering of the keys are the same
print(f'Each trade in the same CUSIP trade history has the following features: {FEATURES_IN_HISTORY}')
print(f'Each trade in the related trade history has the following features: {RELATED_TRADE_FEATURE_FUNCTIONS.keys()}')

Create "quantized features", which group together certain values of a feature when finding related trades. For example, `RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED` removes the + and - from ratings, and so a bond with rating A+ will be related to a bond with rating A or A-.

Purpose class was added as a quantized feature based on a call with Desmond Dahill from Tegus on 09/27/2022.

In [None]:
%%time
# small positive number; used below as the lowest bin edge and to nudge a bin edge just above 5.0
epsilon = 1 / VERY_LARGE_NUMBER

# Quantized rating: the rating without its trailing + / -, with the sparse categories merged.
RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED = 'rating_without_+-_b_nr_combined'
trade_data_flattened_trade_history[RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED] = trade_data_flattened_trade_history['rating'].transform(lambda rating: str.rstrip(rating, '+-'))    # remove + and - from right side of string
# group BBB, BB, B, and NR together since each have a very small number of trades; the merged group is labelled 'B'
b_ratings = trade_data_flattened_trade_history[RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED].isin(['B', 'BB', 'BBB', 'NR'])
trade_data_flattened_trade_history.loc[b_ratings, RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED] = 'B'
print(f'Created {RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED} feature')

# Quantized days to maturity. The bin edges are the log10 of a number of days, so the column is
# presumably stored on a log10 scale -- TODO confirm. Intervals are closed on the right (pd.cut default).
DAYS_TO_MATURITY_CATEGORICAL = 'days_to_maturity_categorical'
num_of_days_bins_maturity = [np.log10(days) for days in [epsilon, NUM_OF_DAYS_IN_YEAR * 2, NUM_OF_DAYS_IN_YEAR * 5, NUM_OF_DAYS_IN_YEAR * 10, VERY_LARGE_NUMBER]]    # 2 years, 5 years, 10 years; arbitrarily chosen
trade_data_flattened_trade_history[DAYS_TO_MATURITY_CATEGORICAL] = pd.cut(trade_data_flattened_trade_history['days_to_maturity'], num_of_days_bins_maturity).astype('string')
print(f'Created {DAYS_TO_MATURITY_CATEGORICAL} feature')

# Quantized days to call, built the same way with fewer bins.
DAYS_TO_CALL_CATEGORICAL = 'days_to_call_categorical'
num_of_days_bins_call = [np.log10(days) for days in [epsilon, NUM_OF_DAYS_IN_YEAR * 2, NUM_OF_DAYS_IN_YEAR * 5, VERY_LARGE_NUMBER]]    # 2 years, 5 years; arbitrarily chosen
trade_data_flattened_trade_history[DAYS_TO_CALL_CATEGORICAL] = pd.cut(trade_data_flattened_trade_history['days_to_call'], num_of_days_bins_call).astype('string')
print(f'Created {DAYS_TO_CALL_CATEGORICAL} feature')

# Quantized coupon. `right=False` makes every interval closed on the left, and the edge `5.0 + epsilon`
# keeps a coupon of exactly 5 in the 4.5 - 5 bin.
COUPON_CATEGORICAL = 'coupon_categorical'
coupon_bins = [0, 3, 4, 4.5, 5.0 + epsilon, VERY_LARGE_NUMBER]   # 0 - 2.99, 3 - 3.99, 4 - 4.49, 4.5 - 5, above 5; from discussion with a team member
trade_data_flattened_trade_history[COUPON_CATEGORICAL] = pd.cut(trade_data_flattened_trade_history['coupon'], coupon_bins, right=False).astype('string')
print(f'Created {COUPON_CATEGORICAL} feature')

# Alternative, finer coupon quantization. NOTE: `coupon_bins` is rebound here.
COUPON_CATEGORICAL_SUDHAR = 'coupon_categorical_sudhar'
coupon_bins = [0, 3, 4, 4.5, 5, 5.25, 5.5, 6, VERY_LARGE_NUMBER]    # from Sudhar's paper: Kolm, Purushothaman. 2021. Systematic Pricing and Trading of Municipal Bonds
trade_data_flattened_trade_history[COUPON_CATEGORICAL_SUDHAR] = pd.cut(trade_data_flattened_trade_history['coupon'], coupon_bins, right=False).astype('string')
print(f'Created {COUPON_CATEGORICAL_SUDHAR} feature')

# Disabled quantization that keeps only the most frequent coupon values:
# COUPON_TOP_VALUES = 'coupon_top_values'
# trade_data_flattened_trade_history[COUPON_TOP_VALUES] = trade_data_flattened_trade_history['coupon']
# top4_coupon_values = trade_data_flattened_trade_history['coupon'].value_counts().head(4).index.tolist()    # select the top 4 coupon values based on frequency in the data, which are: 5.0, 4.0, 3.0, 2.0 comprising about 90% of the data
# trade_data_flattened_trade_history.loc[~trade_data_flattened_trade_history['coupon'].isin(top4_coupon_values), COUPON_TOP_VALUES] = -1    # arbitrary numerical value that is invalid as a coupon value
# print(f'Created {COUPON_TOP_VALUES} feature')

# Quantized purpose class: the 6 most frequent values are kept and every other value is replaced with -1.
PURPOSE_CLASS_TOP_VALUES = 'purpose_class_top_values'
trade_data_flattened_trade_history[PURPOSE_CLASS_TOP_VALUES] = trade_data_flattened_trade_history['purpose_class']
top6_purpose_class_values = trade_data_flattened_trade_history['purpose_class'].value_counts().head(6).index.tolist()    # select the top 6 purpose_class values based on frequency in the data, which are: 37 (school district), 51 (various purpose), 50 (utility), 46 (tax revenue), 9 (education), 48 (transportation) comprising about 80% of the data
trade_data_flattened_trade_history.loc[~trade_data_flattened_trade_history['purpose_class'].isin(top6_purpose_class_values), PURPOSE_CLASS_TOP_VALUES] = -1    # arbitrary numerical value that is invalid as a purpose_class value
print(f'Created {PURPOSE_CLASS_TOP_VALUES} feature')

# Quantized muni security type: the 2 most frequent values are kept and every other value is replaced with -1.
# NOTE(review): despite its name, `top6_muni_security_type_values` holds only the top 2 values (`head(2)`).
MUNI_SECURITY_TYPE_TOP_VALUES = 'muni_security_type_top_values'
trade_data_flattened_trade_history[MUNI_SECURITY_TYPE_TOP_VALUES] = trade_data_flattened_trade_history['muni_security_type']
top6_muni_security_type_values = trade_data_flattened_trade_history['muni_security_type'].value_counts().head(2).index.tolist()    # select the top 2 muni_security_type values based on frequency in the data, which are: 8 (revenue), 5 (unlimited g.o.) comprising about 80% of the data
trade_data_flattened_trade_history.loc[~trade_data_flattened_trade_history['muni_security_type'].isin(top6_muni_security_type_values), MUNI_SECURITY_TYPE_TOP_VALUES] = -1    # arbitrary numerical value that is invalid as a muni_security_type value
print(f'Created {MUNI_SECURITY_TYPE_TOP_VALUES} feature')

# Calendar day of the trade, stored as a string.
# NOTE(review): the lambda parameter is named `datetime`, which shadows the imported `datetime` class inside the lambda only.
TRADE_DATETIME_DAY = 'trade_datetime_day'
trade_data_flattened_trade_history[TRADE_DATETIME_DAY] = trade_data_flattened_trade_history['trade_datetime'].transform(lambda datetime: datetime.date()).astype('string')    # remove timestamp from datetime
print(f'Created {TRADE_DATETIME_DAY} feature')

# Quantized quantity. The bin edges 5, 6 and 7 stand for 100k, 1m and 10m, i.e. the quantity is on a
# log10 scale. Intervals are closed on the right (pd.cut default).
QUANTITY_CATEGORICAL = 'quantity_categorical'
quantity_bins = [0, 5, 6, 7, VERY_LARGE_NUMBER]    # 0 - 100k, 100k - 1m, 1m - 10m, 10m+
trade_data_flattened_trade_history[QUANTITY_CATEGORICAL] = pd.cut(trade_data_flattened_trade_history['quantity'], quantity_bins).astype('string')
print(f'Created {QUANTITY_CATEGORICAL} feature')

In [None]:
# names of all the quantized feature columns created above; used to plot the number of trades per category
quantized_features = [RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, 
                      DAYS_TO_MATURITY_CATEGORICAL, 
                      DAYS_TO_CALL_CATEGORICAL, 
                      COUPON_CATEGORICAL, 
                      COUPON_CATEGORICAL_SUDHAR, 
                      PURPOSE_CLASS_TOP_VALUES, 
                      MUNI_SECURITY_TYPE_TOP_VALUES, 
                    #   COUPON_TOP_VALUES,    (disabled together with the feature itself)
                      TRADE_DATETIME_DAY, 
                      QUANTITY_CATEGORICAL]

Make sure that each category (for each quantized feature) has a reasonable number of trades.

In [None]:
# One bar chart per quantized feature, showing the number of trades in each category.
for feature in quantized_features:
    _category_counts = trade_data_flattened_trade_history[feature].value_counts()
    _category_counts.plot(kind='bar', title=feature, figsize=(20, 10))
    plt.show()

In [None]:
def quantity_greater_than_100k(row):
    """Return whether the quantity of `row` is at least log10(100,000); note the comparison is `>=`."""
    return row['quantity'] >= np.log10(1e5)


def quantity_greater_than_1m(row):
    """Return whether the quantity of `row` is at least log10(1,000,000); note the comparison is `>=`."""
    return row['quantity'] >= np.log10(1e6)


def trade_type_is_interdealer(row):
    """Return whether the trade type of `row` is 'D'."""
    return row['trade_type'] == 'D'

This link has the below definitions and results: https://docs.google.com/document/d/1rQeB3lM_iEyv9q-rseQPmb8n8nv1ay5UtVdSNH0K0QU/edit?usp=sharing.

In [None]:
# Candidate definitions of a "related" trade.
# key: name of the criterion, value: (categories to match, filtering conditions)
# Every composite criterion below is currently commented out, so this dict starts out empty; uncomment an entry to include it in the experiments.
related_trades_criterion = {# 'sudhar1': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], []), 
                            # 'sudhar1_100k': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_100k]), 
                            # 'sudhar1_1m': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_1m]), 
                            # 'sudhar2': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, 'trade_type'], []), 
                            # 'sudhar2_100k': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, 'trade_type'], [quantity_greater_than_100k]), 
                            # 'sudhar2_1m': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, 'trade_type'], [quantity_greater_than_1m]), 
                            # 'sudhar3': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [trade_type_is_interdealer]), 
                            # 'sudhar3_100k': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_100k, trade_type_is_interdealer]), 
                            # 'sudhar3_1m': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL], [quantity_greater_than_1m, trade_type_is_interdealer]), 
                            # 'sudhar4': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, MUNI_SECURITY_TYPE_TOP_VALUES], []), 
                            # 'sudhar4_100k': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, MUNI_SECURITY_TYPE_TOP_VALUES], [quantity_greater_than_100k]), 
                            # 'sudhar4_1m': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, MUNI_SECURITY_TYPE_TOP_VALUES], [quantity_greater_than_1m]), 
                            # 'sudhar5': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], []), 
                            # 'sudhar5_100k': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_100k]), 
                            # 'sudhar5_1m': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_1m]), 
                            # 'sudhar6': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], []), 
                            # 'sudhar6_100k': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_100k]), 
                            # 'sudhar6_1m': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, COUPON_CATEGORICAL, TRADE_DATETIME_DAY], [quantity_greater_than_1m]), 
                            # 'mitas1': ([TRADE_DATETIME_DAY, 'trade_type'], []),
                            # 'mitas1_100k': ([TRADE_DATETIME_DAY, 'trade_type'], [quantity_greater_than_100k]), 
                            # 'mitas1_1m': ([TRADE_DATETIME_DAY, 'trade_type'], [quantity_greater_than_1m]), 
                            # 'desmond': (['incorporated_state_code', RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL, COUPON_CATEGORICAL, PURPOSE_CLASS_TOP_VALUES], []), 
                            # 'desmond_100k': (['incorporated_state_code', RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL, COUPON_CATEGORICAL, PURPOSE_CLASS_TOP_VALUES], [quantity_greater_than_100k]), 
                            # 'desmond_1m': (['incorporated_state_code', RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED, DAYS_TO_MATURITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL, COUPON_CATEGORICAL, PURPOSE_CLASS_TOP_VALUES], [quantity_greater_than_1m]), 
                            # 'yellow': (['trade_type', DAYS_TO_MATURITY_CATEGORICAL, QUANTITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL], []), 
                            # 'yellow_100k': (['trade_type', DAYS_TO_MATURITY_CATEGORICAL, QUANTITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL], [quantity_greater_than_100k]), 
                            # 'yellow_1m': (['trade_type', DAYS_TO_MATURITY_CATEGORICAL, QUANTITY_CATEGORICAL, DAYS_TO_CALL_CATEGORICAL], [quantity_greater_than_1m]), 
                            # 'yellow_lite': (['trade_type', DAYS_TO_MATURITY_CATEGORICAL], []), 
                            # 'yellow_lite_100k': (['trade_type', DAYS_TO_MATURITY_CATEGORICAL], [quantity_greater_than_100k]), 
                            # 'yellow_lite_1m': (['trade_type', DAYS_TO_MATURITY_CATEGORICAL], [quantity_greater_than_1m]), 
                            }

# Extend the criteria defined above with the single-category and single-filter criteria.
# Dict unpacking yields the same mapping as the `|` merge operator: existing keys keep their position and the
# entries listed here win on a key clash. Each value is (categories to match, filtering conditions).
related_trades_criterion = {
    **related_trades_criterion,
    # baseline: no matching and no filtering
    'NONE': ([], []),
    # match on exactly one category
    'trade_type': (['trade_type'], []),
    'incorporated_state_code': (['incorporated_state_code'], []),
    'days_to_maturity_categorical': ([DAYS_TO_MATURITY_CATEGORICAL], []),
    'quantity_categorical': ([QUANTITY_CATEGORICAL], []),
    'coupon_categorical': ([COUPON_CATEGORICAL], []),
    'trade_datetime_day': ([TRADE_DATETIME_DAY], []),
    'rating_without_plus_minus_B_NR_combined': ([RATING_WITHOUT_PLUS_MINUS_B_NR_COMBINED], []),
    'days_to_call': ([DAYS_TO_CALL_CATEGORICAL], []),
    'purpose_class_top_values': ([PURPOSE_CLASS_TOP_VALUES], []),
    'muni_security_type_top_values': ([MUNI_SECURITY_TYPE_TOP_VALUES], []),
    # apply exactly one filtering condition
    '100k': ([], [quantity_greater_than_100k]),
    '1m': ([], [quantity_greater_than_1m]),
    'dd': ([], [trade_type_is_interdealer]),
    # match on one raw (non-quantized) reference feature
    'rating': (['rating'], []),
    'purpose_class': (['purpose_class'], []),
    'coupon_categorical_sudhar': ([COUPON_CATEGORICAL_SUDHAR], []),
}

Add related trades to the trade data.

In [None]:
%%time
df_encoded = encode_with_label_encoders(trade_data_flattened_trade_history, features_to_exclude=['trade_type']) if ENCODE_REFERENCE_FEATURES else trade_data_flattened_trade_history

In [None]:
# Maps each related-trades criterion name to the trade dataframe with that criterion's related trades appended.
trade_data_flattened_trade_history_and_related_trades = {}
# trade_data_flattened_trade_history = None    # uncomment this line when running LightGBM experiments for data files already created in order to reduce memory overhead

In [None]:
%%time
for name, (categories_to_match, filtering_conditions) in tqdm(related_trades_criterion.items()):
    filename = f'trade_data_flattened_trade_history_and_related_trades_{name}_lgb_error'
    filepath = make_data_filename(filename)
    if os.path.exists(filepath):    # check if a file exists https://www.pythontutorial.net/python-basics/python-check-if-file-exists/
        print(f'Loading dataset for {name} from pickle file {filepath}')
        trade_data_flattened_trade_history_and_related_trades[name] = pd.read_pickle(filepath)    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
    elif name not in trade_data_flattened_trade_history_and_related_trades:
        print(f'Creating dataset for {name} and saving it to {filepath}')
        trade_data_flattened_trade_history_and_related_trades[name] = append_recent_trade_data(trade_data_flattened_trade_history, 
                                                                                               NUM_TRADES_IN_RELATED_TRADE_HISTORY, 
                                                                                               RELATED_TRADE_FEATURE_FUNCTIONS_AND_DEFAULT_VALUES, 
                                                                                               feature_prefix=related_trade_feature_prefix, 
                                                                                               categories=categories_to_match, 
                                                                                               filtering_conditions=filtering_conditions, 
                                                                                               return_df=True, 
                                                                                               multiprocessing=True, 
                                                                                               df_for_related_trades=df_encoded).drop(columns=quantized_features)    # drop the quantized features from the final dataframe
        trade_data_flattened_trade_history_and_related_trades[name].to_pickle(filepath, protocol=4)    # protocol 4 allows for use in the VM: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html

In [None]:
# Drop the quantized features from the final dataframe.
# `errors='ignore'` keeps this cell re-runnable: on a second execution the columns are already gone and a plain `drop` would raise a KeyError.
trade_data_flattened_trade_history = trade_data_flattened_trade_history.drop(columns=quantized_features, errors='ignore')

Make sure that each group has a reasonable number of trades (otherwise finding related trades will be too difficult for certain trades). Note that if a group has a count of 1, then that trade has no previous related trades according to this definition of *related*. Further, note that if a group has a count of 2, then only one of those trades has a single past related trade, and the other one doesn't, where the one with no previous related trade is the oldest trade in this group. We should loosen or tighten the definition of *related* in order to make sure almost all trades have at least one previous related trade.

In [None]:
# For each criterion, report how many trades lack a related trade at selected positions of the related trade history.
# A slot is treated as empty when the sentinel feature still holds its default value.
FEATURE_TO_DETECT_NO_PAST_TRADES = 'seconds_ago'    # arbitrarily chosen, but needs to be a feature that does not naturally have occurrences of its default value
for name, df in trade_data_flattened_trade_history_and_related_trades.items():
    print(f'{name}')
    for past_trade_idx in (0, 1, 15, 31):    # range(2):
        if past_trade_idx < NUM_TRADES_IN_RELATED_TRADE_HISTORY:    # skip positions beyond the configured history length
            feature_name = get_appended_feature_name(past_trade_idx, FEATURE_TO_DETECT_NO_PAST_TRADES, related_trade_feature_prefix)
            num_trades = (df[feature_name] == DEFAULT_VALUES[FEATURE_TO_DETECT_NO_PAST_TRADES]).sum()
            # the percentage is taken over the rows of the dataframe that was actually counted, not over a separate dataframe
            print(f'Number of trades with fewer than {past_trade_idx + 1} past related trades: {num_trades}. Percentage of total trades: {round(num_trades / len(df) * 100, 3)} %')

Decode the added reference features, if they are encoded.

In [None]:
%%time
if ENCODE_REFERENCE_FEATURES:
    print('Decoding the reference features.')
    for df in tqdm(trade_data_flattened_trade_history_and_related_trades.values()):
        for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
            encoder = label_encoders[feature]
            for past_trade_idx in range(NUM_TRADES_IN_RELATED_TRADE_HISTORY):
                feature_name = get_appended_feature_name(past_trade_idx, feature, related_trade_feature_prefix)
                df[feature_name] = encoder.inverse_transform(df[feature_name].to_numpy(dtype=int))    # inverse transform the encoded categorical feature column; must set to dtype=int since label encoder encodes to integers
                df[feature_name] = df[feature_name].astype('category')    # change dtype to `categorical` to use in LightGBM model

Make sure that reference features have dtype categorical. 

In [None]:
# Cast any reference feature of the target trade that is not yet categorical.
for df in tqdm(trade_data_flattened_trade_history_and_related_trades.values()):
    for feature in CATEGORICAL_REFERENCE_FEATURES_TO_ADD:
        # check dtype of a column: https://stackoverflow.com/questions/26924904/check-if-dataframe-column-is-categorical
        if df[feature].dtype.name == 'category':
            continue
        df[feature] = df[feature].astype('category')

## Choosing a related trades criterion

In [None]:
# Column names (and the categorical subset) of the appended related trades, using the single actual trade type column per past trade.
past_related_trades_columns_opt, all_categorical_features_in_trade_history_related = get_past_trade_columns(
    NUM_TRADES_IN_RELATED_TRADE_HISTORY,
    RELATED_TRADE_FEATURE_FUNCTIONS.keys(),
    related_trade_feature_prefix,
    trade_type_actual=True,
    trade_type_column=TRADE_TYPE_NEW_COLUMN,
    categorical_features_per_trade=CATEGORICAL_FEATURES_IN_HISTORY + CATEGORICAL_REFERENCE_FEATURES_TO_ADD,
)

Create column sets for different purposes.

Difference between `related_trades_features` and `past_related_trades_columns_opt` is that the latter has `related_last_trade_type` instead of `related_last_trade_type1` and `related_last_trade_type2`.

Difference between `past_trade_feature_groups_flattened` and `past_trades_columns_opt` is that the former has all the `<num>last_trade_type1` and `<num>last_trade_type2`.

In [None]:
# Features of the target trade itself. `sorted` (rather than `list`) makes the column order deterministic: the
# iteration order of a set of strings can change between interpreter sessions because of hash randomization, which
# would silently reorder the model's input columns from run to run. Assumes all feature names are strings.
target_trade_features = sorted(set(PREDICTORS_WITHOUT_LAST_TRADE_FEATURES + CATEGORICAL_REFERENCE_FEATURES_TO_ADD)) + TARGET

# Columns needed to build the working dataframe (still holds the two-column trade type encoding and the data processing features).
columns_to_select_to_create_dataframe = target_trade_features + past_trade_feature_groups_flattened + related_trades_features + DATA_PROCESSING_FEATURES
assert len(columns_to_select_to_create_dataframe) == len(set(columns_to_select_to_create_dataframe))    # checks that the groups of features do not overlap
# Columns that are actually fed to the LightGBM model.
columns_to_select_for_lightgbm_model = target_trade_features + past_trade_columns + past_related_trades_columns_opt
assert len(columns_to_select_for_lightgbm_model) == len(set(columns_to_select_for_lightgbm_model))    # checks that the groups of features do not overlap

# Sorted for the same reproducibility reason as `target_trade_features`.
target_trade_categorical_features = sorted(set(CATEGORICAL_FEATURES + CATEGORICAL_REFERENCE_FEATURES_TO_ADD))

categorical_features_for_lightgbm_model = target_trade_categorical_features + all_categorical_features_in_trade_history + all_categorical_features_in_trade_history_related
assert len(categorical_features_for_lightgbm_model) == len(set(categorical_features_for_lightgbm_model))    # checks that the groups of features do not overlap

print(f'Features used for LightGBM model: {sorted(columns_to_select_for_lightgbm_model)}')
print(f'Categorical features used for LightGBM model: {sorted(categorical_features_for_lightgbm_model)}')

Load info from previously run experiments.

In [None]:
# Start from an empty results dict and, if an earlier run pickled its results, resume from those.
related_trades_criterion_losses = {}
related_trades_criterion_losses_filepath = make_data_filename('related_trades_criterion_losses')
if os.path.exists(related_trades_criterion_losses_filepath):
    print(f'Loading losses from {related_trades_criterion_losses_filepath}')
    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
    with open(related_trades_criterion_losses_filepath, 'rb') as pickle_handle:
        related_trades_criterion_losses = pickle.load(pickle_handle)
    print(f'Already have loss results for: {list(related_trades_criterion_losses)}')

Create runner for experiment.

In [None]:
def get_lgb_error_for_each_related_trade_criterion(trade_data_dict, results_dict, results_dict_pickle_filepath, features, categorical_features):
    """Train one LightGBM model per related-trades criterion and record its train and test losses.

    Criteria whose name is already a key of `results_dict` are skipped, so an interrupted run can be resumed.
    `results_dict` is updated in place with `name -> (train loss, test loss)` and re-pickled to
    `results_dict_pickle_filepath` after every criterion. Nothing is returned.

    Relies on the notebook-level names `columns_to_select_to_create_dataframe`, `NUM_TRADES_IN_TRADE_HISTORY`,
    `NUM_TRADES_IN_RELATED_TRADE_HISTORY`, `TRADE_TYPE_NEW_COLUMN`, `SAME_CUSIP_PREFIX`,
    `related_trade_feature_prefix` and `DATE_TO_SPLIT`.

    Args:
        trade_data_dict: maps criterion name to the dataframe with that criterion's related trades appended.
        results_dict: maps criterion name to its (train loss, test loss) pair; mutated in place.
        results_dict_pickle_filepath: path that `results_dict` is pickled to.
        features: columns passed to the model.
        categorical_features: categorical feature names passed to `train_lightgbm_model`.
    """
    for name, df in tqdm(trade_data_dict.items()):
        if name in results_dict:
            continue    # results for this criterion already exist

        # convert trade_type1 and trade_type2 to trade_type with S, P, D for same CUSIP trades
        actual_trade_type_df, _, _ = convert_trade_type_encoding_to_actual(
            df[columns_to_select_to_create_dataframe],
            NUM_TRADES_IN_TRADE_HISTORY,
            TRADE_TYPE_NEW_COLUMN,
            SAME_CUSIP_PREFIX,
        )
        # convert trade_type1 and trade_type2 to trade_type with S, P, D for related trades
        actual_trade_type_df, _, _ = convert_trade_type_encoding_to_actual(
            actual_trade_type_df,
            NUM_TRADES_IN_RELATED_TRADE_HISTORY,
            TRADE_TYPE_NEW_COLUMN,
            related_trade_feature_prefix,
        )

        # split data into train and test set, then release the unsplit dataframe
        train_df, test_df = get_train_test_data_trade_datetime(actual_trade_type_df, DATE_TO_SPLIT)
        del actual_trade_type_df
        gc.collect()
        assert len(train_df) != 0 and len(test_df) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'

        print(f'Training the LightGBM model for {name}')
        _, lgb_losses = train_lightgbm_model(
            train_df[features],
            test_df[features],
            categorical_features,
            wandb_project='mitas_trade_history',
        )

        # release the split dataframes before moving on to the next criterion
        del train_df
        del test_df
        gc.collect()

        results_dict[name] = lgb_losses['Train'][0], lgb_losses['Test'][0]
        # persist after every criterion so completed results survive an interruption
        with open(results_dict_pickle_filepath, 'wb') as pickle_handle:
            pickle.dump(results_dict, pickle_handle, protocol=4)


In [None]:
# Run the experiment for every criterion using the full feature set (including the related trades' `lgb_error`).
get_lgb_error_for_each_related_trade_criterion(
    trade_data_dict=trade_data_flattened_trade_history_and_related_trades,
    results_dict=related_trades_criterion_losses,
    results_dict_pickle_filepath=related_trades_criterion_losses_filepath,
    features=columns_to_select_for_lightgbm_model,
    categorical_features=categorical_features_for_lightgbm_model,
)

In [None]:
# Report the losses of every criterion, then rank the criteria by test error.
for name, (train_loss, test_loss) in related_trades_criterion_losses.items():
    print(f'{name}\t\tTrain error: {train_loss}\tTest error: {test_loss}')

# each value is (train loss, test loss), so index 1 is the test error
related_trades_criterion_ascending_order_of_test_loss = sorted(
    related_trades_criterion_losses,
    key=lambda criterion_name: related_trades_criterion_losses[criterion_name][1],
)
# the optimal criterion is the one with the minimum test error
related_trades_criterion_opt = related_trades_criterion_ascending_order_of_test_loss[0]

Run identical experiments without using the `lgb_error` in the related trades.

In [None]:
# Start from an empty results dict and, if an earlier run pickled its results, resume from those.
related_trades_wo_lgb_error_criterion_losses = {}
related_trades_wo_lgb_error_criterion_losses_filepath = make_data_filename('related_trades_wo_lgb_error_criterion_losses')
if os.path.exists(related_trades_wo_lgb_error_criterion_losses_filepath):
    print(f'Loading losses from {related_trades_wo_lgb_error_criterion_losses_filepath}')
    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object
    with open(related_trades_wo_lgb_error_criterion_losses_filepath, 'rb') as pickle_handle:
        related_trades_wo_lgb_error_criterion_losses = pickle.load(pickle_handle)
    print(f'Already have loss results for: {list(related_trades_wo_lgb_error_criterion_losses)}')

In [None]:
# Same model columns as before, minus every column whose name ends with `lgb_error`.
columns_to_select_for_lightgbm_model_wo_lgb_error = [
    model_column
    for model_column in columns_to_select_for_lightgbm_model
    if not model_column.endswith('lgb_error')
]

In [None]:
# Run the same experiment for every criterion, this time without the `lgb_error` columns.
get_lgb_error_for_each_related_trade_criterion(
    trade_data_dict=trade_data_flattened_trade_history_and_related_trades,
    results_dict=related_trades_wo_lgb_error_criterion_losses,
    results_dict_pickle_filepath=related_trades_wo_lgb_error_criterion_losses_filepath,
    features=columns_to_select_for_lightgbm_model_wo_lgb_error,
    categorical_features=categorical_features_for_lightgbm_model,
)

In [None]:
# Inlined equivalent of calling `get_lgb_error_for_each_related_trade_criterion` for the experiments without
# `lgb_error`. Only criteria that are still missing from `related_trades_wo_lgb_error_criterion_losses` are trained.
for name, df in tqdm(trade_data_flattened_trade_history_and_related_trades.items()):
    if name not in related_trades_wo_lgb_error_criterion_losses:
        # convert trade_type1 and trade_type2 to trade_type with S, P, D for same CUSIP trades
        trade_data_predictors_history_related_trades_actual_trade_type, _, _ = convert_trade_type_encoding_to_actual(df[columns_to_select_to_create_dataframe], 
                                                                                                                     NUM_TRADES_IN_TRADE_HISTORY, 
                                                                                                                     TRADE_TYPE_NEW_COLUMN, 
                                                                                                                     SAME_CUSIP_PREFIX)
        # convert trade_type1 and trade_type2 to trade_type with S, P, D for related trades
        trade_data_predictors_history_related_trades_actual_trade_type, _, _ = convert_trade_type_encoding_to_actual(trade_data_predictors_history_related_trades_actual_trade_type, 
                                                                                                                     NUM_TRADES_IN_RELATED_TRADE_HISTORY, 
                                                                                                                     TRADE_TYPE_NEW_COLUMN, 
                                                                                                                     related_trade_feature_prefix)

        # split data into train and test set
        train_data_predictors_history_related_trades_actual_trade_type, \
            test_data_predictors_history_related_trades_actual_trade_type = get_train_test_data_trade_datetime(trade_data_predictors_history_related_trades_actual_trade_type, DATE_TO_SPLIT)
        del trade_data_predictors_history_related_trades_actual_trade_type
        gc.collect()
        assert len(train_data_predictors_history_related_trades_actual_trade_type) != 0 and len(test_data_predictors_history_related_trades_actual_trade_type) != 0, 'Either train or test data is empty. Consider checking how the train test split is being performed.'

        print(f'Training the LightGBM model for {name}')
        # use the column list that is actually defined in this notebook (`..._wo_lgb_error`)
        _, lgb_losses = train_lightgbm_model(train_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model_wo_lgb_error], 
                                             test_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model_wo_lgb_error], 
                                             categorical_features_for_lightgbm_model, 
                                             wandb_project='mitas_trade_history')

        del train_data_predictors_history_related_trades_actual_trade_type
        del test_data_predictors_history_related_trades_actual_trade_type
        gc.collect()

        # store the result in the dict (and pickle file) for the experiments WITHOUT `lgb_error`, i.e. the same dict the
        # skip check above reads, so the results of the experiments with `lgb_error` are never overwritten
        related_trades_wo_lgb_error_criterion_losses[name] = lgb_losses['Train'][0], lgb_losses['Test'][0]
        with open(related_trades_wo_lgb_error_criterion_losses_filepath, 'wb') as pickle_handle: pickle.dump(related_trades_wo_lgb_error_criterion_losses, pickle_handle, protocol=4)    # use template from https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict-or-any-other-python-object


In [None]:
# Report the losses of every criterion (without `lgb_error`), then rank the criteria by test error.
for name, (train_loss, test_loss) in related_trades_wo_lgb_error_criterion_losses.items():
    print(f'{name}\t\tTrain error: {train_loss}\tTest error: {test_loss}')

# each value is (train loss, test loss), so index 1 is the test error
related_trades_wo_lgb_error_criterion_ascending_order_of_test_loss = sorted(
    related_trades_wo_lgb_error_criterion_losses,
    key=lambda criterion_name: related_trades_wo_lgb_error_criterion_losses[criterion_name][1],
)
# the optimal criterion is the one with the minimum test error
related_trades_wo_lgb_error_criterion_opt = related_trades_wo_lgb_error_criterion_ascending_order_of_test_loss[0]

From here, we will only have one dataset; the one with the appended trades coming from `related_trades_criterion_opt`.

In [None]:
# Collapse the per-criterion dict down to the single dataframe of the optimal criterion.
# The guard keeps this cell re-runnable: after the first execution the name is already bound to that dataframe, and
# indexing a dataframe by the criterion name would fail.
if isinstance(trade_data_flattened_trade_history_and_related_trades, dict):
    trade_data_flattened_trade_history_and_related_trades = trade_data_flattened_trade_history_and_related_trades[related_trades_criterion_opt]

# Train / test split that keeps the original trade type encoding.
trade_data_predictors_history_related_trades = trade_data_flattened_trade_history_and_related_trades[columns_to_select_to_create_dataframe]
train_data_predictors_history_related_trades, test_data_predictors_history_related_trades = get_train_test_data_trade_datetime(trade_data_predictors_history_related_trades, DATE_TO_SPLIT)
train_data_predictors_history_related_trades = train_data_predictors_history_related_trades.drop(columns=DATA_PROCESSING_FEATURES)
test_data_predictors_history_related_trades = test_data_predictors_history_related_trades.drop(columns=DATA_PROCESSING_FEATURES)

# Convert the trade type encoding to the actual trade type, first for the same CUSIP trades and then for the related
# trades, and drop the old trade type columns reported by each conversion.
trade_data_predictors_history_related_trades_actual_trade_type, old_trade_type_columns, _ = convert_trade_type_encoding_to_actual(trade_data_predictors_history_related_trades, 
                                                                                                                                  NUM_TRADES_IN_TRADE_HISTORY, 
                                                                                                                                  TRADE_TYPE_NEW_COLUMN, 
                                                                                                                                  SAME_CUSIP_PREFIX)
trade_data_predictors_history_related_trades_actual_trade_type, old_trade_type_columns_related, _ = convert_trade_type_encoding_to_actual(trade_data_predictors_history_related_trades_actual_trade_type, 
                                                                                                                                          NUM_TRADES_IN_RELATED_TRADE_HISTORY, 
                                                                                                                                          TRADE_TYPE_NEW_COLUMN, 
                                                                                                                                          related_trade_feature_prefix)
trade_data_predictors_history_related_trades_actual_trade_type = trade_data_predictors_history_related_trades_actual_trade_type.drop(columns=old_trade_type_columns + old_trade_type_columns_related)

# Train / test split with the actual trade type columns.
train_data_predictors_history_related_trades_actual_trade_type, \
    test_data_predictors_history_related_trades_actual_trade_type = get_train_test_data_trade_datetime(trade_data_predictors_history_related_trades_actual_trade_type, DATE_TO_SPLIT)

train_data_predictors_history_related_trades_actual_trade_type = train_data_predictors_history_related_trades_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)
test_data_predictors_history_related_trades_actual_trade_type = test_data_predictors_history_related_trades_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)

# The data processing features are dropped from the unsplit dataframes only now, after both splits have been made from them.
trade_data_predictors_history_related_trades = trade_data_predictors_history_related_trades.drop(columns=DATA_PROCESSING_FEATURES)
trade_data_predictors_history_related_trades_actual_trade_type = trade_data_predictors_history_related_trades_actual_trade_type.drop(columns=DATA_PROCESSING_FEATURES)

In [None]:
# Train the LightGBM model on the dataset of the optimal criterion with the full feature set.
lgb_model, lgb_losses = train_lightgbm_model(
    train_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model],
    test_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model],
    categorical_features_for_lightgbm_model,
    wandb_project='mitas_trade_history',
)

In [None]:
# Feature importance of the full model, measured by gain.
lgb.plot_importance(lgb_model, importance_type='gain', figsize=(20, 10))

Train the LightGBM model without the `lgb_error` feature in the related trades.

In [None]:
# Train the LightGBM model on the same train / test split, restricted to the columns without `lgb_error`.
lgb_model_wo_lgb_error, lgb_losses_wo_lgb_error = train_lightgbm_model(
    train_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model_wo_lgb_error],
    test_data_predictors_history_related_trades_actual_trade_type[columns_to_select_for_lightgbm_model_wo_lgb_error],
    categorical_features_for_lightgbm_model,
    wandb_project='mitas_trade_history',
)

In [None]:
# Feature importance of the model trained without `lgb_error`, measured by gain.
lgb.plot_importance(lgb_model_wo_lgb_error, importance_type='gain', figsize=(20, 10))

The related trades feature `lgb_error` does not provide any gain. See the results in the spreadsheet: https://docs.google.com/spreadsheets/d/15Z97BDO6g1VEw-4iQ-zTA7_tp3thsGg9XJrby3gziKQ/edit#gid=0. 