In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import numpy as np
import pandas as pd

from ficc.utils.auxiliary_variables import SPECIAL_CONDITIONS
from ficc.utils.adding_flags import get_most_recent_index_and_others, SPECIAL_CONDITIONS_TO_FILTER_ON

import sys
sys.path.insert(0,'../../../../ficc/ml_models/sequence_predictors/')

from yield_spread_model_mitas.data_prep import get_datestring_from_filename, \
                                               replace_rating_with_standalone_rating, \
                                               remove_rows_with_feature_value

from rating_model_mitas.data_prep import read_processed_file_pickle

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Silence the following warning: `SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.`

In [2]:
# NOTE(review): this turns SettingWithCopyWarning off for the whole notebook, so a chained-indexing
# assignment (e.g. `df.loc[i]['col'] = v`) that writes to a temporary copy will fail silently.
pd.options.mode.chained_assignment = None

# Load the dataframe

In [3]:
%%time
# path (relative to this notebook) of the processed trade data to load
processed_file_pickle = '../data/processed_data_2022-06-13-19-19.pkl'
# datestring parsed from the filename; not used by any cell visible in this notebook section
processed_file_pickle_datestring = get_datestring_from_filename(processed_file_pickle)
# NOTE(review): the helper name suggests the file is unpickled — only point this at trusted files
trade_data = read_processed_file_pickle(processed_file_pickle)
# ensure that all column names are unique (iterating over a DataFrame yields its column labels)
assert len(trade_data.columns) == len(set(trade_data)), 'Not all column names are unique'

START: Reading from processed file at ../data/processed_data_2022-06-13-19-19.pkl
END: Reading from processed file at ../data/processed_data_2022-06-13-19-19.pkl
CPU times: user 2.53 s, sys: 1.64 s, total: 4.17 s
Wall time: 5.45 s


## Exclusions

In [4]:
# Exclusion filters, applied one after another; each step only sees the rows kept by the previous one.
# NOTE(review): the thresholds are log10 values, so the day-count columns are presumably stored
# log10-transformed — confirm against the data preparation code.
trade_data = (
    trade_data
    .loc[lambda d: (d.days_to_call == 0) | (d.days_to_call > np.log10(400))]
    .loc[lambda d: (d.days_to_refund == 0) | (d.days_to_refund > np.log10(400))]
    .loc[lambda d: d.days_to_maturity < np.log10(30000)]
    .loc[lambda d: d.sinking == False]    # elementwise comparison on the column, not an identity check
    .loc[lambda d: d.incorporated_state_code != 'VI']
    .loc[lambda d: d.incorporated_state_code != 'GU']
    .loc[lambda d: d.coupon_type == 8]
    .loc[lambda d: d.is_called == False]
)

In [5]:
# (feature, values to exclude) pairs, processed in the order listed
feature_value_exclusions = [
    # restructured bonds and high chance of default bonds are removed
    ('purpose_sub_class', [6, 20, 21, 22, 44, 57, 90, 106]),
    # pre-refunded bonds and partially refunded bonds are removed
    ('called_redemption_type', [18, 19]),
]
for feature_name, excluded_values in feature_value_exclusions:
    trade_data = remove_rows_with_feature_value(trade_data, feature_name, excluded_values)

(df["purpose_sub_class"] != 6) & (df["purpose_sub_class"] != 20) & (df["purpose_sub_class"] != 21) & (df["purpose_sub_class"] != 22) & (df["purpose_sub_class"] != 44) & (df["purpose_sub_class"] != 57) & (df["purpose_sub_class"] != 90) & (df["purpose_sub_class"] != 106)
39566 rows had purpose_sub_class in [6, 20, 21, 22, 44, 57, 90, 106] and were removed
(df["called_redemption_type"] != 18) & (df["called_redemption_type"] != 19)
11590 rows had called_redemption_type in [18, 19] and were removed


In [6]:
# NOTE(review): helper semantics are not visible here — per its name it substitutes the standalone rating for the rating
trade_data = replace_rating_with_standalone_rating(trade_data)
# snapshot taken before any trades are combined or removed; used at the end to report how many trades were dropped
trade_data_original = trade_data.copy()

In [7]:
# baseline row count after the exclusions, before any trades are combined or removed
print(f'Number of trades: {len(trade_data)}')

Number of trades: 2401047


In [8]:
# Columns that describe a trade for the purpose of combining / removing trades.
FEATURES_OF_INTEREST = ['cusip', 'quantity', 'dollar_price', 'trade_datetime', 'trade_type']
# Columns that identify a trade.
IDENTIFIERS = ['cusip', 'rtrs_control_number']
# De-duplicate while keeping first-seen order (dicts preserve insertion order). Going through a `set`
# would make the column order of any frame selected with this list vary between kernel sessions.
ALL_IMPORTANT_FEATURES = list(dict.fromkeys(FEATURES_OF_INTEREST + IDENTIFIERS + SPECIAL_CONDITIONS))

Removing unnecessary columns speeds up the upcoming `groupby` operation.

In [9]:
# Keep only the columns needed for grouping, and replace the NaN value of `brokers_broker` with 'none'
# so that we can use groupby (groupby doesn't work for NaN even with the dropna flag).
trade_data_select_features = trade_data[ALL_IMPORTANT_FEATURES].assign(
    brokers_broker=lambda selected: selected['brokers_broker'].astype('string').fillna('none')
)

# Combining and removing trades
Currently, we only combine trades with, or remove trades based on, other trades that occur on the same day. We treat two trades as occurring *around* the same time if they occur on the same day.
## Combining trades
1. If two or more non-inter-dealer trades (occurring at around the same time) have the same price and direction, and the same flags, they can be combined together. We combine the group of trades by updating the quantity of the most recent trade to the sum of the quantities in the group, and removing all but the most recent trade. Note that we can still combine trades even if there are other trades (for the same CUSIP) in between the trades that we are trying to combine.
## Removing trades
1. If two or more inter-dealer trades (occurring at around the same time) have the same price and the same flags, only the most recent one can be kept since this can be interpreted as a series of trades between related legal entities.
2. If two trades (occurring at around the same time) have the same price and same trade amount, and one of the trades is an inter-dealer trade, and the other one is not and has a non-transaction-based compensation flag, then only the inter-dealer trade is the meaningful trade.
3. If two trades (occurring at around the same time) have the same price and same trade amount, and one of the trades is an inter-dealer trade, and the other trade is not and does not have a non-transaction-based compensation flag, then only the other trade is the meaningful trade.

In [10]:
%%time
# Group trades that share the calendar day, quantity, price, CUSIP, trade type and all filter flags.
same_day_flag_keys = [pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'] + SPECIAL_CONDITIONS_TO_FILTER_ON
groups_same_day_quantity_price_cusip_tradetype_flags = {
    group_key: group_df
    for group_key, group_df in trade_data_select_features.groupby(same_day_flag_keys)
    if len(group_df) > 1    # removes groups with only 1 item since this is not really a group
}
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags)}')

Number of groups: 115504
CPU times: user 1min 3s, sys: 885 ms, total: 1min 4s
Wall time: 1min 4s


In [11]:
# Keep only the groups whose trades are all of type 'S' or all of type 'P', i.e. no inter-dealer ('D') trades.
groups_same_day_quantity_price_cusip_tradetype_flags_nonDD = {
    group_key: group_df
    for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags.items()
    if set(group_df['trade_type']) in ({'S'}, {'P'})
}
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_nonDD)}')

Number of groups: 27375


In [12]:
def combine_trades(df, group_df):
    """Fold a group of trades into a single trade.

    The trade selected by `get_most_recent_index_and_others` has its `quantity` in `df` overwritten
    with the total quantity of the whole group; the index labels of the other trades in the group
    are returned so that the caller can drop them.

    Args:
        df: DataFrame containing the trades of `group_df` under the same index labels. Its
            `quantity` column is modified in place.
        group_df: DataFrame with the trades to combine; `quantity` is log10-transformed.

    Returns:
        Tuple `(df, indices_to_remove)`, where `df` is the same (mutated) object that was passed in
        and `indices_to_remove` are the index labels of the trades that were folded into the kept trade.
    """
    most_recent_trade_index, indices_to_remove = get_most_recent_index_and_others(group_df)
    new_total_quantity = np.log10(sum(10 ** group_df['quantity']))    # undo log10 transformation before sum and reapply log10 transformation after sum
    # A single `.loc[row, column]` call is required here: the chained form `df.loc[row]['quantity'] = ...`
    # assigns into a temporary row copy and leaves `df` unchanged.
    df.loc[most_recent_trade_index, 'quantity'] = new_total_quantity
    return df, indices_to_remove

In [13]:
%%time
# Combine every non-inter-dealer group into one trade each, collecting the now-redundant trades as we go.
all_indices_to_remove = []
for same_direction_group in groups_same_day_quantity_price_cusip_tradetype_flags_nonDD.values():
    # does it matter if there are trades in between two trades that are being combined? maybe not from discussion on CUSIP 74265LS66
    trade_data, folded_indices = combine_trades(trade_data, same_direction_group)
    all_indices_to_remove += folded_indices
trade_data = trade_data.drop(all_indices_to_remove)
print(f'Number of trades: {len(trade_data)}')

Number of trades: 2361807
CPU times: user 4min, sys: 1.13 s, total: 4min 1s
Wall time: 4min 1s


In [14]:
# Keep only the groups that consist entirely of inter-dealer ('D') trades.
groups_same_day_quantity_price_cusip_tradetype_flags_DD = {}
for flags_group_key, flags_group_df in groups_same_day_quantity_price_cusip_tradetype_flags.items():
    if {'D'} == set(flags_group_df['trade_type']):
        groups_same_day_quantity_price_cusip_tradetype_flags_DD[flags_group_key] = flags_group_df
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_DD)}')

Number of groups: 88129


In [15]:
%%time
# For every inter-dealer group, drop all trades except the one the helper reports as most recent.
all_indices_to_remove = []
for interdealer_group in groups_same_day_quantity_price_cusip_tradetype_flags_DD.values():
    older_trade_indices = get_most_recent_index_and_others(interdealer_group, True)[1]
    all_indices_to_remove += older_trade_indices
trade_data = trade_data.drop(all_indices_to_remove)
print(f'Number of trades: {len(trade_data)}')

Number of trades: 2261999
CPU times: user 13.9 s, sys: 1.21 s, total: 15.1 s
Wall time: 15.7 s


In [16]:
%%time
# Group by calendar day, quantity, price and CUSIP only (no trade type, no flags), so that a group can
# contain trades of different types.
# NOTE(review): the groups are built from `trade_data_select_features`, which was created before the
# combine / remove steps above and is not updated by them — confirm that grouping on the pre-removal
# snapshot (rather than on the current `trade_data`) is intended.
groups_same_day_quantity_price_cusip = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip'])
groups_same_day_quantity_price_cusip = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip)}')

Number of groups: 333637
CPU times: user 58.4 s, sys: 1.02 s, total: 59.4 s
Wall time: 59.3 s


In [17]:
def two_trades_only_one_is_dd(df):
    """Check whether `df` has two trades where only one is an inter-dealer trade.

    True when `df` has exactly two rows, one of which has `trade_type` 'D', and the two rows have
    different `trade_type` values.
    """
    if len(df) != 2:
        return False
    distinct_trade_types = set(df['trade_type'])
    return 'D' in distinct_trade_types and len(distinct_trade_types) == 2
# Keep only the two-trade groups in which exactly one trade is inter-dealer.
groups_same_day_quantity_price_cusip_dd_2 = dict(
    key_and_group
    for key_and_group in groups_same_day_quantity_price_cusip.items()
    if two_trades_only_one_is_dd(key_and_group[1])
)
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_dd_2)}')

Number of groups: 213430


In [18]:
%%time
# For each (inter-dealer, other) pair keep exactly one trade: the inter-dealer trade when the other
# trade carries the non-transaction-based compensation flag, and the other trade when it does not.
all_indices_to_remove = []
for trade_pair in groups_same_day_quantity_price_cusip_dd_2.values():
    pair_trade_types = trade_pair['trade_type']
    interdealer_trade_index = trade_pair[pair_trade_types == 'D'].index[0]
    other_trade = trade_pair[pair_trade_types != 'D']
    # .values[0] isolates the value for this trade
    other_is_ntbc = other_trade['is_non_transaction_based_compensation'].values[0]
    redundant_index = other_trade.index[0] if other_is_ntbc else interdealer_trade_index
    all_indices_to_remove.append(redundant_index)
trade_data = trade_data.drop(all_indices_to_remove)
print(f'Number of trades: {len(trade_data)}')

Number of trades: 2048569
CPU times: user 1min 8s, sys: 2.05 s, total: 1min 10s
Wall time: 1min 10s


In [19]:
# Summary: compare the current row count against the snapshot taken before combining / removing trades.
print(f'Number of trades in original dataframe: {len(trade_data_original)}')
print(f'Number of trades after combining and removing trades: {len(trade_data)}')
print(f'Number of trades removed: {len(trade_data_original) - len(trade_data)}')

Number of trades in original dataframe: 2401047
Number of trades after combining and removing trades: 2048569
Number of trades removed: 352478


# Experiment 1: Grouping with flags
*Question*: Does filtering by trade flags (i.e., special condition indicators) greatly reduce the number of groups of trades to combine?

*Hypothesis*: No; I believe that the flags are not used very much. Furthermore, I believe that trades that should be combined should have all the same flags.

*Results*: When adding in the flags `is_non_transaction_based_compensation` and `is_lop_or_takedown`, there is minimal reduction of groups (goes from 116,663 to 116,194). When adding in the `is_alternative_trading_system` flag, the number of groups drops by about 40% (goes from 116,194 to 73,373). When adding in the `brokers_broker` flag, since most values are `NaN`, this causes the `groupby` command to ignore these groups, and the groups drop dramatically. The `dropna` argument does not work in `groupby` which is a bug in pandas (this was supposedly fixed in more recent versions of pandas, but it still doesn't work for me even with the most up to date version of 1.4.3). 

*Conclusions*: By including certain flags, the number of groups does drop dramatically, so my hypothesis as a whole was incorrect.

*Future work*: This raises the following two questions: (1) can we group trades where one of them was done with the `is_alternative_trading_system` flag and one without?, and (2) what is the best way to handle `NaN` values for `brokers_broker`, i.e., what does `NaN` mean for `brokers_broker`?

In [None]:
%%time
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'])
groups_same_day_quantity_price_cusip_tradetype_flags_test = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_test)}')

In [None]:
%%time
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'] + ['is_non_transaction_based_compensation'])
groups_same_day_quantity_price_cusip_tradetype_flags_test = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_test)}')

In [None]:
%%time
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'] + ['is_non_transaction_based_compensation', 'is_lop_or_takedown'])
groups_same_day_quantity_price_cusip_tradetype_flags_test = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_test)}')

In [None]:
%%time
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'] + ['is_non_transaction_based_compensation', 'is_lop_or_takedown', 'brokers_broker'])
groups_same_day_quantity_price_cusip_tradetype_flags_test = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_test)}')

In [None]:
%%time
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'] + ['is_non_transaction_based_compensation', 'is_lop_or_takedown', 'is_alternative_trading_system'])
groups_same_day_quantity_price_cusip_tradetype_flags_test = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_test)}')

In [None]:
%%time
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'] + ['brokers_broker'])
groups_same_day_quantity_price_cusip_tradetype_flags_test = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_test)}')

In [None]:
%%time
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'quantity', 'dollar_price', 'cusip', 'trade_type'] + ['brokers_broker'], dropna=False)
groups_same_day_quantity_price_cusip_tradetype_flags_test = {group_key: group_df for group_key, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test if len(group_df) > 1}    # removes groups with only 1 item since this is not really a group
print(f'Number of groups: {len(groups_same_day_quantity_price_cusip_tradetype_flags_test)}')

In [None]:
# List the group keys produced when grouping on `brokers_broker` alone with `dropna=False`.
# NOTE: this reuses the `..._flags_test` name for a GroupBy object rather than a dict of groups.
groups_same_day_quantity_price_cusip_tradetype_flags_test = trade_data_select_features.groupby(['brokers_broker'], dropna=False)
for group, group_df in groups_same_day_quantity_price_cusip_tradetype_flags_test:
    print(group)

In [None]:
# Distribution of each flag column, counting missing values as their own category.
for flag_column in ('brokers_broker', 'is_alternative_trading_system', 'is_lop_or_takedown', 'is_non_transaction_based_compensation'):
    print(trade_data_select_features[flag_column].value_counts(dropna=False))

# Old code / testing / sanity checks

In [None]:
# toy frame for trying out the combine logic: rows 0 and 1 share the same ('a', 'd') values
df = pd.DataFrame({'a': [1, 1, 3], 'b': [1, 5, 6], 'c': [7, 8, 6], 'd': [1, 1, 3]})
df

In [None]:
# Toy version of the combine step: within each ('a', 'd') group, the last row receives the sum of
# column 'c' over the group and the other rows of the group are dropped.
groups = df.groupby(['a', 'd'])
all_indices_to_remove = []
for group, group_df in groups:
    indices_to_remove = group_df.index.to_list()
    most_recent_trade_index = indices_to_remove.pop()    # last row of the group plays the role of the most recent trade
    print(most_recent_trade_index)
    print(df)
    # single `.loc[row, column]` call; the chained form `df.loc[row]['c'] = ...` is not guaranteed to
    # write into `df` (it may assign into a temporary copy of the row)
    df.loc[most_recent_trade_index, 'c'] = sum(group_df['c'])
    all_indices_to_remove.extend(indices_to_remove)
df = df.drop(all_indices_to_remove)
df

In [None]:
# small frame with string index labels, used to check how `drop` behaves with a list of labels
df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], index=['cobra', 'viper', 'sidewinder'], columns=['max_speed', 'shield'])
df

In [None]:
# dropping a list of index labels removes those rows; only 'cobra' remains
df = df.drop(['viper', 'sidewinder'])
df

In [None]:
# sanity check: there should be no trades that are dealer-dealer and also have the
# `is_non_transaction_based_compensation` flag, so this selection is expected to be empty
trade_data[trade_data['is_non_transaction_based_compensation'] & (trade_data['trade_type'] == 'D')]