In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd

from ficc.utils.auxiliary_variables import IS_REPLICA, IS_BOOKKEEPING, IS_SAME_DAY, NTBC_PRECURSOR, REPLICA_COUNT
from ficc.utils.adding_flags import add_replica_flag, add_bookkeeping_flag, add_same_day_flag, add_ntbc_precursor_flag, add_replica_count_flag

import sys
sys.path.insert(0,'../../../../ficc/ml_models/sequence_predictors/')

from yield_spread_model_mitas.data_prep import replace_rating_with_standalone_rating, \
                                               remove_rows_with_feature_value

from rating_model_mitas.data_prep import read_processed_file_pickle

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
from datetime import datetime    # used in synthetic examples

In [None]:
%%time
# Base name (no directory, no extension) of the processed trade-data pickle.
filename = 'processed_data_ficc_ycl_2021-12-31-23-59'


def filename_with_file_location(filename):
    """Return the relative path of the pickle called `filename` inside `../data/`."""
    return f'../data/{filename}.pkl'


# Path of the cached pickle that already carries every flag computed in this notebook.
processed_file_exclusions_all_flags_pickle = filename_with_file_location(
    filename + '_exclusions_replica_bookkeeping_sameday_ntbcprecursor'
)

# NOTE(review): the next cell tests `trade_data is None`, so this reader presumably
# returns None when the file does not exist — confirm.
trade_data = read_processed_file_pickle(processed_file_exclusions_all_flags_pickle)

In [None]:
%%time
if trade_data is None:
    # No cached, fully flagged pickle was loaded: start again from the unflagged processed file.
    processed_file_pickle = filename_with_file_location(filename)
    trade_data = read_processed_file_pickle(processed_file_pickle)
    # ensure that all column names are unique
    assert len(trade_data.columns) == len(set(trade_data)), 'Not all column names are unique'

    # NOTE(review): thresholds are expressed as log10(days), so the day-count columns are
    # presumably stored log10-transformed — confirm against the data-prep code.
    log_400_days = np.log10(400)
    log_30000_days = np.log10(30000)

    # Row filters, applied one after another in this order; each returns the rows to keep.
    row_filters = [
        lambda frame: (frame['days_to_call'] == 0) | (frame['days_to_call'] > log_400_days),
        lambda frame: (frame['days_to_refund'] == 0) | (frame['days_to_refund'] > log_400_days),
        lambda frame: frame['days_to_maturity'] < log_30000_days,
        lambda frame: frame['sinking'] == False,
        lambda frame: frame['incorporated_state_code'] != 'VI',
        lambda frame: frame['incorporated_state_code'] != 'GU',
        lambda frame: frame['coupon_type'] == 8,
        lambda frame: frame['is_called'] == False,
    ]
    for keep_rows in row_filters:
        trade_data = trade_data[keep_rows(trade_data)]

    # restructured bonds and high chance of default bonds are removed
    trade_data = remove_rows_with_feature_value(trade_data, 'purpose_sub_class', [6, 20, 21, 22, 44, 57, 90, 106])
    # pre-refunded bonds and partially refunded bonds are removed
    trade_data = remove_rows_with_feature_value(trade_data, 'called_redemption_type', [18, 19])
    trade_data = replace_rating_with_standalone_rating(trade_data)

In [None]:
# Independent snapshot of the loaded data, taken before the flag functions below are applied
# in this session; the "Explorations" section at the end works on this copy.
trade_data_original = trade_data.copy()

In [None]:
# Take the extremes of the column instead of the last/first row, so the reported range
# is correct regardless of how the rows happen to be ordered.
trade_datetimes = trade_data['trade_datetime']
oldest_trade_datetime = trade_datetimes.min()
newest_trade_datetime = trade_datetimes.max()
print(f'Oldest trade datetime: {oldest_trade_datetime}.'
      f'    Newest trade datetime: {newest_trade_datetime}.'
      f'    Gap: {newest_trade_datetime - oldest_trade_datetime}')

In [None]:
# Row count before any flag function is applied in this session.
n_trades = len(trade_data)
print(f'Number of trades: {n_trades}')

# Replica flag
Mark a trade as a replica if there is any other trade on the same day with the same price, same direction, and same par_traded. These trades are marked so that they can be excluded from the trade history: they are probably being sold in the same block, so keeping all of them would make the trade history less economically meaningful.

In [None]:
%%time
# Adds the IS_REPLICA column that the next cell summarises.
# NOTE(review): `trade_data` is overwritten, so re-running this cell feeds already-flagged data
# back in — presumably harmless, confirm.
trade_data = add_replica_flag(trade_data)

In [None]:
# Share of trades carrying the replica flag.
n_trades = len(trade_data)
n_replica = trade_data[IS_REPLICA].sum()
print(f'Number of trades: {n_trades}')
print(f'Number of trades marked with the {IS_REPLICA} flag: {n_replica}')
print(f'Percentage of trades marked with the {IS_REPLICA} flag: {n_replica / n_trades * 100} %')

# Replica count flag
This numerical flag denotes the number of trades with the same trade_date, cusip, quantity, dollar_price, and trade_type that occur before the trade. The intuition here is that if a trade has a large replica count, then it can be downweighted when training the model, since the trade has been seen before.

## Real data

In [None]:
%%time
# Adds the numeric REPLICA_COUNT column that the next cell summarises.
trade_data = add_replica_count_flag(trade_data)

In [None]:
# Share of trades with at least one replica, followed by the full distribution of counts.
n_trades = len(trade_data)
replica_counts = trade_data[REPLICA_COUNT]
n_with_replicas = (replica_counts > 0).sum()
print(f'Number of trades: {n_trades}')
print(f'Number of trades marked with the {REPLICA_COUNT} flag: {n_with_replicas}')
print(f'Percentage of trades marked with the {REPLICA_COUNT} flag: {n_with_replicas / n_trades * 100} %')
print(f'Value counts of {REPLICA_COUNT} trades:')
replica_counts.value_counts()

## Synthetic data

In [None]:
# Synthetic case: two trades identical in date, price, quantity, type and CUSIP.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 2,
    'dollar_price': [110.09] * 2,
    'quantity': [100] * 2,
    'trade_type': ['S'] * 2,
    'cusip': [1] * 2,
})
df

In [None]:
# Expected: row 0 counts one identical trade, row 1 counts none.
# NOTE(review): this implies rows are ordered newest-first — confirm.
df = add_replica_count_flag(df)
replica_counts = df[REPLICA_COUNT].values
assert np.array_equal(replica_counts, [1, 0]), replica_counts
df

In [None]:
# Synthetic case: three trades identical in date, price, quantity, type and CUSIP.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 3,
    'dollar_price': [110.09] * 3,
    'quantity': [100] * 3,
    'trade_type': ['S'] * 3,
    'cusip': [1] * 3,
})
df

In [None]:
# Expected: the count decreases by one per row, ending at zero for the last row.
df = add_replica_count_flag(df)
replica_counts = df[REPLICA_COUNT].values
assert np.array_equal(replica_counts, [2, 1, 0]), replica_counts
df

In [None]:
# Synthetic case: four trades where only row 1 differs (by price) from the other three.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 4,
    'dollar_price': [110.09, 111.09, 110.09, 110.09],
    'quantity': [100] * 4,
    'trade_type': ['S'] * 4,
    'cusip': [1] * 4,
})
df

In [None]:
# Expected: the differently priced row 1 has no replicas; the other three count down 2, 1, 0.
df = add_replica_count_flag(df)
replica_counts = df[REPLICA_COUNT].values
assert np.array_equal(replica_counts, [2, 0, 1, 0]), replica_counts
df

# Bookkeeping flag
We define an inter-dealer trade as *bookkeeping* if there are multiple inter-dealer trades of the same par_traded at the same price for a particular day. The intuition here is that this bond is moving from desk to desk. We mark all of the trades in this group as *bookkeeping*. This makes the trades marked with the *bookkeeping* flag a subset of those marked with the *replica* flag, where the *bookkeeping* flag has an additional condition of being an inter-dealer trade.

Old (and not done anymore): All but the most recent one in this group are marked as *bookkeeping*. The choice of not marking the most recent one, as opposed to not marking the oldest one or marking all of the trades in the group, was made arbitrarily, with the intuition that one of the trades in this sequence of inter-dealer trades must not be *bookkeeping*.

In [None]:
%%time
# Adds the IS_BOOKKEEPING column that the next cell summarises.
trade_data = add_bookkeeping_flag(trade_data)

In [None]:
# Share of bookkeeping trades, relative to all trades and to inter-dealer ('D') trades only.
n_trades = len(trade_data)
n_bookkeeping = trade_data[IS_BOOKKEEPING].sum()
n_inter_dealer = (trade_data["trade_type"] == "D").sum()
print(f'Number of trades: {n_trades}')
print(f'Number of trades marked with the {IS_BOOKKEEPING} flag: {n_bookkeeping}')
print(f'Percentage of trades marked with the {IS_BOOKKEEPING} flag: {n_bookkeeping / n_trades * 100} %')
print(f'Number of inter-dealer trades: {n_inter_dealer}')
print(f'Percentage of inter-dealer trades marked with the {IS_BOOKKEEPING} flag: {n_bookkeeping / n_inter_dealer * 100} %')

# NTBC precursor flag
This flag denotes an inter-dealer trade that occurs on the same day as a non-transaction-based-compensation (NTBC) customer trade with the same price and par_traded. The idea for marking it is that this inter-dealer trade may not be genuine (i.e., window-dressing). Note that we allow a buffer of occurring on the same day, rather than at the exact same time, since we see examples in the data (e.g., cusip 549696RS3, trade_datetime 2022-04-01) where the corresponding inter-dealer trade occurs 4 seconds before the customer bought trade.

## Real data

In [None]:
%%time
# Adds the NTBC_PRECURSOR column that the next cell summarises.
# NOTE(review): `use_parallel_apply=False` presumably disables the pandarallel code path — confirm.
trade_data = add_ntbc_precursor_flag(trade_data, use_parallel_apply=False)

In [None]:
# Share of NTBC-precursor trades, relative to all trades and to inter-dealer ('D') trades only.
n_trades = len(trade_data)
n_precursors = trade_data[NTBC_PRECURSOR].sum()
n_inter_dealer = (trade_data["trade_type"] == "D").sum()
print(f'Number of trades: {n_trades}')
print(f'Number of trades marked with the {NTBC_PRECURSOR} flag: {n_precursors}')
print(f'Percentage of trades marked with the {NTBC_PRECURSOR} flag: {n_precursors / n_trades * 100} %')
print(f'Number of inter-dealer trades: {n_inter_dealer}')
print(f'Percentage of inter-dealer trades marked with the {NTBC_PRECURSOR} flag: {n_precursors / n_inter_dealer * 100} %')

## Synthetic examples

In [None]:
# Synthetic case: a customer trade and an inter-dealer trade with matching price and quantity,
# but neither is marked as non-transaction-based compensation.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 2,
    'dollar_price': [110.09] * 2,
    'quantity': [100] * 2,
    'trade_type': ['S', 'D'],
    'cusip': [1] * 2,
    'rtrs_control_number': [1] * 2,
    'is_non_transaction_based_compensation': [False] * 2,
})
df

In [None]:
# Expected: with no NTBC customer trade present, nothing is flagged.
df = add_ntbc_precursor_flag(df, use_parallel_apply=False)
precursor_flags = df[NTBC_PRECURSOR].values
assert np.array_equal(precursor_flags, [False, False]), precursor_flags
df

In [None]:
# Synthetic case: an NTBC customer purchase and an inter-dealer trade with the same date,
# price and quantity.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 2,
    'dollar_price': [110.09] * 2,
    'quantity': [100] * 2,
    'trade_type': ['P', 'D'],
    'cusip': [1] * 2,
    'rtrs_control_number': [1] * 2,
    'is_non_transaction_based_compensation': [True, False],
})
df

In [None]:
# Expected: only the inter-dealer trade (row 1) is flagged as the precursor.
df = add_ntbc_precursor_flag(df, use_parallel_apply=False)
precursor_flags = df[NTBC_PRECURSOR].values
assert np.array_equal(precursor_flags, [False, True]), precursor_flags
df

In [None]:
# NOTE(review): this example has exactly the same values as the preceding synthetic example
# (NTBC customer purchase + matching inter-dealer trade); it may have been meant to vary
# something — confirm or remove.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 2,
    'dollar_price': [110.09] * 2,
    'quantity': [100] * 2,
    'trade_type': ['P', 'D'],
    'cusip': [1] * 2,
    'rtrs_control_number': [1] * 2,
    'is_non_transaction_based_compensation': [True, False],
})
df

In [None]:
# Expected: only the inter-dealer trade (row 1) is flagged as the precursor.
df = add_ntbc_precursor_flag(df, use_parallel_apply=False)
precursor_flags = df[NTBC_PRECURSOR].values
assert np.array_equal(precursor_flags, [False, True]), precursor_flags
df

In [None]:
# Synthetic case: an NTBC customer purchase and an inter-dealer trade whose prices differ.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 2,
    'dollar_price': [100.09, 110.09],
    'quantity': [100] * 2,
    'trade_type': ['P', 'D'],
    'cusip': [1] * 2,
    'rtrs_control_number': [1] * 2,
    'is_non_transaction_based_compensation': [True, False],
})
df

In [None]:
# Expected: the prices do not match, so the inter-dealer trade is not flagged.
df = add_ntbc_precursor_flag(df, use_parallel_apply=False)
precursor_flags = df[NTBC_PRECURSOR].values
assert np.array_equal(precursor_flags, [False, False]), precursor_flags
df

In [None]:
# Synthetic case: an NTBC customer sell and an inter-dealer trade at the same price and
# quantity, plus a non-NTBC customer purchase at a different price.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 3,
    'dollar_price': [110.09, 110.09, 109.09],
    'quantity': [100] * 3,
    'trade_type': ['S', 'D', 'P'],
    'cusip': [1] * 3,
    'rtrs_control_number': [1] * 3,
    'is_non_transaction_based_compensation': [True, False, False],
})
df

In [None]:
# Expected: only the inter-dealer trade (row 1) is flagged.
df = add_ntbc_precursor_flag(df, use_parallel_apply=False)
precursor_flags = df[NTBC_PRECURSOR].values
assert np.array_equal(precursor_flags, [False, True, False]), precursor_flags
df

# Same day flag
This flag denotes a trade where the dealer had everything lined up beforehand, since it all occurred on the same day. Intuitively, since the dealer did not have to hold the bond overnight, we claim that the dealer did not take any risk. Risk is an important part of the price, since if the dealer did not have to take a risk, then the dealer may be more willing to buy it at a lower price or sell it at a higher price. Our logic for identifying trades that are *same day* is as follows:
1. A group of dealer sell trades is considered *same day* if the total cost of the dealer purchase trades for that day is equal to or greater than the total cost of the dealer sell trades. In this case, a group of dealer purchase trades is considered *same day* if there is a continuous (continuous defined as a dealer purchase trade not skipped over chronologically) sequence of dealer purchase trades whose total cost equals the total cost of the dealer sell trades. We assume this sequence of dealer purchase trades includes the first dealer purchase trade of the day and/or the last dealer purchase trade of the day. We may expand these criteria so that the sequence does not have to include the first and/or last dealer purchase trade.
2. An inter-dealer trade is considered *same day* if the par_traded is equal to the total cost of the dealer sell trades for that day and if the total cost of the dealer purchase trades for that day is greater than or equal to the total cost of the dealer sell trades.

## Real data

In [None]:
%%time
# Adds the IS_SAME_DAY column that the next cell summarises.
# NOTE(review): `use_parallel_apply=False` presumably disables the pandarallel code path — confirm.
trade_data = add_same_day_flag(trade_data, use_parallel_apply=False)

In [None]:
# Share of trades carrying the same-day flag.
n_trades = len(trade_data)
n_same_day = trade_data[IS_SAME_DAY].sum()
print(f'Number of trades: {n_trades}')
print(f'Number of trades marked with the {IS_SAME_DAY} flag: {n_same_day}')
print(f'Percentage of trades marked with the {IS_SAME_DAY} flag: {n_same_day / n_trades * 100} %')

## Synthetic examples
A trade type of `D` corresponds to an inter-dealer trade. A trade type of `S` corresponds to a dealer sell trade. A trade type of `P` corresponds to a dealer purchase trade.

In [9]:
# Synthetic case: one dealer sell and one dealer purchase of equal par on the same day.
# NOTE(review): the same-day examples use `price`/`par_traded`, whereas the replica-count and
# NTBC examples in this notebook use `dollar_price`/`quantity` — confirm which column names
# `add_same_day_flag` currently expects.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 2,
    'price': [110.09, 109.09],
    'par_traded': [100] * 2,
    'trade_type': ['S', 'P'],
    'cusip': [1] * 2,
})
df

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Unnamed: 0,trade_date,price,par_traded,trade_type,cusip
0,2022-07-14,110.09,100,S,1
1,2022-07-14,109.09,100,P,1


In [17]:
# Expected: both the sell and the purchase are flagged as same day.
df = add_same_day_flag(df, use_parallel_apply=False)
same_day_flags = df[IS_SAME_DAY].values
assert np.array_equal(same_day_flags, [True, True])
df

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Unnamed: 0,trade_date,price,par_traded,trade_type,cusip,is_same_day
0,2022-07-14,110.09,100.0,S,1,True
1,2022-07-14,109.09,100.0,P,1,True


In [18]:
# Synthetic case: one dealer sell, one inter-dealer trade and one dealer purchase, all of par 100.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 3,
    'price': [111.09, 110.09, 109.09],
    'par_traded': [100] * 3,
    'trade_type': ['S', 'D', 'P'],
    'cusip': [1] * 3,
})
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip
0,2022-07-14,111.09,100,S,1
1,2022-07-14,110.09,100,D,1
2,2022-07-14,109.09,100,P,1


In [19]:
# Expected: the sell and the purchase are flagged, the inter-dealer trade is not.
# NOTE(review): rule 2 in the section text reads as if an inter-dealer trade whose par equals
# the total dealer sells (100 here) would be flagged; the recorded result is False — confirm
# whether the text or the implementation reflects the intended rule.
df = add_same_day_flag(df, use_parallel_apply=False)
same_day_flags = df[IS_SAME_DAY].values
assert np.array_equal(same_day_flags, [True, False, True])
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip,is_same_day
0,2022-07-14,111.09,100.0,S,1,True
1,2022-07-14,110.09,100.0,D,1,False
2,2022-07-14,109.09,100.0,P,1,True


In [20]:
# Synthetic case: one dealer sell, three identical inter-dealer trades, one dealer purchase.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 5,
    'price': [111.09] + [110.09] * 3 + [109.09],
    'par_traded': [100] * 5,
    'trade_type': ['S'] + ['D'] * 3 + ['P'],
    'cusip': [1] * 5,
})
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip
0,2022-07-14,111.09,100,S,1
1,2022-07-14,110.09,100,D,1
2,2022-07-14,110.09,100,D,1
3,2022-07-14,110.09,100,D,1
4,2022-07-14,109.09,100,P,1


In [21]:
# Expected: the sell and the purchase are flagged; none of the inter-dealer trades are.
df = add_same_day_flag(df, use_parallel_apply=False)
same_day_flags = df[IS_SAME_DAY].values
assert np.array_equal(same_day_flags, [True, False, False, False, True])
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip,is_same_day
0,2022-07-14,111.09,100.0,S,1,True
1,2022-07-14,110.09,100.0,D,1,False
2,2022-07-14,110.09,100.0,D,1,False
3,2022-07-14,110.09,100.0,D,1,False
4,2022-07-14,109.09,100.0,P,1,True


In [22]:
# motivated by CUSIP: 232287EZ1 on 07/20/2022
# Three dealer sells (1560 + 400 + 230 = 2190), one dealer purchase of 2190 and one
# inter-dealer trade of 2190.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 5,
    'price': [111.09, 111.09, 111.09, 110.09, 111.09],
    'par_traded': [1560, 400, 230, 2190, 2190],
    'trade_type': ['S', 'S', 'S', 'P', 'D'],
    'cusip': [1] * 5,
})
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip
0,2022-07-14,111.09,1560,S,1
1,2022-07-14,111.09,400,S,1
2,2022-07-14,111.09,230,S,1
3,2022-07-14,110.09,2190,P,1
4,2022-07-14,111.09,2190,D,1


In [23]:
# Expected: the three sells and the purchase are flagged; the inter-dealer trade is not.
df = add_same_day_flag(df, use_parallel_apply=False)
same_day_flags = df[IS_SAME_DAY].values
assert np.array_equal(same_day_flags, [True, True, True, True, False])
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip,is_same_day
0,2022-07-14,111.09,1560.0,S,1,True
1,2022-07-14,111.09,400.0,S,1,True
2,2022-07-14,111.09,230.0,S,1,True
3,2022-07-14,110.09,2190.0,P,1,True
4,2022-07-14,111.09,2190.0,D,1,False


In [24]:
# motivated by CUSIP: 232287EZ1 on 07/20/2022
# Same three sells as the previous example, now with an additional small purchase (20) and
# inter-dealer trade (20) ahead of the 2190 purchase and inter-dealer trade.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 7,
    'price': [111.09, 111.09, 111.09, 109.848, 109.848, 110.09, 111.09],
    'par_traded': [1560, 400, 230, 20, 20, 2190, 2190],
    'trade_type': ['S', 'S', 'S', 'P', 'D', 'P', 'D'],
    'cusip': [1] * 7,
})
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip
0,2022-07-14,111.09,1560,S,1
1,2022-07-14,111.09,400,S,1
2,2022-07-14,111.09,230,S,1
3,2022-07-14,109.848,20,P,1
4,2022-07-14,109.848,20,D,1
5,2022-07-14,110.09,2190,P,1
6,2022-07-14,111.09,2190,D,1


In [25]:
# Expected (and recorded in the output below): no trade in this group is flagged.
df = add_same_day_flag(df, use_parallel_apply=False)
same_day_flags = df[IS_SAME_DAY].values
assert np.array_equal(same_day_flags, [False] * 7), f'Received: {same_day_flags}'
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip,is_same_day
0,2022-07-14,111.09,1560.0,S,1,False
1,2022-07-14,111.09,400.0,S,1,False
2,2022-07-14,111.09,230.0,S,1,False
3,2022-07-14,109.848,20.0,P,1,False
4,2022-07-14,109.848,20.0,D,1,False
5,2022-07-14,110.09,2190.0,P,1,False
6,2022-07-14,111.09,2190.0,D,1,False


In [26]:
# motivated by CUSIP: 59333PL30 on 06/10/2022
# Ten trades on one day made up of dealer sells and inter-dealer trades only.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 6, 10)] * 10,
    'price': [104.364, 104.114, 104.364, 104.364, 104.114, 104.114, 104.004, 104.064, 104.004, 104.064],
    'par_traded': [5, 5, 5, 2.5, 2.5, 5, 2.5, 2.5, 10, 10],
    'trade_type': ['S', 'D', 'S', 'S'] + ['D'] * 6,
    'cusip': [1] * 10,
})
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip
0,2022-06-10,104.364,5.0,S,1
1,2022-06-10,104.114,5.0,D,1
2,2022-06-10,104.364,5.0,S,1
3,2022-06-10,104.364,2.5,S,1
4,2022-06-10,104.114,2.5,D,1
5,2022-06-10,104.114,5.0,D,1
6,2022-06-10,104.004,2.5,D,1
7,2022-06-10,104.064,2.5,D,1
8,2022-06-10,104.004,10.0,D,1
9,2022-06-10,104.064,10.0,D,1


In [27]:
# No dealer purchases in this group, so nothing is expected to be flagged.
df = add_same_day_flag(df, use_parallel_apply=False)
same_day_flags = df[IS_SAME_DAY].values
assert np.array_equal(same_day_flags, [False] * 10), f'Received: {same_day_flags}'
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip,is_same_day
0,2022-06-10,104.364,5.0,S,1,False
1,2022-06-10,104.114,5.0,D,1,False
2,2022-06-10,104.364,5.0,S,1,False
3,2022-06-10,104.364,2.5,S,1,False
4,2022-06-10,104.114,2.5,D,1,False
5,2022-06-10,104.114,5.0,D,1,False
6,2022-06-10,104.004,2.5,D,1,False
7,2022-06-10,104.064,2.5,D,1,False
8,2022-06-10,104.004,10.0,D,1,False
9,2022-06-10,104.064,10.0,D,1,False


In [28]:
# Synthetic case: the sell / inter-dealer / purchase triplet, repeated twice on the same day.
df = pd.DataFrame({
    'trade_date': [datetime(2022, 7, 14)] * 6,
    'price': [111.09, 110.09, 109.09] * 2,
    'par_traded': [100] * 6,
    'trade_type': ['S', 'D', 'P'] * 2,
    'cusip': [1] * 6,
})
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip
0,2022-07-14,111.09,100,S,1
1,2022-07-14,110.09,100,D,1
2,2022-07-14,109.09,100,P,1
3,2022-07-14,111.09,100,S,1
4,2022-07-14,110.09,100,D,1
5,2022-07-14,109.09,100,P,1


In [29]:
# Expected: every sell and purchase is flagged; neither inter-dealer trade is.
df = add_same_day_flag(df, use_parallel_apply=False)
same_day_flags = df[IS_SAME_DAY].values
assert np.array_equal(same_day_flags, [True, False, True, True, False, True]), f'Received: {same_day_flags}'
df

Unnamed: 0,trade_date,price,par_traded,trade_type,cusip,is_same_day
0,2022-07-14,111.09,100.0,S,1,True
1,2022-07-14,110.09,100.0,D,1,False
2,2022-07-14,109.09,100.0,P,1,True
3,2022-07-14,111.09,100.0,S,1,True
4,2022-07-14,110.09,100.0,D,1,False
5,2022-07-14,109.09,100.0,P,1,True


# Saving the file with all of the flags

In [None]:
# Persist the fully flagged data only when no cached pickle exists yet; an existing cache
# file is left untouched.
flagged_pickle_exists = os.path.isfile(processed_file_exclusions_all_flags_pickle)
if not flagged_pickle_exists:
    trade_data.to_pickle(processed_file_exclusions_all_flags_pickle)

# Explorations

## Same day flag
We know that dealers tend to sell in groups (e.g., CUSIP 232287EZ1, 07/18/2022). Do dealers also buy in groups? More specifically, are there many groups with the same CUSIP and the same trade date, where there are more dealer purchases than dealer sells?

In [None]:
%%time
# Bucket the unflagged trades by calendar day and CUSIP.
groups = trade_data_original.groupby([pd.Grouper(key='trade_datetime', freq='1D'), 'cusip'])    # moving the grouper to the beginning to sort based on that

# Keep the (day, cusip) groups in which dealer purchases ('P') outnumber dealer sells ('S').
# Boolean sums count the matching rows without materialising two filtered frames per group.
groups_more_purchases_than_sells = {
    group_key: group_df
    for group_key, group_df in groups
    if (group_df['trade_type'] == 'S').sum() < (group_df['trade_type'] == 'P').sum()
}

n_groups = len(groups)
n_more_purchases = len(groups_more_purchases_than_sells)
# The labels describe what is actually counted (more purchases than sells); they previously
# read "all three trade types", which this cell does not compute.
print(f'Number of groups: {n_groups}')
print(f'Number of groups with more dealer purchases than dealer sells: {n_more_purchases}')
print(f'Percentage of groups with more dealer purchases than dealer sells: {n_more_purchases / n_groups * 100} %')

## NTBC precursor flag
How many non-transaction-based-compensation customer trades have none or multiple corresponding inter-dealer trades?

In [None]:
# How many trades are NTBC, and how they split across trade types.
ntbc_flags = trade_data_original["is_non_transaction_based_compensation"]
n_ntbc = ntbc_flags.sum()
print(f'Number of trades marked with the `is_non_transaction_based_compensation` flag: {n_ntbc}')
print(f'Percentage of trades marked with the `is_non_transaction_based_compensation` flag: {n_ntbc / len(trade_data_original) * 100} %')
number_of_ntbc_trades_per_trade_type = trade_data_original.loc[ntbc_flags == True, "trade_type"].value_counts()
print(number_of_ntbc_trades_per_trade_type)
number_of_ntbc_trades_per_trade_type.plot.bar()

In [None]:
# Collects, per NTBC customer trade, the candidate inter-dealer trades; the following cells
# index the returned dict by candidate-group size.
# NOTE(review): every other call in this notebook is `add_ntbc_precursor_flag(df, use_parallel_apply=False)`
# and returns a single DataFrame, whereas this call passes two extra positional arguments and
# unpacks a 2-tuple — this may target an older signature; confirm it still runs.
_, ntbc_multiple_candidates_dict = add_ntbc_precursor_flag(trade_data_original, NTBC_PRECURSOR, True)

In [None]:
# One line per key of the dict, in ascending key order: "<key>: <number of entries under it>".
for size, candidates in sorted(ntbc_multiple_candidates_dict.items(), key=lambda item: item[0]):
    print(f'{size}: {len(candidates)}')

In [None]:
# Display the entries stored under key 0.
# NOTE(review): presumably these are the NTBC customer trades with no matching inter-dealer
# trade; raises KeyError if the dict has no key 0 — confirm.
ntbc_multiple_candidates_dict[0]

In [None]:
# Look up the example quoted in the NTBC section (cusip 549696RS3 on 2022-04-01).
# Both conditions are built from `trade_data_original`, the frame being indexed, so the
# boolean mask always aligns with it. The comparison is date-to-date: `.dt.date` yields
# `datetime.date` objects, which recent pandas versions never consider equal to a Timestamp.
target_date = pd.to_datetime('2022-04-01').date()
is_target_trade = (
    (trade_data_original['cusip'] == '549696RS3')
    & (trade_data_original['trade_datetime'].dt.date == target_date)
)
trade_data_original.loc[is_target_trade, ['cusip', 'trade_datetime']]