# Compare Processed Trades

This notebook checks whether trades priced with archived models are similar to the predictions in the historical predictions table. The intent is only to check that the pricing is similar, not to reproduce precisely the same results. Minor discrepancies are expected: the historical predictions table is supposed to hold the results of the archived models, but it is sometimes not up-to-date, because an archived model may need to be re-trained after an issue with automated training. In that case, the model may be re-trained with slightly different training data and so could be slightly different.

In [None]:
import re
import os

import matplotlib.pyplot as plt
import pandas as pd

from google.cloud import bigquery

In [None]:
# Project id; the same id appears in the fully-qualified table name used in the query cell below.
project = 'eng-reactor-287421'

# The credentials path must be in the environment before the client is constructed
# (presumably the client reads GOOGLE_APPLICATION_CREDENTIALS on creation — confirm against the library docs).
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../creds.json'
bq_client = bigquery.Client()

In [None]:
# Location of the first CSV to load
file_path = '/Users/user/desktop/BMO3.csv'

# Load the CSV, then parse `trade_datetime` as datetimes and `ytw` as numbers;
# with errors='coerce', `ytw` values that cannot be parsed become NaN instead of raising
df = pd.read_csv(file_path).assign(
    trade_datetime=lambda frame: pd.to_datetime(frame['trade_datetime']),
    ytw=lambda frame: pd.to_numeric(frame['ytw'], errors='coerce'),
)

# Quick sanity check of the load: first rows and total row count
print(df.head())
print(f'Length of dataframe at {file_path}: {len(df)}')

Here is the query for getting trades for a certain day: 


In [None]:
# The two dates the query filters on, lifted out of the SQL text so they can be changed in one place.
query_trade_date = '2024-02-20'       # trades are selected where a.trade_date equals this date
query_min_calc_date = '2025-01-01'    # only rows with b.calc_date strictly after this date are kept

# SQL selecting the MSRB reported yield, the ficc yield prediction (new_ficc_ycl + new_ys_prediction) and the
# absolute difference between the two for every trade on `query_trade_date`.
# NOTE(review): the WHERE clause filters on `b.calc_date`, which is NULL for rows without a match in `b`, so those
# rows are discarded and the LEFT JOIN effectively behaves like an inner join — confirm this is intended.
query = f'''SELECT * FROM ( SELECT
      a.cusip,
      a.trade_date,
      a.rtrs_control_number,
      a.dollar_price,
      a.yield AS msrb_reported_yield_in_bps,
      a.new_ficc_ycl + new_ys_prediction AS ficc_yield_prediction_in_bps,
      ABS((new_ficc_ycl + new_ys_prediction)-a.yield) AS prediction_error_in_bps
    FROM
      `eng-reactor-287421.historic_predictions.historical_predictions`a
    LEFT JOIN
      `auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized` b
    ON
      a.rtrs_control_number = b.rtrs_control_number
    WHERE
      a.trade_date = "{query_trade_date}"
      AND
      b.calc_date > "{query_min_calc_date}") '''

In [None]:
# Location of the second CSV to load
file_path = '/Users/user/desktop/bmo_cusips_3_21_2.csv'

# Load the CSV and parse `trade_datetime` as datetimes
df2 = pd.read_csv(file_path).assign(
    trade_datetime=lambda frame: pd.to_datetime(frame['trade_datetime']),
)

# Quick sanity check of the load: first rows and total row count
print(df2.head())
print(f'Length of dataframe at {file_path}: {len(df2)}')

Join the two dataframes on the RTRS control number. A left join is performed because `df2` most likely has fewer trades than `df`: we choose a subset of all of the trades to price when sending the data to customers.

In [None]:
# `df2` is expected to be no larger than `df`; fail fast if that assumption does not hold
assert len(df) >= len(df2)

# Left join keeps every row of `df2` and attaches the matching `df` columns by RTRS control number
merged_df = df2.merge(df, how='left', on='rtrs_control_number')
print(f'Length of joined dataframe: {len(merged_df)}')

Remove all error trades for analysis.

In [None]:
# Keep only rows whose `ytw` is strictly positive; rows with a missing `ytw` are dropped as well, since NaN > 0 is False.
# `.copy()` makes the filtered result an independent frame, so the column assignments made on `merged_df` in later
# cells cannot trigger pandas' SettingWithCopyWarning.
merged_df = merged_df[merged_df['ytw'] > 0].copy()

Create columns for analysis.

In [None]:
# New prediction scaled by 100 (presumably `ytw` is in percent, making this basis points — TODO confirm)
merged_df['new_prediction_in_bps'] = merged_df['ytw'].mul(100)

# Absolute gap between the MSRB reported yield and the new prediction
yield_gap_in_bps = merged_df['msrb_reported_yield_in_bps'].sub(merged_df['new_prediction_in_bps'])
merged_df['new_prediction_error_in_bps'] = yield_gap_in_bps.abs()

In [None]:
# Mean of each error column: the newly computed error first, then the `prediction_error_in_bps` column from the loaded data
error_means = {
    column: merged_df[column].mean()
    for column in ('new_prediction_error_in_bps', 'prediction_error_in_bps')
}
new_prediction_error_in_bps_mean = error_means['new_prediction_error_in_bps']
prediction_error_in_bps_mean = error_means['prediction_error_in_bps']

print(f'Mean of new_prediction_error_in_bps: {new_prediction_error_in_bps_mean}')
print(f'Mean of prediction_error_in_bps: {prediction_error_in_bps_mean}')

Change column names and values to send to customers.

In [None]:
# Build the customer-facing frame on a derived frame instead of writing the renamed columns into `merged_df`.
# Assigning `merged_df['ficc_yield_prediction_in_bps'] = ytw * 100` in place would overwrite that column with the
# new prediction, and the later analysis cell that displays 'ficc_yield_prediction_in_bps' next to
# 'new_prediction_in_bps' would then show two identical columns.
# NOTE(review): this assumes `merged_df` already carries 'ficc_yield_prediction_in_bps' from the loaded CSVs — confirm.
customer_columns = [
    'cusip',
    'trade_datetime',
    'rtrs_control_number',
    'par_traded',
    'trade_type',
    'msrb_reported_yield_in_bps',
    'ficc_yield_prediction_in_bps',
]
result_df = (
    merged_df
    .assign(
        # customers receive the new prediction under the `ficc_yield_prediction_in_bps` name
        ficc_yield_prediction_in_bps=lambda frame: frame['ytw'] * 100,
        # the '_x' suffix comes from joining the dataframes in the `pd.merge(...)`
        cusip=lambda frame: frame['cusip_x'],
        par_traded=lambda frame: frame['quantity_x'],
        trade_datetime=lambda frame: frame['trade_datetime_x'],
    )
    .loc[:, customer_columns]
    .sort_values(by=['trade_datetime'], ascending=True)
)

Create the CSV.

In [None]:
# Write the customer-facing frame to disk; index=False keeps the dataframe index out of the file
output_csv_path = '2024-02-20_ficcai_predictions_for_msrb_trades.csv'
result_df.to_csv(output_csv_path, index=False)

## Additional analysis

### Issues further down the dataframe
See if there is a change in error as the index increases in `merged_df`. Such a change would indicate an indexing issue with the data that gets worse further down the dataframe.

In [None]:
n = 25  # Number of rows in each chunk

# Per-row absolute errors, in the current row order of `merged_df`
errors_in_row_order = merged_df['new_prediction_error_in_bps']

# Ceiling division: a trailing partial chunk still counts as a chunk
num_chunks = -(-len(merged_df) // n)

# Mean absolute error (MAE) of each consecutive block of `n` rows
mae_per_chunk = [
    errors_in_row_order[chunk_index * n:(chunk_index + 1) * n].mean()
    for chunk_index in range(num_chunks)
]

In [None]:
# Plot the MAE of each chunk against the chunk's position, with a title and axis labels so the figure can be read
# on its own; the trailing `;` suppresses the text repr of the return value in the cell output.
fig, ax = plt.subplots()
ax.plot(mae_per_chunk, 'o')
ax.set(
    title='MAE of new predictions per chunk of consecutive rows',
    xlabel='Chunk index',
    ylabel='Mean new_prediction_error_in_bps',
);

In [None]:
# Rows with a strictly positive new prediction, ordered from largest to smallest new prediction error
merged_df_new_predictions_greater_than_0 = (
    merged_df
    .loc[merged_df["new_prediction_in_bps"] > 0]
    .sort_values(by='new_prediction_error_in_bps', ascending=False)
)

# Show the last 50 rows of that ordering, i.e. the rows with the smallest new prediction error
comparison_columns = [
    'prediction_error_in_bps',
    "new_prediction_error_in_bps",
    'ficc_yield_prediction_in_bps',
    "new_prediction_in_bps",
    'msrb_reported_yield_in_bps',
    'cusip_x',
    'trade_datetime_x',
]
merged_df_new_predictions_greater_than_0[comparison_columns].tail(50)

### RTRS control numbers with trades in the future
Some RTRS control numbers have trades in the history with a negative `num_seconds_ago` feature which implies that the trade is in the future. Investigate these.

Investigate `2024022007866600` specifically since this is causing issues.

In [None]:
# Old and new prediction errors for the single RTRS control number under investigation
rtrs_control_number_of_interest = 2024022007866600
merged_df_new_predictions_greater_than_0.loc[
    merged_df_new_predictions_greater_than_0['rtrs_control_number'] == rtrs_control_number_of_interest,
    ['prediction_error_in_bps', 'new_prediction_error_in_bps'],
]

In [None]:
# Text file containing the warning messages to scan
file_path = '/Users/user/downloads/warnings.txt'

# An RTRS control number is the run of digits following the literal text 'RTRS control number ';
# a CUSIP is the 9 word characters following the literal text 'CUSIP '.
# Adjust the patterns as necessary based on the actual format.
rtrs_pattern = r'RTRS control number (\d+)'
cusip_pattern = r'CUSIP (\w{9})'

# Read the file once; both extractions below work on the same lines
with open(file_path, 'r') as file:
    warning_lines = file.readlines()

# At most one control number is taken per line (the first match), converted to int
rtrs_hits = (re.search(rtrs_pattern, line) for line in warning_lines)
rtrs_control_numbers = [int(hit.group(1)) for hit in rtrs_hits if hit]
print(rtrs_control_numbers)

# Every CUSIP on every line is collected; dict.fromkeys drops repeats while keeping first-seen order
cusips = list(dict.fromkeys(
    cusip
    for line in warning_lines
    for cusip in re.findall(cusip_pattern, line)
))
print(cusips)

In [None]:
# Turn the control numbers into a comma-separated string for the SQL `in(...)` clause built in a later cell.
# The name is rebound from a list to a string, so the cell is guarded against being re-run: joining an
# already-joined string would iterate over its characters and corrupt the value.
if not isinstance(rtrs_control_numbers, str):
    rtrs_control_numbers = ', '.join(str(number) for number in rtrs_control_numbers)

In [None]:
# By the time this cell runs, `rtrs_control_numbers` may already have been joined into a comma-separated string
# (for the SQL query). Iterating over that string would yield single characters that never equal a control number,
# so the loop would silently print nothing. Recover the integer control numbers first.
if isinstance(rtrs_control_numbers, str):
    control_numbers_to_check = [int(token) for token in rtrs_control_numbers.split(',') if token.strip()]
else:
    control_numbers_to_check = [int(number) for number in rtrs_control_numbers]

error_columns = ['prediction_error_in_bps', 'new_prediction_error_in_bps']

for control_number in control_numbers_to_check:
    # Filtering to get the rows for the specific RTRS control number
    is_control_number = merged_df_new_predictions_greater_than_0['rtrs_control_number'] == control_number
    filtered_rows = merged_df_new_predictions_greater_than_0.loc[is_control_number, error_columns]

    # Checking where the new prediction error exceeds the old one by at least 15 bps
    error_increase = filtered_rows['new_prediction_error_in_bps'] - filtered_rows['prediction_error_in_bps']
    worse_predictions = filtered_rows[error_increase >= 15]

    # Print the result if there are any worse predictions
    if not worse_predictions.empty:
        print(f"Worse predictions for RTRS control number {control_number}:\n{worse_predictions}\n")

In [None]:
def sqltodf(sql, limit=''):
    """Run `sql` through the notebook-level `bq_client` and return the result as a dataframe.

    Args:
        sql: Query text to execute.
        limit: When this is anything other than the empty string, ` ORDER BY RAND() LIMIT {limit}`
            is appended to `sql` before it is executed.

    Returns:
        Whatever `.to_dataframe()` returns for the finished query result.
    """
    suffix = '' if limit == '' else f" ORDER BY RAND() LIMIT {limit}"
    query_job = bq_client.query(sql + suffix)
    return query_job.result().to_dataframe()

In [None]:
# Distinct CUSIPs in MSRB.msrb_trade_messages for the control numbers collected earlier;
# `rtrs_control_numbers` is interpolated directly into the `in(...)` list
cusip_lookup_sql = f'''SELECT distinct(cusip) FROM MSRB.msrb_trade_messages where rtrs_control_number in({rtrs_control_numbers})'''
cusips_from_msrb_trade_messages = sqltodf(cusip_lookup_sql)

In [None]:
# Replace the query result with a plain Python list of its `cusip` column
cusips_from_msrb_trade_messages = cusips_from_msrb_trade_messages['cusip'].tolist()
print(f'Number of CUSIPs in `cusips_from_msrb_trade_messages`: {len(cusips_from_msrb_trade_messages)}')

In [None]:
# CUSIPs present both in the warnings file and in the MSRB trade messages query result
common_cusips = set(cusips) & set(cusips_from_msrb_trade_messages)
print('Common CUSIPs in both lists:', common_cusips)

In [None]:
# Columns that together identify a trade for the duplicate check. One list drives the verdict, the printed
# message and the listing of duplicate rows, so the three can never disagree.
# NOTE(review): the verdict previously used these four columns while the messages and the row listing used only
# 'cusip', 'trade_datetime' and 'quantity' — confirm that 'trade_type' belongs in the key.
duplicate_key_columns = ['cusip', 'trade_datetime', 'quantity', 'trade_type']
duplicate_key_description = ', '.join(repr(column) for column in duplicate_key_columns)

# keep=False marks every member of a duplicated group, not just the repeats after the first
is_duplicate = df.duplicated(subset=duplicate_key_columns, keep=False)
duplicates_exist = is_duplicate.any()

if duplicates_exist:
    print(f"There are duplicate combinations of {duplicate_key_description}.")
else:
    print(f"All combinations of {duplicate_key_description} are unique.")

# Rows that share their key with at least one other row (empty when the verdict above is "unique")
duplicate_rows = df[is_duplicate]
print(duplicate_rows)