### Create MAE Numbers 


This notebook creates the MAE numbers used to update the white paper.  A begin_date should be specified along with an optional end_date to determine the period covered by the MAE numbers.  We find that excluding trades six months to maturity and under $100k gives us the most representative accuracy numbers.  I tried to get the notebook to populate the numbers automatically in google sheets automatically, but I gave up and just copy pasted.

In [52]:
import os
import pandas as pd
import gspread

from datetime import datetime
from google.cloud import bigquery
from google.oauth2 import service_account
from oauth2client.service_account import ServiceAccountCredentials

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../creds.json"
bqclient = bigquery.Client()
project = "eng-reactor-287421"

In [53]:
# Define the scope
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']

creds = service_account.Credentials.from_service_account_file(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])

# Authorize the client
client = gspread.authorize(creds)

In [12]:
def sqltodf(sql):
    bqr = bqclient.query(sql).result()
    return bqr.to_dataframe()

In [17]:
BEGIN_DATE = pd.to_datetime('2024-01-01').date()

In [18]:
END_DATE = datetime.now().date()

In [None]:
total_df = sqltodf(f'''
SELECT
  'total' AS category,
  AVG(ABS(new_ys_prediction-new_ys)) AS mae,
  COUNT(*) AS count
FROM
  `eng-reactor-287421.historic_predictions.historical_predictions` A
LEFT JOIN
  `auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized` b
ON
  A.rtrs_control_number = b.rtrs_control_number
WHERE
  DATE_DIFF(calc_date, a.trade_date, day) > 180
  AND A.trade_date BETWEEN '{BEGIN_DATE}' AND '{END_DATE}'
  AND par_traded > 100000
''')

total_df.head()

Unnamed: 0,category,mae,count
0,total,8.490711,241005


In [None]:
ratings_df = sqltodf(f'''WITH investment_grade AS (
  SELECT
    A.new_ys_prediction,
    A.new_ys,
    b.sp_long_integer,
    b.moodys_long_integer
  FROM
    `eng-reactor-287421.historic_predictions.historical_predictions` A
  LEFT JOIN
    `auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized` b
  ON
    A.rtrs_control_number = b.rtrs_control_number
  WHERE
    DATE_DIFF(b.calc_date, A.trade_date, DAY) > 180
    AND A.trade_date BETWEEN '{BEGIN_DATE}' AND '{END_DATE}'
    AND b.par_traded > 100000
    AND (b.sp_long_integer < 8 OR b.moodys_long_integer < 8)
),

AAA AS (
  SELECT
    A.new_ys_prediction,
    A.new_ys,
    b.sp_long_integer,
    b.moodys_long_integer
  FROM
    `eng-reactor-287421.historic_predictions.historical_predictions` A
  LEFT JOIN
    `auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized` b
  ON
    A.rtrs_control_number = b.rtrs_control_number
  WHERE
    DATE_DIFF(b.calc_date, A.trade_date, DAY) > 180
    AND A.trade_date BETWEEN '{BEGIN_DATE}' AND '{END_DATE}'
    AND b.par_traded > 100000
    AND (b.sp_long_integer = 1 OR b.moodys_long_integer = 1)
),

unrated AS (
  SELECT
    A.new_ys_prediction,
    A.new_ys,
    b.sp_long_integer,
    b.moodys_long_integer
  FROM
    `eng-reactor-287421.historic_predictions.historical_predictions` A
  LEFT JOIN
    `auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized` b
  ON
    A.rtrs_control_number = b.rtrs_control_number
  WHERE
    DATE_DIFF(b.calc_date, A.trade_date, DAY) > 180
    AND A.trade_date BETWEEN '{BEGIN_DATE}' AND '{END_DATE}'
    AND b.par_traded > 100000
    AND (b.sp_long_integer = 100 OR b.moodys_long_integer = 100)
)

SELECT
  'investment_grade' AS category,
  FORMAT('%0.2f', AVG(ABS(new_ys_prediction - new_ys))) AS mae,
  COUNT(*) AS count
FROM
  investment_grade

UNION ALL

SELECT
  'AAA' AS category,
  FORMAT('%0.2f', AVG(ABS(new_ys_prediction - new_ys))) AS mae,
  COUNT(*) AS count
FROM
  AAA

UNION ALL

SELECT
  'unrated' AS category,
  FORMAT('%0.2f', AVG(ABS(new_ys_prediction - new_ys))) AS mae,
  COUNT(*) AS count
FROM
  unrated;
''')

In [None]:
liquidity_buckets_df = sqltodf(f'''WITH trade_date_diff AS (
  SELECT
    A.new_ys_prediction,
    A.new_ys,
    ABS(A.new_ys_prediction - A.new_ys) AS absolute_error,
    DATE_DIFF(a.trade_date, DATE(b.recent[safe_offset(0)].trade_datetime), DAY) AS trade_date_difference
  FROM
    `eng-reactor-287421.historic_predictions.historical_predictions` A
  LEFT JOIN
    `auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized` b
  ON
    A.rtrs_control_number = b.rtrs_control_number
  WHERE
    DATE_DIFF(b.calc_date, A.trade_date, DAY) > 180
    AND A.trade_date BETWEEN '{BEGIN_DATE}' AND '{END_DATE}'
    AND b.par_traded > 100000
)

SELECT
  'less_than_or_equal_to_1' AS category,
  AVG(ABS(new_ys_prediction - new_ys)) AS mae,
  COUNT(*) AS count
FROM
  trade_date_diff
WHERE
  trade_date_difference <= 1

UNION ALL

SELECT
  'between_1_and_5' AS category,
  AVG(ABS(new_ys_prediction - new_ys)) AS mae,
  COUNT(*) AS count
FROM
  trade_date_diff
WHERE
  trade_date_difference > 1 AND trade_date_difference <= 5

UNION ALL

SELECT
  'between_5_and_10' AS category,
  AVG(ABS(new_ys_prediction - new_ys)) AS mae,
  COUNT(*) AS count
FROM
  trade_date_diff
WHERE
  trade_date_difference > 5 AND trade_date_difference <= 10

UNION ALL

SELECT
  'greater_than_30' AS category,
  AVG(ABS(new_ys_prediction - new_ys)) AS mae,
  COUNT(*) AS count
FROM
  trade_date_diff
WHERE
  trade_date_difference > 30;
''')

In [None]:
side_df = sqltodf(f'''WITH mae_counts AS (
  SELECT
    CASE WHEN b.trade_type = "S" THEN ABS(new_ys_prediction - new_ys) END AS Offered_Side_abs_error,
    CASE WHEN b.trade_type = "P" THEN ABS(new_ys_prediction - new_ys) END AS bid_side_abs_error,
    CASE WHEN b.trade_type = "S" THEN 1 END AS Offered_Side_count,
    CASE WHEN b.trade_type = "P" THEN 1 END AS bid_side_count
  FROM
    `eng-reactor-287421.historic_predictions.historical_predictions` A
  LEFT JOIN
    `auxiliary_views_v2.trade_history_same_issue_5_yr_mat_bucket_1_materialized` b
  ON
    A.rtrs_control_number = b.rtrs_control_number
  WHERE
    DATE_DIFF(b.calc_date, a.trade_date, DAY) > 180
    AND a.trade_date BETWEEN '{BEGIN_DATE}' AND '{END_DATE}'
    AND b.par_traded > 100000
)

SELECT
  'Offered_Side' AS category,
  AVG(Offered_Side_abs_error) AS mae,
  SUM(Offered_Side_count) AS count
FROM
  mae_counts
WHERE
  Offered_Side_abs_error IS NOT NULL

UNION ALL

SELECT
  'bid_side' AS category,
  AVG(bid_side_abs_error) AS mae,
  SUM(bid_side_count) AS count
FROM
  mae_counts
WHERE
  bid_side_abs_error IS NOT NULL;
''')

In [39]:
ratings_df = ratings_df.sort_values(by='mae', ascending=True)
liquidity_buckets_df = liquidity_buckets_df.sort_values(by='mae', ascending=True)
side_df = side_df.sort_values(by='mae', ascending=True)

In [80]:
concatenated_df = pd.concat([total_df,ratings_df, liquidity_buckets_df,side_df ], ignore_index=True)

# Print the concatenated DataFrame
print(concatenated_df)

                  category        mae   count
0                    total   8.490711  241005
1                      AAA       7.17   53701
2         investment_grade       7.84  227463
3                  unrated       9.26  114161
4  less_than_or_equal_to_1   5.472683  128599
5          between_1_and_5   7.842905   27250
6         between_5_and_10   8.867875   16251
7          greater_than_30  16.768217   22847
8             Offered_Side   8.159266  101842
9                 bid_side  10.088977   57658


In [81]:
# Convert the 'mae' column to numeric type
concatenated_df['mae'] = pd.to_numeric(concatenated_df['mae'], errors='coerce')

# Round the values in the 'mae' column to two decimal points
concatenated_df['mae'] = concatenated_df['mae'].round(2)



In [82]:
# Define the mapping dictionary
mapping = {
    'total' : "Total Trades",
    'less_than_or_equal_to_1': 'Traded in last 1 day',
    'between_1_and_5': 'Traded more than one, but less than 5 days ago',
    'between_5_and_10': 'Traded more than 5, but less than 10 days ago',
    'greater_than_10': 'Traded more than 10, but less than 30 days ago',
    'greater_than_30': 'Traded more than 30 days ago',
    'AAA': 'AAA',
    'investment_grade': 'Investment Grade',
    'unrated': 'Unrated',
    'Offered_Side': 'Offered Side',  # Updated mapping for Offered Side
    'bid_side': 'Bid Side'  # Updated mapping for Bid Side
}

# Replace the category names using the mapping dictionary
concatenated_df['category'] = concatenated_df['category'].replace(mapping)


In [83]:
concatenated_df

Unnamed: 0,category,mae,count
0,Total Trades,8.49,241005
1,AAA,7.17,53701
2,Investment Grade,7.84,227463
3,Unrated,9.26,114161
4,Traded in last 1 day,5.47,128599
5,"Traded more than one, but less than 5 days ago",7.84,27250
6,"Traded more than 5, but less than 10 days ago",8.87,16251
7,Traded more than 30 days ago,16.77,22847
8,Offered Side,8.16,101842
9,Bid Side,10.09,57658


In [84]:
# Define the mapping for subcategories
category_mapping = {
    'Traded in last 1 day': 'Liquidity',
    'Traded more than one, but less than 5': 'Liquidity',
    'Traded more than 5, but less than 10': 'Liquidity',
    'Traded more than 10, but less than 30': 'Liquidity',
    'Traded more than 30 days ago': 'Liquidity',
    'AAA': 'Market',
    'Investment Grade': 'Market',
    'Unrated': 'Market',
    'Offered Side': 'Trade Type',
    'Bid Side': 'Trade Type'
}

# Create a new 'subcategory' column based on the mapping
concatenated_df['supercategory'] = concatenated_df['category'].map(category_mapping)
concatenated_df

Unnamed: 0,category,mae,count,supercategory
0,Total Trades,8.49,241005,
1,AAA,7.17,53701,Market
2,Investment Grade,7.84,227463,Market
3,Unrated,9.26,114161,Market
4,Traded in last 1 day,5.47,128599,Liquidity
5,"Traded more than one, but less than 5 days ago",7.84,27250,
6,"Traded more than 5, but less than 10 days ago",8.87,16251,
7,Traded more than 30 days ago,16.77,22847,Liquidity
8,Offered Side,8.16,101842,Trade Type
9,Bid Side,10.09,57658,Trade Type
