Written by Gil, 7/30/2024

## Summary

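This notebook checks for date discrepancies in the S&P reference data for municipal bonds (`ref_data_v2` in the queries below) by comparing it, CUSIP by CUSIP, against the existing v1 reference data (`ref_data_v1`). The primary focus is to evaluate discrepancies in the following date fields: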

- **`maturity_date`**: The maturity date of the bond.
- **`accrual_date`**: The date from which interest accrues.
- **`first_coupon_date`**: The date of the first coupon payment.
- **`last_period_accrues_from_date`**: The date from which the last period's interest accrues.
- **`next_coupon_payment_date`**: The next coupon payment date.

### Key Objectives

1. **Check for mismatches in dates:**
   - Evaluate discrepancies in key date fields listed above.
   - Focus on metrics that consider both `CUSIPs` and `issue_keys` to determine if discrepancies are isolated or widespread within specific issues.

2. **Check misreads at the Issue level:**
   - Determine if discrepancies are concentrated within the same issue by comparing `issue_keys`.

3. **Analyze significant discrepancies:**
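   - Measure the size of each discrepancy (delta in days between the two sources) and review the distribution of those deltas.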
   
4. **Higher-level pipeline restrictions:**
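   - Implement additional restrictions to filter relevant data for evaluation (in the queries below: `coupon_type IN (8, 4, 10, 17)`, `sale_type` not 4 or 5, and `sec_regulation IS NULL`).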

### Key Steps
- **Review and document findings:** Complete evaluation for date discrepancies and proceed to other relevant fields as needed.

In [3]:
import os
from google.cloud import bigquery
import pandas as pd

# Credentials: keep GOOGLE_APPLICATION_CREDENTIALS if the environment already
# provides it; only fall back to the author's local key file when it is unset.
# NOTE(review): the fallback is an absolute path on one machine — anyone else
# running this notebook should export GOOGLE_APPLICATION_CREDENTIALS instead.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/Users/gil/git/ficc/creds.json')
bq_client = bigquery.Client()

# Project and dataset of the v2 tables that the queries below read from.
project = 'eng-reactor-287421'
dataset = 'jesse_tests'

def sqltodf(sql, limit=''):
    """Run a BigQuery query through `bq_client` and return a pandas DataFrame.

    Args:
        sql: Query text. When `limit` is given, a sampling clause is appended
            to it, so `sql` must not already end with its own ORDER BY / LIMIT
            or a trailing semicolon.
        limit: Optional number of rows to sample. '' (the default) or None
            runs `sql` unchanged; any other value must be a non-negative
            integer (or a string holding one) and is appended as
            ``ORDER BY RAND() LIMIT <limit>``, i.e. a random sample.

    Returns:
        pandas.DataFrame holding the full query result.

    Raises:
        ValueError: if `limit` is given but is not a non-negative integer.
    """
    if limit is None or limit == '':
        query = sql
    else:
        # Validate before interpolating so that only a plain integer ever
        # reaches the SQL text.
        try:
            row_cap = int(limit)
        except (TypeError, ValueError):
            raise ValueError(f'limit must be a non-negative integer, got {limit!r}') from None
        if row_cap < 0:
            raise ValueError(f'limit must be a non-negative integer, got {limit!r}')
        # Append on a new line: if `sql` ends with a `--` comment, a clause
        # appended on the same line would be swallowed by that comment.
        query = f'{sql}\nORDER BY RAND() LIMIT {row_cap}'
    return bq_client.query(query).result().to_dataframe()

In [None]:
# Maturity-date mismatches between ref_data_v2 and ref_data_v1.
# Rows are paired on CUSIP and on the calendar day of ref_valid_from_date.
# `!=` is never true when either side is NULL, so bonds missing the date in
# one of the two sources are not counted here.
# NOTE(review): this cell reads `ref_data_v2_flat` while the other date cells
# read `sp_flat` — confirm which table name is current.
maturity_date_same_date_df = sqltodf(f'''
  SELECT
    DISTINCT ref_data_v2.cusip,
    ref_data_v2.maturity_date AS ref_data_v2_maturity_date,
    ref_data_v1.maturity_date AS ref_data_v1_maturity_date,
    ref_data_v2.interest_payment_frequency AS ref_data_v2_interest_payment_frequency,
    ref_data_v2.coupon_type AS ref_data_v2_coupon_type

  FROM
    `{project}.{dataset}.ref_data_v2_flat` ref_data_v2
  INNER JOIN
    `reference_data_v1.reference_data_flat` ref_data_v1
  ON
    ref_data_v2.cusip = ref_data_v1.cusip
  WHERE
    DATE(ref_data_v2.ref_valid_from_date) = DATE(ref_data_v1.ref_valid_from_date)
    AND ref_data_v2.coupon_type IN (8, 4, 10, 17)
    AND ref_data_v2.sale_type <> 4 -- no PPs etc
    AND ref_data_v2.sale_type <> 5
    AND ref_data_v2.sec_regulation IS NULL -- same
    AND ref_data_v2.maturity_date != ref_data_v1.maturity_date
''')

# Convert date columns to datetime so they can be subtracted
maturity_date_same_date_df['ref_data_v2_maturity_date'] = pd.to_datetime(maturity_date_same_date_df['ref_data_v2_maturity_date'])
maturity_date_same_date_df['ref_data_v1_maturity_date'] = pd.to_datetime(maturity_date_same_date_df['ref_data_v1_maturity_date'])

# Delta in days, v2 minus v1: positive means the v2 date is later
maturity_date_same_date_df['delta_days'] = (maturity_date_same_date_df['ref_data_v2_maturity_date'] - maturity_date_same_date_df['ref_data_v1_maturity_date']).dt.days

# Print the length of the DataFrame
print(f'Number of records with mismatched maturity dates: {len(maturity_date_same_date_df)}')

# Display delta days distribution
maturity_date_same_date_df['delta_days'].value_counts()


In [None]:
# Accrual-date mismatches between ref_data_v2 and ref_data_v1.
# Rows are paired on CUSIP and on the calendar day of ref_valid_from_date.
# `!=` is never true when either side is NULL, so bonds missing the date in
# one of the two sources are not counted here.
accrual_date_same_date_df = sqltodf(f'''
  SELECT
    DISTINCT ref_data_v2.cusip,
    ref_data_v2.accrual_date AS ref_data_v2_accrual_date,
    ref_data_v1.accrual_date AS ref_data_v1_accrual_date,
    ref_data_v2.interest_payment_frequency AS ref_data_v2_interest_payment_frequency,
    ref_data_v2.coupon_type AS ref_data_v2_coupon_type
  FROM
    `{project}.{dataset}.sp_flat` ref_data_v2
  INNER JOIN
    `reference_data_v1.reference_data_flat` ref_data_v1
  ON
    ref_data_v2.cusip = ref_data_v1.cusip
  WHERE
    DATE(ref_data_v2.ref_valid_from_date) = DATE(ref_data_v1.ref_valid_from_date)
    AND ref_data_v2.coupon_type IN (8, 4, 10, 17)
    AND ref_data_v2.sale_type <> 4 -- no PPs etc
    AND ref_data_v2.sale_type <> 5
    AND ref_data_v2.sec_regulation IS NULL -- same
    AND ref_data_v2.accrual_date != ref_data_v1.accrual_date
''')

# Convert date columns to datetime so they can be subtracted
accrual_date_same_date_df['ref_data_v2_accrual_date'] = pd.to_datetime(accrual_date_same_date_df['ref_data_v2_accrual_date'])
accrual_date_same_date_df['ref_data_v1_accrual_date'] = pd.to_datetime(accrual_date_same_date_df['ref_data_v1_accrual_date'])

# Delta in days, v2 minus v1: positive means the v2 date is later
accrual_date_same_date_df['delta_days'] = (accrual_date_same_date_df['ref_data_v2_accrual_date'] - accrual_date_same_date_df['ref_data_v1_accrual_date']).dt.days

# Print the length of the DataFrame
print(f'Number of records with mismatched accrual dates: {len(accrual_date_same_date_df)}')

# Display delta days distribution
accrual_date_same_date_df['delta_days'].value_counts()


In [None]:
# First-coupon-date mismatches between ref_data_v2 and ref_data_v1.
# Rows are paired on CUSIP and on the calendar day of ref_valid_from_date.
# `!=` is never true when either side is NULL, so bonds missing the date in
# one of the two sources are not counted here.
# Column aliases use the ref_data_v2_ / ref_data_v1_ prefixes because the
# pandas code below looks the columns up by those names.
first_coupon_date_same_date_df = sqltodf(f'''
  SELECT
    DISTINCT ref_data_v2.cusip,
    ref_data_v2.first_coupon_date AS ref_data_v2_first_coupon_date,
    ref_data_v1.first_coupon_date AS ref_data_v1_first_coupon_date,
    ref_data_v2.interest_payment_frequency AS ref_data_v2_interest_payment_frequency,
    ref_data_v2.coupon_type AS ref_data_v2_coupon_type
  FROM
    `{project}.{dataset}.sp_flat` ref_data_v2
  INNER JOIN
    `reference_data_v1.reference_data_flat` ref_data_v1
  ON
    ref_data_v2.cusip = ref_data_v1.cusip
  WHERE
    DATE(ref_data_v2.ref_valid_from_date) = DATE(ref_data_v1.ref_valid_from_date)
    AND ref_data_v2.coupon_type IN (8, 4, 10, 17)
    AND ref_data_v2.sale_type <> 4 -- no PPs etc
    AND ref_data_v2.sale_type <> 5
    AND ref_data_v2.sec_regulation IS NULL -- same
    AND ref_data_v2.first_coupon_date != ref_data_v1.first_coupon_date
''')

# Convert date columns to datetime so they can be subtracted
first_coupon_date_same_date_df['ref_data_v2_first_coupon_date'] = pd.to_datetime(first_coupon_date_same_date_df['ref_data_v2_first_coupon_date'])
first_coupon_date_same_date_df['ref_data_v1_first_coupon_date'] = pd.to_datetime(first_coupon_date_same_date_df['ref_data_v1_first_coupon_date'])

# Delta in days, v2 minus v1: positive means the v2 date is later
first_coupon_date_same_date_df['delta_days'] = (first_coupon_date_same_date_df['ref_data_v2_first_coupon_date'] - first_coupon_date_same_date_df['ref_data_v1_first_coupon_date']).dt.days

# Print the length of the DataFrame
print(f'Number of records with mismatched first coupon dates: {len(first_coupon_date_same_date_df)}')

# Display delta days distribution
first_coupon_date_same_date_df['delta_days'].value_counts()


In [None]:
# Last-period-accrues-from-date mismatches between ref_data_v2 and ref_data_v1.
# Rows are paired on CUSIP and on the calendar day of ref_valid_from_date.
# `!=` is never true when either side is NULL, so bonds missing the date in
# one of the two sources are not counted here.
# Column aliases use the ref_data_v2_ / ref_data_v1_ prefixes because the
# pandas code below looks the columns up by those names.
last_period_accrues_from_date_same_date_df = sqltodf(f'''
  SELECT
    DISTINCT ref_data_v2.cusip,
    ref_data_v2.last_period_accrues_from_date AS ref_data_v2_last_period_accrues_from_date,
    ref_data_v1.last_period_accrues_from_date AS ref_data_v1_last_period_accrues_from_date,
    ref_data_v2.interest_payment_frequency AS ref_data_v2_interest_payment_frequency,
    ref_data_v2.coupon_type AS ref_data_v2_coupon_type
  FROM
    `{project}.{dataset}.sp_flat` ref_data_v2
  INNER JOIN
    `reference_data_v1.reference_data_flat` ref_data_v1
  ON
    ref_data_v2.cusip = ref_data_v1.cusip
  WHERE
    DATE(ref_data_v2.ref_valid_from_date) = DATE(ref_data_v1.ref_valid_from_date)
    AND ref_data_v2.coupon_type IN (8, 4, 10, 17)
    AND ref_data_v2.sale_type <> 4 -- no PPs etc
    AND ref_data_v2.sale_type <> 5
    AND ref_data_v2.sec_regulation IS NULL -- same
    AND ref_data_v2.last_period_accrues_from_date != ref_data_v1.last_period_accrues_from_date
''')

# Convert date columns to datetime so they can be subtracted
last_period_accrues_from_date_same_date_df['ref_data_v2_last_period_accrues_from_date'] = pd.to_datetime(last_period_accrues_from_date_same_date_df['ref_data_v2_last_period_accrues_from_date'])
last_period_accrues_from_date_same_date_df['ref_data_v1_last_period_accrues_from_date'] = pd.to_datetime(last_period_accrues_from_date_same_date_df['ref_data_v1_last_period_accrues_from_date'])

# Delta in days, v2 minus v1: positive means the v2 date is later
last_period_accrues_from_date_same_date_df['delta_days'] = (last_period_accrues_from_date_same_date_df['ref_data_v2_last_period_accrues_from_date'] - last_period_accrues_from_date_same_date_df['ref_data_v1_last_period_accrues_from_date']).dt.days

# Print the length of the DataFrame
print(f'Number of records with mismatched last period accrues from dates: {len(last_period_accrues_from_date_same_date_df)}')

# Display delta days distribution
last_period_accrues_from_date_same_date_df['delta_days'].value_counts()


In [None]:
# Next-coupon-payment-date mismatches between ref_data_v2 and ref_data_v1.
# Rows are paired on CUSIP and on the calendar day of ref_valid_from_date.
# `!=` is never true when either side is NULL, so bonds missing the date in
# one of the two sources are not counted here.
# Column aliases use the ref_data_v2_ / ref_data_v1_ prefixes because the
# pandas code below looks the columns up by those names.
next_coupon_payment_date_same_date_df = sqltodf(f'''
  SELECT
    DISTINCT ref_data_v2.cusip,
    ref_data_v2.next_coupon_payment_date AS ref_data_v2_next_coupon_payment_date,
    ref_data_v1.next_coupon_payment_date AS ref_data_v1_next_coupon_payment_date,
    ref_data_v2.interest_payment_frequency AS ref_data_v2_interest_payment_frequency,
    ref_data_v2.coupon_type AS ref_data_v2_coupon_type
  FROM
    `{project}.{dataset}.sp_flat` ref_data_v2
  INNER JOIN
    `reference_data_v1.reference_data_flat` ref_data_v1
  ON
    ref_data_v2.cusip = ref_data_v1.cusip
  WHERE
    DATE(ref_data_v2.ref_valid_from_date) = DATE(ref_data_v1.ref_valid_from_date)
    AND ref_data_v2.coupon_type IN (8, 4, 10, 17)
    AND ref_data_v2.sale_type <> 4 -- no PPs etc
    AND ref_data_v2.sale_type <> 5
    AND ref_data_v2.sec_regulation IS NULL -- same
    AND ref_data_v2.next_coupon_payment_date != ref_data_v1.next_coupon_payment_date
''')

# Convert date columns to datetime so they can be subtracted
next_coupon_payment_date_same_date_df['ref_data_v2_next_coupon_payment_date'] = pd.to_datetime(next_coupon_payment_date_same_date_df['ref_data_v2_next_coupon_payment_date'])
next_coupon_payment_date_same_date_df['ref_data_v1_next_coupon_payment_date'] = pd.to_datetime(next_coupon_payment_date_same_date_df['ref_data_v1_next_coupon_payment_date'])

# Delta in days, v2 minus v1: positive means the v2 date is later
next_coupon_payment_date_same_date_df['delta_days'] = (next_coupon_payment_date_same_date_df['ref_data_v2_next_coupon_payment_date'] - next_coupon_payment_date_same_date_df['ref_data_v1_next_coupon_payment_date']).dt.days

# Print the length of the DataFrame
print(f'Number of records with mismatched next coupon payment dates: {len(next_coupon_payment_date_same_date_df)}')

# Display delta days distribution
next_coupon_payment_date_same_date_df['delta_days'].value_counts()
