In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df_old = pd.read_pickle('/Users/gil/git/ficc_python/notebooks/train_model/gilmac_2024-09-18_2024-08-20_old_yield_spread_with_similar_trades.pkl')
df_old.head()

df_new = pd.read_pickle('/Users/gil/Downloads/2024-09-18_2024-08-20_processed_data_v3.pkl')
df_new.head()

# Merge on rtrs_control_number
df_merged = pd.merge(
    df_old, 
    df_new, 
    on='rtrs_control_number', 
    suffixes=('_old', '_new'),
    how='inner'
)

Unnamed: 0,rtrs_control_number,cusip,yield,is_callable,refund_date,accrual_date,dated_date,next_sink_date,coupon,delivery_date,...,D_min_ago_ago,D_min_ago_qdiff,P_min_ago_ys,P_min_ago_ttypes,P_min_ago_ago,P_min_ago_qdiff,S_min_ago_ys,S_min_ago_ttypes,S_min_ago_ago,S_min_ago_qdiff
0,2024091815398500,53340CBH3,316.9,True,NaT,2023-04-12,2023-04-12,NaT,5.0,2023-04-12,...,5.895651,5.352185,20.708705,PP,5.895351,5.352185,22.930882,SP,4.527656,0.0
1,2024091815398600,914318T59,373.9,True,NaT,2020-04-01,2020-04-01,2046-04-01,5.0,2020-04-01,...,4.415841,5.000005,66.617513,SP,4.414689,5.000005,66.617513,SP,4.414689,5.000005
2,2024091815398700,51771FAU0,418.4,True,NaT,2018-10-09,2018-10-09,2044-07-01,4.0,2018-10-09,...,4.057856,0.0,39.88311,PP,5.72073,4.17612,140.174509,SP,4.057856,0.0
3,2024091815233000,850752SL1,301.0,True,NaT,2019-03-28,2019-03-28,NaT,3.0,2019-03-28,...,4.078566,0.02321,25.271729,DS,4.078566,0.02321,14.471729,SS,4.078566,0.02321
4,2024091815233100,850752SL1,311.8,True,NaT,2019-03-28,2019-03-28,NaT,3.0,2019-03-28,...,4.078566,0.02321,25.271729,DD,4.078566,0.02321,14.471729,SD,4.078566,0.02321


In [4]:
print(f"\nMerged DataFrame shape: {df_merged.shape}")
print(f"Number of matching rtrs_control_numbers: {len(df_merged)}")

# Function to check if difference is significant (not just rounding error)
def is_significant_difference(old_val, new_val, threshold=0.01):
    """Check if numeric difference is significant (not just rounding)"""
    if pd.isna(old_val) and pd.isna(new_val):
        return False
    if pd.isna(old_val) != pd.isna(new_val):
        return True
    if isinstance(old_val, (int, float)) and isinstance(new_val, (int, float)):
        return abs(old_val - new_val) > threshold
    return old_val != new_val

# Function to compare arrays for significant differences
def arrays_significantly_different(arr1, arr2, threshold=0.01):
    """Check if arrays have significant differences"""
    if not isinstance(arr1, (list, np.ndarray)) or not isinstance(arr2, (list, np.ndarray)):
        return True
    
    arr1 = np.array(arr1)
    arr2 = np.array(arr2)
    
    if arr1.shape != arr2.shape:
        return True
    
    # Check if any element has significant difference
    if arr1.size == 0 and arr2.size == 0:
        return False
    
    diff = np.abs(arr1 - arr2)
    return np.any(diff > threshold)

# Check issue_amount for SIGNIFICANT differences
print("\n=== SIGNIFICANT ISSUE_AMOUNT DIFFERENCES ===")
issue_diff_mask = []
for idx, row in df_merged.iterrows():
    if is_significant_difference(row['issue_amount_old'], row['issue_amount_new'], threshold=0.1):
        issue_diff_mask.append(True)
    else:
        issue_diff_mask.append(False)

issue_diff = df_merged[issue_diff_mask]
print(f"Found {len(issue_diff)} SIGNIFICANT differences in issue_amount (> 0.1 change)")

# Show examples sorted by magnitude of difference
if len(issue_diff) > 0:
    issue_diff['diff_magnitude'] = abs(issue_diff['issue_amount_new'] - issue_diff['issue_amount_old'])
    issue_diff_sorted = issue_diff.sort_values('diff_magnitude', ascending=False)
    
    for i, row in issue_diff_sorted.head(10).iterrows():
        print(f"\nRTRS: {row['rtrs_control_number']}")
        print(f"  Old issue_amount: {row['issue_amount_old']:.2f}")
        print(f"  New issue_amount: {row['issue_amount_new']:.2f}")
        print(f"  Difference: {row['issue_amount_new'] - row['issue_amount_old']:.2f}")
        print(f"  Percent change: {((row['issue_amount_new'] - row['issue_amount_old']) / row['issue_amount_old'] * 100):.1f}%")

# Check trade history arrays for SIGNIFICANT differences
print("\n\n=== SIGNIFICANT TRADE_HISTORY DIFFERENCES ===")
trade_diff_examples = []
for idx, row in df_merged.iterrows():
    old_val = row['trade_history_old']
    new_val = row['trade_history_new']
    
    if arrays_significantly_different(old_val, new_val, threshold=0.1):
        trade_diff_examples.append((row['rtrs_control_number'], old_val, new_val))
    
    if len(trade_diff_examples) >= 5:
        break

print(f"Showing examples of significant trade_history differences:")
for rtrs, old_val, new_val in trade_diff_examples[:3]:
    print(f"\nRTRS: {rtrs}")
    print(f"  Old shape: {np.array(old_val).shape if isinstance(old_val, (list, np.ndarray)) else 'Not array'}")
    print(f"  New shape: {np.array(new_val).shape if isinstance(new_val, (list, np.ndarray)) else 'Not array'}")
    if isinstance(old_val, (list, np.ndarray)) and isinstance(new_val, (list, np.ndarray)):
        old_arr = np.array(old_val)
        new_arr = np.array(new_val)
        if old_arr.shape == new_arr.shape:
            diff = np.abs(old_arr - new_arr)
            max_diff_idx = np.unravel_index(np.argmax(diff), diff.shape)
            print(f"  Max difference: {diff[max_diff_idx]:.2f} at position {max_diff_idx}")
            print(f"  Old value at max diff: {old_arr[max_diff_idx]:.2f}")
            print(f"  New value at max diff: {new_arr[max_diff_idx]:.2f}")

# Check similar_trade_history for significant differences
print("\n\n=== SIGNIFICANT SIMILAR_TRADE_HISTORY DIFFERENCES ===")
similar_diff_examples = []
for idx, row in df_merged.iterrows():
    old_val = row['similar_trade_history_old']
    new_val = row['similar_trade_history_new']
    
    if arrays_significantly_different(old_val, new_val, threshold=0.1):
        similar_diff_examples.append((row['rtrs_control_number'], old_val, new_val))
    
    if len(similar_diff_examples) >= 5:
        break

print(f"Showing examples of significant similar_trade_history differences:")
for rtrs, old_val, new_val in similar_diff_examples[:3]:
    print(f"\nRTRS: {rtrs}")
    if isinstance(old_val, (list, np.ndarray)) and isinstance(new_val, (list, np.ndarray)):
        old_arr = np.array(old_val)
        new_arr = np.array(new_val)
        print(f"  Old shape: {old_arr.shape}")
        print(f"  New shape: {new_arr.shape}")
        if old_arr.shape == new_arr.shape:
            diff = np.abs(old_arr - new_arr)
            max_diff_idx = np.unravel_index(np.argmax(diff), diff.shape)
            print(f"  Max difference: {diff[max_diff_idx]:.2f} at position {max_diff_idx}")

# Look for text/categorical differences
print("\n\n=== TEXT/CATEGORICAL FIELD DIFFERENCES ===")
text_fields = ['issue_text', 'instrument_primary_name', 'series_name', 'issuer_name', 'state']

for field in text_fields:
    if f'{field}_old' in df_merged.columns and f'{field}_new' in df_merged.columns:
        diff_mask = df_merged[f'{field}_old'] != df_merged[f'{field}_new']
        # Handle NaN comparisons
        diff_mask = diff_mask | (df_merged[f'{field}_old'].isna() != df_merged[f'{field}_new'].isna())
        
        diff_df = df_merged[diff_mask]
        if len(diff_df) > 0:
            print(f"\n{field.upper()}: {len(diff_df)} differences")
            for i, row in diff_df.head(3).iterrows():
                print(f"  RTRS: {row['rtrs_control_number']}")
                print(f"    Old: '{row[f'{field}_old']}'")
                print(f"    New: '{row[f'{field}_new']}'")

# Date field differences
print("\n\n=== DATE FIELD DIFFERENCES (>1 day) ===")
date_fields = ['refund_date', 'last_refund_date', 'next_sink_date', 'previous_coupon_payment_date', 
               'next_coupon_payment_date', 'dated_date', 'accrual_date']

for field in date_fields:
    if f'{field}_old' in df_merged.columns and f'{field}_new' in df_merged.columns:
        # Convert to datetime if needed
        old_dates = pd.to_datetime(df_merged[f'{field}_old'], errors='coerce')
        new_dates = pd.to_datetime(df_merged[f'{field}_new'], errors='coerce')
        
        # Find differences > 1 day
        diff_days = abs((new_dates - old_dates).dt.days)
        significant_diff = diff_days > 1
        
        # Also check for NaT mismatches
        nat_mismatch = (old_dates.isna() != new_dates.isna())
        
        diff_mask = significant_diff | nat_mismatch
        diff_df = df_merged[diff_mask]
        
        if len(diff_df) > 0:
            print(f"\n{field}: {len(diff_df)} significant differences")
            for i, row in diff_df.head(2).iterrows():
                print(f"  RTRS: {row['rtrs_control_number']}")
                print(f"    Old: {row[f'{field}_old']}")
                print(f"    New: {row[f'{field}_new']}")

# Summary of biggest numeric differences
print("\n\n=== SUMMARY: FIELDS WITH LARGEST NUMERIC DIFFERENCES ===")
numeric_fields = ['days_to_par', 'days_to_call', 'avg_life', 'duration', 'issue_amount', 
                  'P_min_ago_ys', 'S_min_ago_ys', 'D_min_ago_ys']

for field in numeric_fields:
    if f'{field}_old' in df_merged.columns and f'{field}_new' in df_merged.columns:
        try:
            diff = abs(df_merged[f'{field}_new'] - df_merged[f'{field}_old'])
            max_diff = diff.max()
            if max_diff > 0.1:  # Only show if max difference > 0.1
                max_idx = diff.idxmax()
                row = df_merged.loc[max_idx]
                print(f"\n{field}:")
                print(f"  Max difference: {max_diff:.2f}")
                print(f"  RTRS: {row['rtrs_control_number']}")
                print(f"  Old: {row[f'{field}_old']:.2f}")
                print(f"  New: {row[f'{field}_new']:.2f}")
        except:
            pass


Merged DataFrame shape: (951750, 279)
Number of matching rtrs_control_numbers: 951750

=== SIGNIFICANT ISSUE_AMOUNT DIFFERENCES ===
Found 479391 SIGNIFICANT differences in issue_amount (> 0.1 change)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issue_diff['diff_magnitude'] = abs(issue_diff['issue_amount_new'] - issue_diff['issue_amount_old'])



RTRS: 2024090506085900
  Old issue_amount: 7.94
  New issue_amount: 0.00
  Difference: -7.94
  Percent change: -100.0%

RTRS: 2024090506086200
  Old issue_amount: 7.94
  New issue_amount: 0.00
  Difference: -7.94
  Percent change: -100.0%

RTRS: 2024090900624200
  Old issue_amount: 7.94
  New issue_amount: 0.00
  Difference: -7.94
  Percent change: -100.0%

RTRS: 2024090911192900
  Old issue_amount: 7.94
  New issue_amount: 0.00
  Difference: -7.94
  Percent change: -100.0%

RTRS: 2024090911190200
  Old issue_amount: 7.94
  New issue_amount: 0.00
  Difference: -7.94
  Percent change: -100.0%

RTRS: 2024090506086000
  Old issue_amount: 7.94
  New issue_amount: 0.00
  Difference: -7.94
  Percent change: -100.0%

RTRS: 2024090506073400
  Old issue_amount: 7.94
  New issue_amount: 0.00
  Difference: -7.94
  Percent change: -100.0%

RTRS: 2024090511634500
  Old issue_amount: 8.86
  New issue_amount: 3.70
  Difference: -5.16
  Percent change: -58.2%

RTRS: 2024090511627300
  Old issue_amoun

In [6]:
# Helper function to display side-by-side comparison
def show_side_by_side(rtrs, fields_to_show):
    """Display old vs new values for specified fields"""
    row = df_merged[df_merged['rtrs_control_number'] == rtrs].iloc[0]
    
    print(f"\n{'='*80}")
    print(f"RTRS: {rtrs}")
    print(f"{'='*80}")
    
    for field in fields_to_show:
        old_val = row.get(f'{field}_old', 'N/A')
        new_val = row.get(f'{field}_new', 'N/A')
        
        # Format values for better display
        if isinstance(old_val, float) and not pd.isna(old_val):
            old_str = f"{old_val:.2f}"
        elif pd.isna(old_val):
            old_str = "NaN"
        else:
            old_str = str(old_val)
            
        if isinstance(new_val, float) and not pd.isna(new_val):
            new_str = f"{new_val:.2f}"
        elif pd.isna(new_val):
            new_str = "NaN"
        else:
            new_str = str(new_val)
        
        print(f"\n{field}:")
        print(f"  OLD: {old_str}")
        print(f"  NEW: {new_str}")
        
        # Show difference for numeric fields
        if isinstance(old_val, (int, float)) and isinstance(new_val, (int, float)) and not pd.isna(old_val) and not pd.isna(new_val):
            diff = new_val - old_val
            if abs(diff) > 0.01:
                pct_change = (diff / old_val * 100) if old_val != 0 else float('inf')
                print(f"  DIFF: {diff:.2f} ({pct_change:+.1f}%)")

# 1. EXTREME ISSUE_AMOUNT CHANGES (100% loss)
print("\n" + "="*80)
print("EXAMPLE 1: ISSUE AMOUNT DROPPED TO ZERO")
print("="*80)

# Find examples where issue_amount went to 0
zero_issue_mask = (df_merged['issue_amount_old'] > 0) & (df_merged['issue_amount_new'] == 0)
zero_examples = df_merged[zero_issue_mask].head(3)

for idx, row in zero_examples.iterrows():
    show_side_by_side(
        row['rtrs_control_number'],
        ['issue_amount', 'cusip', 'instrument_primary_name', 'issue_text', 'series_name', 'maturity_date']
    )

# 2. LARGE ISSUE_AMOUNT REDUCTION (but not to zero)
print("\n\n" + "="*80)
print("EXAMPLE 2: LARGE ISSUE AMOUNT REDUCTION")
print("="*80)

# Find large reductions that aren't to zero
large_reduction_mask = (
    (df_merged['issue_amount_old'] > 0) & 
    (df_merged['issue_amount_new'] > 0) & 
    (abs(df_merged['issue_amount_old'] - df_merged['issue_amount_new']) > 1)
)
large_reduction = df_merged[large_reduction_mask].head(1)

for idx, row in large_reduction.iterrows():
    show_side_by_side(
        row['rtrs_control_number'],
        ['issue_amount', 'cusip', 'instrument_primary_name', 'issue_text', 'maturity_date', 'state']
    )

# 3. TEXT FIELD CHANGES
print("\n\n" + "="*80)
print("EXAMPLE 3: DRAMATIC TEXT FIELD CHANGES")
print("="*80)

# Find cases with significant text changes
text_change_mask = (
    (df_merged['issue_text_old'].str.len() < 20) & 
    (df_merged['issue_text_new'].str.len() > 30)
)
text_examples = df_merged[text_change_mask].head(2)

for idx, row in text_examples.iterrows():
    show_side_by_side(
        row['rtrs_control_number'],
        ['issue_text', 'instrument_primary_name', 'series_name', 'issuer_name']
    )

# 4. EXTREME YIELD SPREAD CHANGES
print("\n\n" + "="*80)
print("EXAMPLE 4: EXTREME YIELD SPREAD METRIC CHANGES")
print("="*80)

# Find extreme changes in yield spread metrics
extreme_ys_mask = (
    (abs(df_merged['P_min_ago_ys_new'] - df_merged['P_min_ago_ys_old']) > 1000) |
    (abs(df_merged['S_min_ago_ys_new'] - df_merged['S_min_ago_ys_old']) > 1000)
)
extreme_ys = df_merged[extreme_ys_mask].head(2)

for idx, row in extreme_ys.iterrows():
    show_side_by_side(
        row['rtrs_control_number'],
        ['P_min_ago_ys', 'S_min_ago_ys', 'D_min_ago_ys', 'issue_amount', 'yield']
    )

# 5. DATE FIELD CHANGES
print("\n\n" + "="*80)
print("EXAMPLE 5: SIGNIFICANT DATE CHANGES")
print("="*80)

# Find date field changes
date_cols = ['refund_date', 'next_coupon_payment_date', 'previous_coupon_payment_date']
for date_col in date_cols:
    # Find cases where dates differ significantly or NaT mismatch
    old_dates = pd.to_datetime(df_merged[f'{date_col}_old'], errors='coerce')
    new_dates = pd.to_datetime(df_merged[f'{date_col}_new'], errors='coerce')
    
    nat_mismatch = (old_dates.isna() != new_dates.isna())
    if nat_mismatch.any():
        example = df_merged[nat_mismatch].iloc[0]
        show_side_by_side(
            example['rtrs_control_number'],
            ['refund_date', 'last_refund_date', 'next_coupon_payment_date', 'previous_coupon_payment_date', 'maturity_date']
        )
        break

# 6. TRADE HISTORY ARRAY DIFFERENCES
print("\n\n" + "="*80)
print("EXAMPLE 6: TRADE HISTORY ARRAY CHANGES")
print("="*80)

# Find trade history differences
for idx, row in df_merged.head(1000).iterrows():
    old_th = row['trade_history_old']
    new_th = row['trade_history_new']
    
    if isinstance(old_th, np.ndarray) and isinstance(new_th, np.ndarray):
        if old_th.shape == new_th.shape and old_th.size > 0:
            diff = np.abs(old_th - new_th)
            if np.any(diff > 0.5):  # Find significant differences
                print(f"\nRTRS: {row['rtrs_control_number']}")
                print(f"Trade History Shape: {old_th.shape}")
                print("\nOLD trade_history:")
                print(old_th)
                print("\nNEW trade_history:")
                print(new_th)
                print("\nDIFFERENCE matrix:")
                print(diff)
                print(f"\nMax difference: {np.max(diff):.2f} at position {np.unravel_index(np.argmax(diff), diff.shape)}")
                
                # Also show related fields
                show_side_by_side(
                    row['rtrs_control_number'],
                    ['issue_amount', 'trade_count', 'yield']
                )
                break

# Summary statistics
print("\n\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)

# Count major difference categories
zero_issue_count = ((df_merged['issue_amount_old'] > 0) & (df_merged['issue_amount_new'] == 0)).sum()
text_expanded = ((df_merged['issue_text_old'].str.len() < 20) & (df_merged['issue_text_new'].str.len() > 30)).sum()
series_to_none = (df_merged['series_name_new'] == 'No series name').sum()
refund_date_removed = (~df_merged['refund_date_old'].isna() & df_merged['refund_date_new'].isna()).sum()

print(f"\nIssue amounts that went to zero: {zero_issue_count:,}")
print(f"Issue text expanded (short to long): {text_expanded:,}")
print(f"Series name changed to 'No series name': {series_to_none:,}")
print(f"Refund dates that were removed (became NaT): {refund_date_removed:,}")


EXAMPLE 1: ISSUE AMOUNT DROPPED TO ZERO

RTRS: 2024090900624200

issue_amount:
  OLD: 7.9377184
  NEW: 0.0

cusip:
  OLD: 34161HAV8
  NEW: 34161HAV8

instrument_primary_name:
  OLD: BDS 2019 A
  NEW: AGM Sunshine Skyway Revenue Bonds - Series 2019A

issue_text:
  OLD: BDS
  NEW: AGM Sunshine Skyway Revenue Bonds - Series 2019A

series_name:
  OLD: 2019 A
  NEW: No series name

maturity_date:
  OLD: 2038-07-01 00:00:00
  NEW: 2038-07-01 00:00:00

RTRS: 2024090911190200

issue_amount:
  OLD: 7.9377184
  NEW: 0.0

cusip:
  OLD: 34161HAV8
  NEW: 34161HAV8

instrument_primary_name:
  OLD: BDS 2019 A
  NEW: AGM Sunshine Skyway Revenue Bonds - Series 2019A

issue_text:
  OLD: BDS
  NEW: AGM Sunshine Skyway Revenue Bonds - Series 2019A

series_name:
  OLD: 2019 A
  NEW: No series name

maturity_date:
  OLD: 2038-07-01 00:00:00
  NEW: 2038-07-01 00:00:00

RTRS: 2024090911192900

issue_amount:
  OLD: 7.9377184
  NEW: 0.0

cusip:
  OLD: 34161HAV8
  NEW: 34161HAV8

instrument_primary_name:
  OLD:

In [2]:
import pandas as pd
df = pd.read_pickle('/Users/gil/git/ficc_python/notebooks/train_model/2025-06-17_2025-05-17_trades_for_all_dates_from_get_processed_data.pkl')
df.head()

Unnamed: 0,rtrs_control_number,cusip,yield,is_callable,refund_date,accrual_date,dated_date,next_sink_date,coupon,delivery_date,...,D_min_ago_ago,D_min_ago_qdiff,P_min_ago_ys,P_min_ago_ttypes,P_min_ago_ago,P_min_ago_qdiff,S_min_ago_ys,S_min_ago_ttypes,S_min_ago_ago,S_min_ago_qdiff
0,2025061720786800,112709ZM5,479.0,True,NaT,2021-10-28,2021-10-28,NaT,2.0,2021-10-28,...,4.28269,5.176094,64.320795,PP,1.531479,0.066212,42.620795,SP,4.28269,5.176094
1,2025061720786600,112709ZM5,479.0,True,NaT,2021-10-28,2021-10-28,NaT,2.0,2021-10-28,...,4.281942,5.176094,65.609381,PP,5.636037,4.69898,42.620795,SP,4.281942,5.176094
2,2025061720771000,9384292L0,457.7,True,NaT,2025-05-28,2025-05-28,NaT,0.0,2025-05-28,...,4.318919,0.045243,88.795466,PP,6.017536,4.903096,73.79918,SP,4.318919,0.045243
3,2025061720734200,846378EF0,492.3,True,NaT,2025-06-18,2025-06-18,2051-06-01,4.85,2025-06-18,...,4.012542,4.39796,-16.697114,PP,4.103359,4.698979,-17.197114,SP,4.115211,5.352185
4,2025061720633100,15114CDB7,555.1,True,NaT,2020-09-29,2020-09-01,2025-09-01,4.125,2020-09-29,...,5.57477,4.000046,93.232996,PS,5.57477,4.000046,71.465098,SS,5.853419,4.000041


In [7]:
df[df.trade_date == '2025-06-04'][:100].to_csv('/Users/gil/Downloads/2025-06-04_100_trades.csv')