In [None]:
import pandas as pd
import gc
import psutil
import os

# # Check available memory first
# print(f"Available RAM: {psutil.virtual_memory().available / 1e9:.1f} GB")
# print(f"Total RAM: {psutil.virtual_memory().total / 1e9:.1f} GB")

# # File path
# file_path = "/Users/gil/Downloads/processed_data_yield_spread_with_similar_trades_v2_original.pkl"
# print(f"File size: {os.path.getsize(file_path) / 1e9:.1f} GB")

# # Force garbage collection settings
# gc.collect()
# gc.disable()  # Disable during load to prevent slowdown

# try:
#     print("Starting to load pickle...")
#     df = pd.read_pickle(file_path)
#     print("Successfully loaded!")
#     print(f"DataFrame shape: {df.shape}")

#     print(f"DataFrame memory usage: {df.memory_usage(deep=True).sum() / 1e9:.1f} GB")
# except MemoryError:
#     print("Not enough memory! The file is too large.")
# except Exception as e:
#     print(f"Error: {e}")
# finally:
#     gc.enable()

In [36]:
# df[df.trade_date == "2025-01-08"].to_csv("20250108_original.csv") 

### Start Here:

In [37]:
df_scratch = pd.read_csv("20250108_scratch.csv")
df_original = pd.read_csv("20250108_original.csv")

In [48]:
# First, create merged_all to get total matched records
merged_all = pd.merge(
    df_scratch,
    df_original,
    on='rtrs_control_number',
    suffixes=('_scratch', '_original'),
    how='inner'
)

print(f"Total records matched on rtrs_control_number: {len(merged_all):,}")

# Now merge only records with matching transaction types
merged_same_type = pd.merge(
    df_scratch,
    df_original,
    on=['rtrs_control_number', 'transaction_type'],
    suffixes=('_scratch', '_original'),
    how='inner'
)

print(f"Records with matching transaction types: {len(merged_same_type):,}")
print(f"This represents {len(merged_same_type)/len(merged_all)*100:.1f}% of all matched records")
print(f"Records with different transaction types: {len(merged_all) - len(merged_same_type):,}")

# Check column names in merged_same_type
print("\nColumns in merged_same_type:")
print([col for col in merged_same_type.columns if 'transaction' in col])

# Now run your difference analysis on merged_same_type
# Generate the differences DataFrame
base_columns = [col for col in df_scratch.columns if col not in ['rtrs_control_number', 'transaction_type']]
differences_same_type = []

for idx, row in merged_same_type.iterrows():
    rtrs_num = row['rtrs_control_number']
    trans_type = row['transaction_type']  # This is the merged column, not suffixed
    
    for col in base_columns:
        col_scratch = f"{col}_scratch"
        col_original = f"{col}_original"
        
        # Check if the columns exist in the merged dataframe
        if col_scratch in row.index and col_original in row.index:
            val_scratch = row[col_scratch]
            val_original = row[col_original]
            
            # Handle NaN comparison
            if pd.isna(val_scratch) and pd.isna(val_original):
                continue
            elif pd.isna(val_scratch) or pd.isna(val_original):
                differences_same_type.append({
                    'rtrs_control_number': rtrs_num,
                    'transaction_type': trans_type,
                    'column': col,
                    'scratch_value': val_scratch,
                    'original_value': val_original,
                    'difference_type': 'NaN_mismatch'
                })
            elif val_scratch != val_original:
                differences_same_type.append({
                    'rtrs_control_number': rtrs_num,
                    'transaction_type': trans_type,
                    'column': col,
                    'scratch_value': val_scratch,
                    'original_value': val_original,
                    'difference_type': 'value_mismatch'
                })
    
    if idx % 10000 == 0:
        print(f"Processed {idx} records...")

diff_df_same_type = pd.DataFrame(differences_same_type)
print(f"\nTotal differences found in same-transaction-type comparison: {len(diff_df_same_type)}")

# Show breakdown by transaction type
print("\nDifferences by transaction type:")
if len(diff_df_same_type) > 0:
    print(diff_df_same_type['transaction_type'].value_counts())

# Show summary by column
print("\nColumns with most differences:")
if len(diff_df_same_type) > 0:
    print(diff_df_same_type['column'].value_counts().head(20))

Total records matched on rtrs_control_number: 60,175
Records with matching transaction types: 59,978
This represents 99.7% of all matched records
Records with different transaction types: 197

Columns in merged_same_type:
['is_non_transaction_based_compensation_scratch', 'transaction_type', 'is_non_transaction_based_compensation_original']
Processed 0 records...
Processed 10000 records...
Processed 20000 records...
Processed 30000 records...
Processed 40000 records...
Processed 50000 records...

Total differences found in same-transaction-type comparison: 100196

Differences by transaction type:
transaction_type
I    99923
M      273
Name: count, dtype: int64

Columns with most differences:
column
Unnamed: 0          59978
issue_price          4992
days_to_call         2600
days_to_par          2583
max_qty_qdiff        2036
P_min_ago_qdiff      1840
S_min_ago_qdiff      1707
min_ys_qdiff         1696
max_ys_qdiff         1652
days_to_maturity     1632
min_ago_qdiff        1594
D_min_a

In [49]:
# 1. Get the TOTAL number of records with ANY difference
print("COMPLETE DIFFERENCE SUMMARY")
print("="*80)

# Count unique records with any difference
records_with_any_diff = diff_df_I['rtrs_control_number'].nunique()
print(f"Total I-type records compared: {len(merged_I):,}")
print(f"Records with ANY difference: {records_with_any_diff:,}")
print(f"Records that are IDENTICAL: {len(merged_I) - records_with_any_diff:,}")
print(f"Percentage with differences: {records_with_any_diff/len(merged_I)*100:.1f}%")

# 2. Show distribution of number of differences per record
print("\n\nDISTRIBUTION OF DIFFERENCES PER RECORD")
print("="*80)

diff_counts_per_record = diff_df_I['rtrs_control_number'].value_counts()
print(f"Records with 1-5 differences: {len(diff_counts_per_record[diff_counts_per_record <= 5]):,}")
print(f"Records with 6-10 differences: {len(diff_counts_per_record[(diff_counts_per_record > 5) & (diff_counts_per_record <= 10)]):,}")
print(f"Records with 11-20 differences: {len(diff_counts_per_record[(diff_counts_per_record > 10) & (diff_counts_per_record <= 20)]):,}")
print(f"Records with 21-30 differences: {len(diff_counts_per_record[(diff_counts_per_record > 20) & (diff_counts_per_record <= 30)]):,}")
print(f"Records with >30 differences: {len(diff_counts_per_record[diff_counts_per_record > 30]):,}")

# 3. Show the most common differences
print("\n\nMOST COMMON COLUMN DIFFERENCES")
print("="*80)
col_diff_summary = diff_df_I['column'].value_counts()
print(col_diff_summary.head(20))

# 4. Show examples of records with MANY differences (not just significant ones)
print("\n\nEXAMPLES OF RECORDS WITH MANY DIFFERENCES (ANY SIZE)")
print("="*80)

# Get top 10 records by total number of differences
top_diff_records = diff_counts_per_record.head(10)

for i, (rtrs_num, diff_count) in enumerate(top_diff_records.items()):
    print(f"\n{i+1}. RTRS: {rtrs_num} - {diff_count} total differences")
    
    # Get record details
    scratch_row = df_scratch_I[df_scratch_I['rtrs_control_number'] == rtrs_num].iloc[0]
    original_row = df_original_I[df_original_I['rtrs_control_number'] == rtrs_num].iloc[0]
    
    print(f"   CUSIP: {scratch_row['cusip']}")
    print(f"   Yield: {scratch_row['yield']}")
    
    # Show a sample of differences
    record_diffs = diff_df_I[diff_df_I['rtrs_control_number'] == rtrs_num]
    print("   Sample differences:")
    for _, diff in record_diffs.head(10).iterrows():
        print(f"     {diff['column']:25} | S: {str(diff['scratch_value'])[:20]:20} | O: {str(diff['original_value'])[:20]}")

# 5. Focus on specific important columns
print("\n\nSPECIFIC COLUMN ANALYSIS")
print("="*80)

important_cols = ['yield', 'yield_spread', 'ficc_ycl', 'dollar_price', 'par_traded', 
                  'treasury_rate', 'quantity', 'last_yield_spread', 'max_ys_ys']

for col in important_cols:
    col_diffs = diff_df_I[diff_df_I['column'] == col]
    if len(col_diffs) > 0:
        print(f"\n{col}: {len(col_diffs):,} records with differences")
        
        # Show percentage
        pct = len(col_diffs) / len(merged_I) * 100
        print(f"  ({pct:.2f}% of all I-type records)")

# 6. Check if certain records are COMPLETELY different
print("\n\nRECORDS WITH MOST EXTENSIVE CHANGES")
print("="*80)

# Define what percentage of columns need to differ to be "extensively different"
total_columns = len(base_columns)
extensively_different = diff_counts_per_record[diff_counts_per_record > total_columns * 0.2]  # >20% of columns

print(f"Records with >20% of columns different: {len(extensively_different)}")
if len(extensively_different) > 0:
    print("\nExamples:")
    for rtrs_num in extensively_different.head(5).index:
        diff_count = diff_counts_per_record[rtrs_num]
        pct_diff = diff_count / total_columns * 100
        print(f"  RTRS {rtrs_num}: {diff_count} differences ({pct_diff:.1f}% of columns)")

COMPLETE DIFFERENCE SUMMARY
Total I-type records compared: 59,815
Records with ANY difference: 59,815
Records that are IDENTICAL: 0
Percentage with differences: 100.0%


DISTRIBUTION OF DIFFERENCES PER RECORD
Records with 1-5 differences: 57,743
Records with 6-10 differences: 1,726
Records with 11-20 differences: 315
Records with 21-30 differences: 27
Records with >30 differences: 4


MOST COMMON COLUMN DIFFERENCES
column
Unnamed: 0          59815
issue_price          4985
days_to_call         2583
days_to_par          2566
max_qty_qdiff        2026
P_min_ago_qdiff      1834
S_min_ago_qdiff      1702
min_ys_qdiff         1694
max_ys_qdiff         1648
days_to_maturity     1625
min_ago_qdiff        1589
D_min_ago_qdiff      1559
min_ago_ago          1131
P_min_ago_ago        1118
D_min_ago_ago        1097
max_qty_ago          1087
min_ys_ago           1050
max_ys_ago           1022
S_min_ago_ago         987
issue_amount          978
Name: count, dtype: int64


EXAMPLES OF RECORDS WITH M

In [50]:
print(df_scratch[df_scratch.rtrs_control_number == 2025010801877200].to_markdown())

|       |   Unnamed: 0 |   rtrs_control_number | cusip     |   yield | is_callable   |   refund_date | accrual_date   | dated_date   |   next_sink_date |   coupon | delivery_date   | trade_date   | trade_datetime      | par_call_date   | interest_payment_frequency   | is_called   | is_non_transaction_based_compensation   | is_general_obligation   | callable_at_cav   | extraordinary_make_whole_call   | make_whole_call   | has_unexpired_lines_of_credit   | escrow_exists   | incorporated_state_code   | trade_type   |   par_traded | maturity_date   | settlement_date   | next_call_date   |   issue_amount |   maturity_amount |   issue_price |   orig_principal_amount |   max_amount_outstanding |   dollar_price | calc_date   |   purpose_sub_class |   called_redemption_type |   calc_day_cat | previous_coupon_payment_date   | instrument_primary_name                           |   purpose_class |   call_timing |   call_timing_in_part |   sink_frequency |   sink_amount_type | issue_text            

In [51]:
print(df_original[df_original.rtrs_control_number == 2025010801877200].to_markdown())

|       |   Unnamed: 0 |   rtrs_control_number | cusip     |   yield | is_callable   |   refund_date | accrual_date   | dated_date   |   next_sink_date |   coupon | delivery_date   | trade_date   | trade_datetime      | par_call_date   | interest_payment_frequency   | is_called   | is_non_transaction_based_compensation   | is_general_obligation   | callable_at_cav   | extraordinary_make_whole_call   | make_whole_call   | has_unexpired_lines_of_credit   | escrow_exists   | incorporated_state_code   | trade_type   |   par_traded | maturity_date   | settlement_date   | next_call_date   |   issue_amount |   maturity_amount |   issue_price |   orig_principal_amount |   max_amount_outstanding |   dollar_price | calc_date   |   purpose_sub_class |   called_redemption_type |   calc_day_cat | previous_coupon_payment_date   | instrument_primary_name                           |   purpose_class |   call_timing |   call_timing_in_part |   sink_frequency |   sink_amount_type | issue_text            