In [1]:
'''
Author: Gil Shulman
Date: 2025-02-12
Last Editor: Gil Shulman
Last Edit Date: 2025-03-26
Description: RDID decoder
'''

import pandas as pd
import json
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/gil/git/ficc/creds.json'


In [44]:
"""
FINRA MBS (Mortgage-Backed Securities) RDID Decoder

This module decodes FINRA's Reference Data Identifier (RDID) fields into their component values.
RDIDs use a base-32 encoding system to represent various securities attributes in a compact format.
"""

# Base-32 character set used by FINRA: digits 0-9 and letters A-V
BASE32_CHARS = '0123456789ABCDEFGHIJKLMNOPQRSTUV'

# Lookup dictionaries for RDID components
AGENCY_MAP = {
    'F': 'Fannie Mae',
    'M': 'Freddie Mac', 
    'G': 'Ginnie Mae',
    'N': 'Ginnie Mae II',
    'S': 'SBA',
    'T': 'Test',
    'R': 'Ginnie Mae I - Serial Notes'
}

MORTGAGE_PRODUCT_MAP = {
    'C': 'Conventional',
    'H': 'Home Improvement Loans',
    'M': 'Manufactured Housing',
    'S': 'Specialized',
    '#': 'SBA or Unknown'
}

AMORTIZATION_MAP = {
    'A': 'ARM (Adjustable Rate Mortgage)',
    'B': 'Balloon',
    'G': 'Graduated Payment',
    'H': 'Hybrid ARM',
    'L': 'Level Payment',
    'T': 'Tiered Payment',
    'W': 'Weighted Average Coupon',
    'R': 'Rapid Pay',
    'V': 'Variable',
    'Y': 'Buy Down',
    'D': 'Discount'
}

# Field configurations for decoding the numeric fields
FIELD_CONFIG = {
    'coupon': {'max': 33.0, 'type': 'float', 'precision': 6},           # 0-32.999999%
    'original_maturity': {'max': 3290, 'type': 'int', 'rounding': 'up_to_10'},  # Months
    'wac': {'max': 33.0, 'type': 'float', 'precision': 1},              # 0-32.9%
    'wam': {'max': 3300, 'type': 'float'},                              # Months 
    'wala': {'max': 3290, 'type': 'int', 'rounding': 'up_to_10'},       # Months
    'average_loan_size': {'max': 3300000, 'type': 'int', 'rounding': 'down_to_25'}, # Dollars
    'ltv': {'max': 3300, 'type': 'int'}                                 # Percentage
}

def decode_rdid(rdid):
    """
    Decodes a FINRA RDID string into its component values.
    
    Args:
        rdid (str): The Reference Data Identifier string (e.g., "FLL2F1444070J06P0##")
        
    Returns:
        dict: Dictionary with decoded values for each RDID component
    """
    # Validate input
    rdid = rdid.strip() if rdid else ""
    if len(rdid) < 17:
        return {"error": f"Invalid RDID; need at least 17 chars, got {len(rdid)}. RDID={rdid}"}
    
    try:
        # Extract and decode each component from the RDID string
        result = {
            "issuing_agency": AGENCY_MAP.get(rdid[0], f"Unknown Agency ({rdid[0]})"),
            "mortgage_product": MORTGAGE_PRODUCT_MAP.get(rdid[1], f"Unknown Mortgage Product ({rdid[1]})"),
            "amortization_type": AMORTIZATION_MAP.get(rdid[2], f"Unknown Amortization Type ({rdid[2]})"),
            "coupon_rate": decode_numeric_field(rdid[3:5], 'coupon'),
            "original_maturity_months": decode_numeric_field(rdid[5:7], 'original_maturity'),
            "wac_weighted_average_coupon": decode_numeric_field(rdid[7:9], 'wac'),
            "wam_weighted_average_maturity_months": decode_numeric_field(rdid[9:11], 'wam'),
            "wala_weighted_average_loan_age_months": decode_numeric_field(rdid[11:13], 'wala'),
            "average_loan_size": decode_numeric_field(rdid[13:15], 'average_loan_size')/1140,
            "original_ltv_percent": decode_ltv_value(rdid[15:17]),
            "raw_rdid": rdid
        }
        
        return result
    except Exception as e:
        return {"error": f"Error decoding RDID: {str(e)}", "raw_rdid": rdid}

def decode_numeric_field(encoded_str, field_name):
    """
    Decodes 2-character encoded RDID fields to their numeric values.
    
    Args:
        encoded_str (str): 2-character encoded string (e.g., '30', '1G')
        field_name (str): Field name from FIELD_CONFIG keys
    
    Returns:
        float or int: Decoded numeric value, or empty string for special cases
    """
    # Handle special cases
    if encoded_str in ('##', '**'):
        return ""
    
    # Decode the base-32 value
    encoded_value = base32_decode(encoded_str)
    if encoded_value == "":
        return ""  # Invalid encoding

    # Get field configuration
    if field_name not in FIELD_CONFIG:
        return ""
        
    config = FIELD_CONFIG[field_name]
    max_value = config['max']
    
    # Calculate raw value (linear mapping from encoded range to field range)
    raw_value = (encoded_value / 1023.0) * max_value
    
    # Apply field-specific formatting/rounding
    if config['type'] == 'float':
        precision = config.get('precision', 6)
        return round(raw_value, precision)
    else:  # Integer type
        raw_int = int(round(raw_value))
        
        # Apply special rounding rules if specified
        if 'rounding' in config:
            if config['rounding'] == 'down_to_10':
                return (raw_int // 10) * 10
            elif config['rounding'] == 'up_to_10':
                return ((raw_int + 9) // 10) * 10
            elif config['rounding'] == 'down_to_25':
                return (raw_int // 25) * 25
        
        return raw_int

def base32_decode(encoded_str):
    """
    Decode a two-character base-32 encoded string to its numeric value.
    
    Args:
        encoded_str (str): Two-character string to decode
        
    Returns:
        int or str: Decoded value (0-1023), or empty string if invalid
    """
    if not encoded_str or len(encoded_str) != 2:
        return ""
        
    try:
        # Handle special cases
        if '*' in encoded_str or '#' in encoded_str:
            return ""
            
        # Convert each character to its base-32 value
        val1 = BASE32_CHARS.index(encoded_str[0])
        val2 = BASE32_CHARS.index(encoded_str[1])
        
        # Calculate the decoded value (first digit * 32 + second digit)
        return val1 * 32 + val2
    except ValueError:
        # Character not in base-32 character set
        return ""

def decode_ltv_value(encoded_str):
    """
    Decodes the LTV (Loan-to-Value) field using FINRA's segmentation.
    
    LTV ratios are bucketed into 8 categories per Regulatory Notice 21-02:
    - up to 20%: shown as 20%
    - 21% to 40%: shown as 40%
    - 41% to 60%: shown as 60%
    - 61% to 80%: shown as 80%
    - 81% to 93%: shown as 93%
    - 94% to 100%: shown as 100%
    - 101% to 120%: shown as 120%
    - 121%+: shown as 121+
    """
    # First decode the raw LTV value
    raw_ltv = decode_numeric_field(encoded_str, 'ltv')
    
    if raw_ltv == "":
        return ""
        
    try:
        # Apply FINRA's LTV buckets
        if raw_ltv <= 20:
            return 20
        elif raw_ltv <= 40:
            return 40
        elif raw_ltv <= 60:
            return 60
        elif raw_ltv <= 80:
            return 80
        elif raw_ltv <= 93:
            return 93
        elif raw_ltv <= 100:
            return 100
        elif raw_ltv <= 120:
            return 120
        else:
            return 121  # 121+ shown as 121
    except TypeError:
        # Handle case where raw_ltv is not comparable
        return ""

In [45]:
# Read original data
df = pd.read_csv('mbs_data_complete.csv')

# Decode RDIDs
rdids = []
for index, row in df.iterrows():
    rdid = row['rdid']
    decoded = decode_rdid(rdid)
    
    # Add original row values with exact column names
    result = {
        'rdid': rdid,
        'orig_agency': row.get('issuingAgency'),
        'decoded_agency': decoded.get('issuing_agency'),
        'orig_mortgage_product': row.get('mortgage_product'),  # Changed from mortgageProduct
        'decoded_mortgage_product': decoded.get('mortgage_product'),
        'orig_amortization_type': row.get('amortization'),    # Changed from amortizationType
        'decoded_amortization_type': decoded.get('amortization_type'),
        'orig_coupon_rate': row.get('coupon_rate'),          # Changed from couponRate
        'decoded_coupon_rate': decoded.get('coupon_rate'),
        'orig_maturity': row.get('maturity_date'),           # Changed from originalMaturity
        'decoded_maturity': decoded.get('original_maturity'),
        'orig_wac': row.get('wac_weighted_average_coupon'),
        'decoded_wac': decoded.get('wac_weighted_average_coupon'),
        'orig_wam': row.get('wam_weighted_average_maturity_months'),
        'decoded_wam': decoded.get('wam_weighted_average_maturity_months'),
        'orig_wala': row.get('wala_weighted_average_loan_age_months'),
        'decoded_wala': decoded.get('wala_weighted_average_loan_age_months'),
        'orig_symbol': row.get('Symbol'),                    # Added symbol
        'orig_cusip': row.get('cusip'),                      # Added cusip
        'orig_pool_number': row.get('pool_number')           # Added pool_number
    }
    rdids.append(result)

# Create DataFrame
combined_df = pd.DataFrame(rdids)

# Print as markdown table
print(combined_df.to_markdown(index=False))



| rdid              | orig_agency          | decoded_agency   | orig_mortgage_product   | decoded_mortgage_product   | orig_amortization_type   | decoded_amortization_type      |   orig_coupon_rate |   decoded_coupon_rate | orig_maturity   | decoded_maturity   |   orig_wac |   decoded_wac |   orig_wam |   decoded_wam |   orig_wala |   decoded_wala | orig_symbol   | orig_cusip   | orig_pool_number   |
|:------------------|:---------------------|:-----------------|:------------------------|:---------------------------|:-------------------------|:-------------------------------|-------------------:|----------------------:|:----------------|:-------------------|-----------:|--------------:|-----------:|--------------:|------------:|---------------:|:--------------|:-------------|:-------------------|
| MSL7A3C86003C000G | Fed Home Ln Mtg Corp | Freddie Mac      | S                       | Specialized                | L                        | Level Payment                  |              

In [46]:
# Read the CSV with the original data
df = pd.read_csv('fnma_mortgage_data.csv')
print(len(df))
# Create list to store results
results = []

# Process each RDID in the CSV
for _, row in df.iterrows():
    rdid = row['RDID']
    decoded = decode_rdid(rdid)    
    results.append(decoded)

print(pd.DataFrame(results).to_markdown())

10
|    | issuing_agency   | mortgage_product     | amortization_type   |   coupon_rate |   original_maturity_months |   wac_weighted_average_coupon |   wam_weighted_average_maturity_months |   wala_weighted_average_loan_age_months |   average_loan_size | original_ltv_percent   | raw_rdid          | error                                                                   |
|---:|:-----------------|:---------------------|:--------------------|--------------:|---------------------------:|------------------------------:|---------------------------------------:|----------------------------------------:|--------------------:|:-----------------------|:------------------|:------------------------------------------------------------------------|
|  0 | Fannie Mae       | Manufactured Housing | Balloon             |       2.54839 |                        120 |                           4.3 |                                61.2903 |                                      20 |             2263.71 | 