In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV read later in
# this notebook is loaded from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import cudf
import cuml
import cupy as cp

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


# Filter warnings.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Loading data: read the accepted-loans CSV from the mounted Drive into a cuDF DataFrame.
df_accepted = cudf.read_csv("/content/drive/MyDrive/LendingClub Data/accepted_2007_to_2018Q4.csv")

In [None]:
# Report the row and column counts of the loaded frame.
print(f"The Accepted data has {df_accepted.shape[0]} rows and {df_accepted.shape[1]} columns. ")

In [None]:
def field_specific_null_analysis(df):
    """
    Summarise how much of each column in ``df`` is null.

    Only pandas-style calls are used on ``df`` (``columns``, ``isnull``,
    ``dtype``, ``len``), so both pandas and cuDF frames are accepted.

    Args:
        df: DataFrame to profile.

    Returns:
        pandas.DataFrame with one row per column of ``df`` and the columns
        ``column_name``, ``data_type``, ``null_percentage`` and
        ``filled_percentage`` (percentages rounded to 2 decimals), sorted by
        ``null_percentage`` descending. Percentages are NaN when ``df`` has
        no rows.

    Note:
        The table is returned rather than displayed, like
        ``comprehensive_missing_analysis``. In a notebook, leave the call as
        the last expression of a cell or wrap it in ``display(...)``.
    """
    output_columns = ['column_name', 'data_type', 'null_percentage', 'filled_percentage']
    total_rows = len(df)
    null_stats = []

    for col in df.columns:
        # int() normalises whatever scalar type the backend's sum() returns.
        null_count = int(df[col].isnull().sum())
        # Avoid 0/0 on an empty frame; there is no meaningful percentage.
        null_pct = (null_count / total_rows) * 100 if total_rows else float('nan')
        filled_pct = 100 - null_pct

        null_stats.append({
            'column_name': col,
            'data_type': str(df[col].dtype),
            'null_percentage': round(null_pct, 2),
            'filled_percentage': round(filled_pct, 2)
        })

    # Passing columns= keeps the schema stable even when df has no columns.
    null_df = pd.DataFrame(null_stats, columns=output_columns)
    return null_df.sort_values(by='null_percentage', ascending=False).reset_index(drop=True)

In [None]:
# Per-column null / filled percentages for the accepted-loans frame.
field_specific_null_analysis(df_accepted)

In [None]:
# Share of each loan_status value, expressed in percent.
df_accepted['loan_status'].value_counts(normalize=True) * 100

In [None]:
def comprehensive_missing_analysis(df, target_col='loan_status'):
    """
    Compare target outcomes between rows where a column is missing and rows
    where it is present, for every column except ``target_col``.

    For each column the function reports missing / non-null counts and
    percentages, the "default" and "late" rates (in percent) inside the
    missing and non-missing row groups, their differences, and a rule-based
    recommendation string.

    Args:
        df: DataFrame exposing the pandas-style API used here (pandas or cuDF).
        target_col: Name of the status column the rates are computed from.

    Returns:
        pandas.DataFrame with one row per analysed column, sorted by
        ``missing_pct`` descending. Columns without missing values carry
        ``None`` for all rate fields and the recommendation
        ``'NO_MISSING - No action needed'``. A rate whose row group is empty
        (e.g. the non-missing group of an all-null column) is NaN.
    """
    output_columns = [
        'column', 'missing_count', 'missing_pct', 'non_null_counts', 'non_null_pct',
        'missing_default_rate', 'non_missing_default_rate', 'default_rate_diff',
        'missing_late_rate', 'non_missing_late_rate', 'late_rate_diff',
        'recommendation',
    ]

    # Status values counted as "default" and as "late" respectively.
    default_statuses = ['Charged Off', 'Default', 'Late (31-120 days)', 'Does not meet the credit policy. Status:Charged Off']
    late_statuses = ['Late (16-30 days)', 'Late (31-120 days)']

    def _pct(part, whole):
        # Percentage that degrades to NaN instead of dividing by zero.
        return (part / whole) * 100 if whole else float('nan')

    total_rows = len(df)

    # The target flags do not depend on the column being analysed, so they are
    # computed once here instead of four times per column inside the loop.
    is_default = df[target_col].isin(default_statuses)
    is_late = df[target_col].isin(late_statuses)
    total_default = int(is_default.sum())
    total_late = int(is_late.sum())

    overall_default_rate = _pct(total_default, total_rows)
    overall_late_rate = _pct(total_late, total_rows)

    results = []
    for col in df.columns:
        if col == target_col:
            continue

        missing_mask = df[col].isnull()
        missing_count = int(missing_mask.sum())

        if missing_count == 0:
            # Nothing to compare: there is no "missing" group for this column.
            results.append({
                'column': col,
                'missing_count': 0,
                'missing_pct': 0,
                'non_null_counts': total_rows,
                'non_null_pct': 100,
                'missing_default_rate': None,
                'non_missing_default_rate': None,
                'default_rate_diff': None,
                'missing_late_rate': None,
                'non_missing_late_rate': None,
                'late_rate_diff': None,
                'recommendation': 'NO_MISSING - No action needed'
            })
            continue

        non_missing_count = total_rows - missing_count
        missing_pct = _pct(missing_count, total_rows)
        non_missing_pct = _pct(non_missing_count, total_rows)

        # Defaults/lates among the missing rows; the non-missing counts follow
        # by subtraction from the totals, so no second masked pass is needed.
        missing_default = int((is_default & missing_mask).sum())
        missing_late = int((is_late & missing_mask).sum())

        missing_default_rate = _pct(missing_default, missing_count)
        missing_late_rate = _pct(missing_late, missing_count)
        non_missing_default_rate = _pct(total_default - missing_default, non_missing_count)
        non_missing_late_rate = _pct(total_late - missing_late, non_missing_count)

        default_diff = missing_default_rate - non_missing_default_rate
        late_diff = missing_late_rate - non_missing_late_rate

        # Decision rules, evaluated in order; thresholds are percentage points.
        if missing_pct >= 70:
            recommendation = "DROP - Too many missing values"
        elif missing_count < 100:  # Very few missing
            recommendation = "DROP_ROWS - Very few missing observations"
        elif abs(default_diff) > 2 or abs(late_diff) > 2:
            recommendation = "TREAT_AS_CATEGORY - Missing is informative"
        elif abs(missing_default_rate - overall_default_rate) < 1 and abs(missing_late_rate - overall_late_rate) < 1:
            recommendation = "IMPUTE - Missing appears random"
        else:
            recommendation = "INVESTIGATE - Unclear pattern"

        results.append({
            'column': col,
            'missing_count': missing_count,
            'missing_pct': round(missing_pct, 2),
            'non_null_counts': non_missing_count,
            'non_null_pct': round(non_missing_pct, 2),
            'missing_default_rate': round(missing_default_rate, 2),
            'non_missing_default_rate': round(non_missing_default_rate, 2),
            'default_rate_diff': round(default_diff, 2),
            'missing_late_rate': round(missing_late_rate, 2),
            'non_missing_late_rate': round(non_missing_late_rate, 2),
            'late_rate_diff': round(late_diff, 2),
            'recommendation': recommendation
        })

    # columns= keeps sort_values working when there are no non-target columns.
    summary = pd.DataFrame(results, columns=output_columns)
    return summary.sort_values('missing_pct', ascending=False).reset_index(drop=True)

In [None]:
def detailed_column_missing_analysis(df, column, target_col='loan_status'):
    """
    Print a missing-versus-present report of the target for a single column.

    The report lists the missing / non-missing row counts and, when at least
    one value is missing, the target value distribution and the "default" and
    "late" rates (in percent) inside each of the two row groups, followed by a
    verdict based on a 2 percentage point gap. Nothing is returned.

    Args:
        df: DataFrame exposing the pandas-style API used here.
        column: Name of the column whose missing values are inspected.
        target_col: Name of the status column the rates are computed from.
    """
    closing_rule = "=" * 60
    print(f"=== DETAILED ANALYSIS FOR: {column} ===")

    is_null = df[column].isnull()
    missing_count = is_null.sum()
    total_rows = len(df)
    present_count = total_rows - missing_count
    missing_pct = (missing_count / total_rows) * 100

    print(f"Missing Values: {missing_count:,} ({missing_pct:.2f}%)")
    print(f"Non-Missing Values: {present_count:,} ({100 - missing_pct:.2f}%)")

    # Without missing values there are no groups to compare.
    if missing_count <= 0:
        print(closing_rule)
        return

    # Target values split by whether `column` is missing on that row.
    target_if_missing = df.loc[is_null, target_col]
    target_if_present = df.loc[df[column].notnull(), target_col]

    print("\n--- LOAN STATUS DISTRIBUTION FOR MISSING VALUES ---")
    print((target_if_missing.value_counts(normalize=True) * 100).round(2))

    print("\n--- LOAN STATUS DISTRIBUTION FOR NON-MISSING VALUES ---")
    print((target_if_present.value_counts(normalize=True) * 100).round(2))

    # Status values counted as "default" and as "late" respectively.
    default_statuses = ['Charged Off', 'Default', 'Late (31-120 days)', 'Does not meet the credit policy. Status:Charged Off']
    late_statuses = ['Late (16-30 days)', 'Late (31-120 days)']

    missing_default_rate = (target_if_missing.isin(default_statuses).sum() / missing_count) * 100
    non_missing_default_rate = (target_if_present.isin(default_statuses).sum() / present_count) * 100
    missing_late_rate = (target_if_missing.isin(late_statuses).sum() / missing_count) * 100
    non_missing_late_rate = (target_if_present.isin(late_statuses).sum() / present_count) * 100

    default_gap = missing_default_rate - non_missing_default_rate
    late_gap = missing_late_rate - non_missing_late_rate

    print(f"\n--- DEFAULT RATE COMPARISON ---")
    print(f"Default Rate (Missing): {missing_default_rate:.2f}%")
    print(f"Default Rate (Non-Missing): {non_missing_default_rate:.2f}%")
    print(f"Difference: {default_gap:.2f} percentage points")

    print(f"\n--- LATE RATE COMPARISON ---")
    print(f"Late Rate (Missing): {missing_late_rate:.2f}%")
    print(f"Late Rate (Non-Missing): {non_missing_late_rate:.2f}%")
    print(f"Difference: {late_gap:.2f} percentage points")

    informative = abs(default_gap) > 2 or abs(late_gap) > 2
    if informative:
        print("SIGNIFICANT DIFFERENCE - Missing values are informative!")
    else:
        print("Similar default and late rates - Missing appears random")

    print(closing_rule)

In [None]:
# Comprehensive analysis of ALL columns.
# The resulting table is kept in missing_analysis_df and reused by later cells.
missing_analysis_df = comprehensive_missing_analysis(df_accepted)
display(missing_analysis_df)

In [None]:
# Print the missing-vs-present breakdown for the bc_util column.
detailed_column_missing_analysis(df_accepted,"bc_util")

In [None]:
# Reuse the missing_analysis_df computed in the "Comprehensive analysis" cell
# above; recomputing it would repeat the full per-column scan of df_accepted
# for an identical table.
# Series.corr skips rows where either difference is missing (columns that have
# no missing values carry no rate differences).
correlation = missing_analysis_df['default_rate_diff'].corr(missing_analysis_df['late_rate_diff'])
print(f"Correlation between Default Rate Difference and Late Rate Difference: {correlation:.2f}")

In [None]:
def analyze_data_sections(df_accepted: "cudf.DataFrame",
                          missing_analysis_df: pd.DataFrame,
                          id_column: str = 'id',
                          tolerance: int = 1000,
                          min_cols_in_section: int = 2):
    """
    Report disjoint row "sections" derived from shared missing-value patterns.

    Columns whose ``non_null_counts`` lie within ``tolerance`` of a seed
    column are grouped together. For each group, taken in the order the
    columns appear in ``missing_analysis_df``, the rows where every column of
    the group is non-null and that were not claimed by an earlier group form a
    section. The function prints the size and defining columns of each
    section plus the number of rows left over; it returns nothing.

    Rows are tracked with boolean masks aligned to ``df_accepted``, so each
    row is counted exactly once even when ``id_column`` holds duplicate or
    null values.

    Args:
        df_accepted: The data to analyse (cuDF; any frame with the same
            pandas-style API also works).
        missing_analysis_df: Table with the columns 'column',
            'non_null_counts' and 'missing_pct'. Sort it by 'non_null_counts'
            ascending so the sparsest column groups are processed first.
        id_column: Name of the identifier column; only its presence in
            ``df_accepted`` is checked.
        tolerance: Maximum difference in non_null_counts between a seed
            column and another column for them to share a group.
        min_cols_in_section: Minimum number of columns a group needs to
            define a section.

    Raises:
        ValueError: If ``missing_analysis_df`` lacks a required column or
            ``id_column`` is not a column of ``df_accepted``.
    """
    required_cols = ['column', 'non_null_counts', 'missing_pct']
    if not all(col in missing_analysis_df.columns for col in required_cols):
        raise ValueError(f"missing_analysis_df must contain the columns: {required_cols}")

    if id_column not in df_accepted.columns:
        raise ValueError(f"The specified id_column '{id_column}' does not exist in df_accepted.")

    print(f"Starting section analysis with tolerance={tolerance} and min_cols_in_section={min_cols_in_section}.\n")

    # --- Step 1: Group columns based on similar non_null_counts ---
    analysis_rows = missing_analysis_df.to_dict('records')
    grouped = set()
    column_groups = []

    for i, seed_row in enumerate(analysis_rows):
        if i in grouped:
            continue
        grouped.add(i)

        seed_count = seed_row['non_null_counts']
        if seed_count == 0:
            # A column with no values at all cannot define a section.
            continue

        current_group = [seed_row]
        for j in range(i + 1, len(analysis_rows)):
            if j in grouped:
                continue
            # Membership is measured against the seed, not the previous member.
            if abs(analysis_rows[j]['non_null_counts'] - seed_count) <= tolerance:
                current_group.append(analysis_rows[j])
                grouped.add(j)

        if len(current_group) >= min_cols_in_section:
            column_groups.append(current_group)

    print(f"Found {len(column_groups)} column groups meeting the criteria.")

    # --- Step 2: Identify disjoint sets of samples for each column group ---
    unassigned = None  # boolean Series; None means no row has been claimed yet
    section_results = []

    for group in column_groups:
        column_names = [row['column'] for row in group]

        mask = df_accepted[column_names[0]].notnull()
        for col in column_names[1:]:
            mask = mask & df_accepted[col].notnull()

        # Keep only rows that no earlier (higher-priority) section claimed.
        if unassigned is not None:
            mask = mask & unassigned

        num_new_samples = int(mask.sum())
        if num_new_samples > 0:
            section_results.append({
                'section_index': len(section_results) + 1,
                'num_samples': num_new_samples,
                'defining_columns': [f"{row['column']} [{row['missing_pct']}%]" for row in group]
            })
            unassigned = ~mask if unassigned is None else (unassigned & ~mask)

    # --- Step 3: Report the results ---
    print("\n--- Section Analysis Results ---\n")
    if not section_results:
        print("No sections were identified based on the provided criteria.")

    for result in section_results:
        print(f"Section {result['section_index']}:")
        print(f"  - Number of unique samples: {result['num_samples']:,}")
        print("  - Defining Columns (and their total missing %):")
        for col_info in result['defining_columns']:
            print(f"    - {col_info}")
        print("-" * 30)

    remaining_samples = len(df_accepted) if unassigned is None else int(unassigned.sum())
    if remaining_samples > 0:
        print("Remaining Samples:")
        print(f"  - Number of samples: {remaining_samples:,}")
        print("  - These samples did not exclusively fit into the high-missingness patterns above.")
        print("-" * 30)

In [None]:
# The missing_analysis_df must be sorted by non_null_counts ascending
# so that the columns with the fewest values are grouped first.
missing_analysis_df_sorted = missing_analysis_df.sort_values('non_null_counts', ascending=True).reset_index(drop=True)

# Run the analysis (prints the section report; nothing is returned)
analyze_data_sections(df_accepted, missing_analysis_df_sorted, id_column='id')

In [None]:
import pandas as pd
import cudf
import cupy as cp

def create_data_sections(df_accepted: "cudf.DataFrame",
                         missing_analysis_df: pd.DataFrame,
                         id_column: str = 'id',
                         tolerance: int = 1000,
                         min_cols_in_section: int = 2):
    """
    Split ``df_accepted`` into disjoint sections based on missing-value patterns.

    Columns whose ``non_null_counts`` lie within ``tolerance`` of a seed
    column are grouped together. For each group, taken in the order the
    columns appear in ``missing_analysis_df``, the rows where every column of
    the group is non-null and that were not claimed by an earlier group become
    one section.

    Rows are selected with boolean masks aligned to ``df_accepted`` rather
    than by merging on ``id_column``. Each row therefore lands in exactly one
    output frame, rows keep their original relative order, and duplicate or
    null IDs cannot multiply or drop rows.

    Args:
        df_accepted: The data to split (cuDF; any frame with the same
            pandas-style API also works).
        missing_analysis_df: Table with the columns 'column',
            'non_null_counts' and 'missing_pct'. Sort it by 'non_null_counts'
            ascending so the sparsest column groups get the highest priority.
        id_column: Name of the identifier column; only its presence in
            ``df_accepted`` is checked.
        tolerance: Maximum difference in non_null_counts between a seed
            column and another column for them to share a group.
        min_cols_in_section: Minimum number of columns needed to define a section.

    Returns:
        tuple: ``(sections, remaining_df)`` where
        - ``sections`` is a list of dicts with the keys 'priority' (int,
          1 is highest), 'section_info' (dict with 'num_samples' and
          'defining_columns') and 'section_df' (the rows of that section, all
          columns of ``df_accepted`` in their original order, fresh index);
        - ``remaining_df`` holds the rows that fit no section.

    Raises:
        ValueError: If ``missing_analysis_df`` lacks a required column or
            ``id_column`` is not a column of ``df_accepted``.
    """
    # --- Initial Validation and Setup ---
    required_cols = ['column', 'non_null_counts', 'missing_pct']
    if not all(col in missing_analysis_df.columns for col in required_cols):
        raise ValueError(f"missing_analysis_df must contain the columns: {required_cols}")

    if id_column not in df_accepted.columns:
        raise ValueError(f"The specified id_column '{id_column}' does not exist in df_accepted.")

    print(f"Starting section creation with tolerance={tolerance} and min_cols_in_section={min_cols_in_section}.\n")

    # --- Step 1: Group columns based on similar non_null_counts ---
    analysis_rows = missing_analysis_df.to_dict('records')
    grouped = set()
    column_groups = []

    for i, seed_row in enumerate(analysis_rows):
        if i in grouped:
            continue
        grouped.add(i)

        seed_count = seed_row['non_null_counts']
        if seed_count == 0:
            # A column with no values at all cannot define a section.
            continue

        current_group = [seed_row]
        for j in range(i + 1, len(analysis_rows)):
            if j in grouped:
                continue
            # Membership is measured against the seed, not the previous member.
            if abs(analysis_rows[j]['non_null_counts'] - seed_count) <= tolerance:
                current_group.append(analysis_rows[j])
                grouped.add(j)

        if len(current_group) >= min_cols_in_section:
            column_groups.append(current_group)

    print(f"Found {len(column_groups)} potential column groups.")

    # --- Step 2: Identify disjoint row sets and create DataFrames ---
    unassigned = None  # boolean Series; None means no row has been claimed yet
    sections_output = []

    for group in column_groups:
        column_names = [row['column'] for row in group]

        mask = df_accepted[column_names[0]].notnull()
        for col in column_names[1:]:
            mask = mask & df_accepted[col].notnull()

        # Keep only rows that no earlier (higher-priority) section claimed.
        if unassigned is not None:
            mask = mask & unassigned

        num_new_samples = int(mask.sum())
        if num_new_samples > 0:
            # Boolean indexing already yields a new frame; reset the index so
            # each section starts from a clean 0..n-1 index.
            section_df = df_accepted[mask].reset_index(drop=True)

            sections_output.append({
                'priority': len(sections_output) + 1,
                'section_info': {
                    'num_samples': num_new_samples,
                    'defining_columns': [f"{row['column']} [{row['missing_pct']}%]" for row in group]
                },
                'section_df': section_df
            })

            unassigned = ~mask if unassigned is None else (unassigned & ~mask)

    # --- Step 3: Create DataFrame for remaining samples ---
    if unassigned is None:
        # No section was created, so every row is "remaining".
        remaining_df = df_accepted.copy().reset_index(drop=True)
    else:
        remaining_df = df_accepted[unassigned].reset_index(drop=True)

    print(f"\nSuccessfully created {len(sections_output)} data sections.")
    print(f"Found {len(remaining_df)} remaining samples.")

    return sections_output, remaining_df

In [None]:
# Call the function to get the sections and the remainder.
# data_sections is a list of section dicts ordered by priority; df_remaining
# holds the rows that were not placed in any section.
data_sections, df_remaining = create_data_sections(
    df_accepted,
    missing_analysis_df_sorted,
    id_column='id'
)

In [None]:
# --- Now you can work with the results ---

print("\n--- Accessing the Created Sections ---")

# Check the number of sections created
print(f"Total sections created: {len(data_sections)}")

# Access the section
# NOTE: SECTION_NUMBER is a zero-based position in data_sections, so the
# section shown has priority SECTION_NUMBER + 1 (priorities start at 1).
SECTION_NUMBER = 7

# Guard the index itself: a non-empty list can still be shorter than
# SECTION_NUMBER + 1, which would raise IndexError.
if 0 <= SECTION_NUMBER < len(data_sections):
    section = data_sections[SECTION_NUMBER]
    print(f"\nPriority of section number {SECTION_NUMBER} : {section['priority']}")
    print(f"Number of samples in section number {SECTION_NUMBER} : {section['section_info']['num_samples']}")
    print(f"Defining columns for section number {SECTION_NUMBER}:")
    for col_info in section['section_info']['defining_columns']:
        print(f"  - {col_info}")

    print(f"\n Section number {SECTION_NUMBER} DataFrame info:")
    section['section_df'].info()
else:
    print(f"\nNo section at position {SECTION_NUMBER}: only {len(data_sections)} section(s) were created.")

In [None]:
# Check the remaining samples DataFrame (rows not placed in any section):
# print its shape, then its column-level info.
print("\n--- Remaining Samples DataFrame ---")
print(f"Shape of the remaining samples DataFrame: {df_remaining.shape}")
df_remaining.info()

In [None]:
def get_section_overlap_matrix(data_sections: list,
                               df_remaining: "cudf.DataFrame",
                               id_column: str = 'id') -> pd.DataFrame:
    """
    Count the IDs shared between every pair of data sections.

    Cell (i, j) of the result holds the number of distinct ``id_column``
    values present in both section i and section j; the diagonal (i, i) holds
    the number of distinct IDs in section i. The matrix is symmetric.

    Args:
        data_sections (list): Section dicts as produced by
            create_data_sections; each needs the keys 'priority' and
            'section_df'.
        df_remaining: Frame of rows not in any section. It is appended under
            the label "Remaining" unless it is empty.
        id_column (str): The name of the identifier column.

    Returns:
        pd.DataFrame: Integer (int64) overlap matrix labelled
        "Section <priority>" (plus "Remaining") on both axes.
    """
    # --- Prepare a list of all DataFrames and their labels ---
    all_dfs = [sec['section_df'] for sec in data_sections]
    labels = [f"Section {sec['priority']}" for sec in data_sections]

    if not df_remaining.empty:
        all_dfs.append(df_remaining)
        labels.append("Remaining")

    num_sections = len(all_dfs)
    print(f"Analyzing overlap across {num_sections} total dataframes...")

    def _id_set(frame):
        # cuDF columns are moved to host memory first; pandas columns are used as-is.
        ids = frame[id_column]
        if hasattr(ids, 'to_pandas'):
            ids = ids.to_pandas()
        return set(ids)

    # --- Pre-fetch all IDs and store them in sets for efficient intersection ---
    id_sets = [_id_set(df) for df in all_dfs]

    # --- Build the matrix as an integer array ---
    # An empty DataFrame created with dtype=int is filled with NaN and so ends
    # up as float; starting from an int64 array keeps the counts integral.
    counts = np.zeros((num_sections, num_sections), dtype=np.int64)

    for i in range(num_sections):
        # The diagonal contains the number of distinct IDs in the section
        counts[i, i] = len(id_sets[i])
        for j in range(i + 1, num_sections):
            # Intersection size is symmetric, so compute it once per pair
            shared = len(id_sets[i] & id_sets[j])
            counts[i, j] = shared
            counts[j, i] = shared

    print("Overlap analysis complete.")
    return pd.DataFrame(counts, index=labels, columns=labels)

In [None]:
# --- Step 2: Calculate the overlap data matrix ---
# Pairwise shared-ID counts between all sections and the remaining rows.
overlap_df = get_section_overlap_matrix(data_sections, df_remaining, id_column='id')

In [None]:
# Show the overlap matrix; off-diagonal cells count IDs shared by two sections.
print("\nRaw Overlap Data:")
display(overlap_df)

In [None]:
import numpy as np

def verify_total_sample_count(overlap_matrix: pd.DataFrame,
                                original_df: cudf.DataFrame):
    """
    Check that the section sizes add up to the size of the original data.

    The diagonal of ``overlap_matrix`` is summed and compared with the number
    of rows in ``original_df``; the outcome is printed and nothing is
    returned.

    Args:
        overlap_matrix (pd.DataFrame): The matrix from get_section_overlap_matrix,
                                       whose diagonal holds each section's size.
        original_df (cudf.DataFrame): The original, complete DataFrame.
    """
    # --- Calculation ---
    section_total = int(np.diag(overlap_matrix.values).sum())
    total_original_samples = len(original_df)
    totals_match = section_total == total_original_samples

    # --- Verification and Reporting ---
    print("--- Final Data Integrity Check ---")
    print(f"Sum of all samples in all created sections: {section_total:,}")
    print(f"Total samples in the original DataFrame:    {total_original_samples:,}")

    if not totals_match:
        # Positive when rows are unaccounted for, negative when counted twice.
        difference = total_original_samples - section_total
        print(f"\nERROR: There is a mismatch of {difference:,} samples.")
        print("This indicates that some data was lost or duplicated during the process.")
    else:
        print("\nSUCCESS: The total number of samples matches perfectly.")
        print("This confirms that every sample from the original DataFrame has been accounted for.")
    print("------------------------------------")


# --- How to run the verification ---
# (Assuming overlap_df and df_accepted are already created)
# Prints SUCCESS when the diagonal of overlap_df sums to len(df_accepted).

verify_total_sample_count(overlap_df, df_accepted)

In [None]:
import pandas as pd
import cudf

def analyze_section_nulls(data_sections: list, df_remaining: "cudf.DataFrame", section_number: int = None):
    """
    Display a per-column null analysis for one or all data sections.

    For each selected frame the function prints a header and displays a table
    with the null count and null percentage of every column, computed
    relative to the number of rows of that frame and sorted by null
    percentage descending.

    Args:
        data_sections (list): Section dicts from create_data_sections; each
            needs the keys 'priority' and 'section_df'.
        df_remaining: Frame of rows not in any section. It is analysed under
            the label "Remaining" unless it is empty.
        section_number (int, optional): Priority of the single section to
            analyse. If None, all sections and the remaining data are
            analysed. The remaining data has no priority, so it is only
            included when this is None. If no section has this priority, a
            notice listing the available priorities is printed.
    """
    table_columns = ['Column', 'Null Count', 'Null Percentage (%)']

    # --- Collect every frame together with its label and priority ---
    labelled_frames = [
        {'label': f"Section {section['priority']}", 'df': section['section_df'], 'priority': section['priority']}
        for section in data_sections
    ]
    if not df_remaining.empty:
        labelled_frames.append({'label': 'Remaining', 'df': df_remaining, 'priority': None})

    # --- Narrow down to the requested section, if any ---
    if section_number is None:
        selected = labelled_frames
    else:
        selected = [entry for entry in labelled_frames if entry['priority'] == section_number]
        if not selected:
            # Say so explicitly instead of finishing without any output.
            available = [section['priority'] for section in data_sections]
            print(f"No section with priority {section_number} was found. Available priorities: {available}")
            return

    # --- Analyse each selected frame ---
    for entry in selected:
        df = entry['df']
        total_rows = len(df)

        print("="*70)
        print(f"Null Value Analysis for: {entry['label']} ({total_rows:,} Samples)")
        print("="*70)

        if total_rows == 0:
            print("Section is empty. No analysis to perform.")
            continue

        null_stats = []
        for col in df.columns:
            # int() turns the backend's scalar into a plain Python number.
            null_count = int(df[col].isnull().sum())
            null_stats.append({
                'Column': col,
                'Null Count': null_count,
                'Null Percentage (%)': round((null_count / total_rows) * 100, 2)
            })

        # pandas table for display; columns with the most nulls come first.
        analysis_table = pd.DataFrame(null_stats, columns=table_columns).sort_values(
            by='Null Percentage (%)',
            ascending=False
        ).reset_index(drop=True)

        # display() gives rich table rendering in a notebook.
        display(analysis_table)

In [None]:
# analyze_section_nulls(data_sections, df_remaining) # Analyze all sections
analyze_section_nulls(data_sections, df_remaining, section_number=11) # Analyze only the section whose priority is 11

In [None]:
# ==============================================================================
# Exhibit A-1: Risk-Adjusted Pricing Forensics
# ==============================================================================

# --- Import necessary libraries for analysis ---
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# ------------------------------------------------------------------------------
# STEP 0: DEFINE THE COHORTS FOR ANALYSIS
# ------------------------------------------------------------------------------
# Based on our approved strategy, we are focusing on the first 7 sections
# as they are statistically significant and represent distinct business processes.

# Cohort labels in priority order: the label at position k names the section
# stored at data_sections[k].
cohort_labels = [
    "1: Hardship",
    "2: Settlement",
    "3: Joint App (Full Profile)",
    "4: Joint App (Income Only)",
    "5: Individual (Enriched Data)",
    "6: Individual (Bankcard Data)",
    "7: Individual (Legacy Data)",
]

# Map each label to the DataFrame of its section.
cohort_definitions = {
    label: data_sections[position]['section_df']
    for position, label in enumerate(cohort_labels)
}

print(f"Successfully defined {len(cohort_definitions)} cohorts for forensic analysis.")

In [None]:
# ------------------------------------------------------------------------------
# STEP 1: VISUAL DIAGNOSTICS - Linearity Check
# ------------------------------------------------------------------------------

def create_diagnostic_scatter_plots(cohorts: dict,
                                    max_points: int = 20_000,
                                    random_state: int = 42,
                                    n_cols: int = 3):
    """
    Draw a grid of FICO-vs-interest-rate scatter plots, one panel per cohort,
    each with an OLS trendline, to visually inspect the linearity assumption.

    Args:
        cohorts: Mapping of cohort name to a DataFrame (cuDF) containing the
            columns 'fico_range_high' and 'int_rate'.
        max_points: Upper bound on the number of points plotted per cohort.
            Larger cohorts are randomly down-sampled to this size so the
            figure stays renderable. Pass None to plot every row.
        random_state: Seed for the down-sampling, so reruns give the same figure.
        n_cols: Number of panels per row; the row count follows from the
            number of cohorts.

    Returns:
        None. The figure is shown with ``fig.show()``.
    """
    print("\nExecuting Step 1: Generating visual diagnostics for linearity assumption...")

    n_cohorts = len(cohorts)
    if n_cohorts == 0:
        print("  -> No cohorts supplied; nothing to plot.")
        return

    # Ceiling division: just enough rows to hold every cohort.
    n_rows = -(-n_cohorts // n_cols)
    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=list(cohorts.keys()),
        vertical_spacing=0.1,
        horizontal_spacing=0.08
    )

    for position, (name, df_cudf) in enumerate(cohorts.items()):
        grid_row, grid_col = divmod(position, n_cols)
        row, col = grid_row + 1, grid_col + 1  # plotly grid positions are 1-based

        # Drop incomplete pairs before sampling so the cap applies to usable points.
        points = df_cudf[['fico_range_high', 'int_rate']].dropna()
        if max_points is not None and len(points) > max_points:
            points = points.sample(n=max_points, random_state=random_state)
        df_pd = points.to_pandas()

        if df_pd.empty:
            print(f"  -> Skipping cohort '{name}': no rows with both FICO and interest rate.")
            continue

        # Create scatter plot
        fig.add_trace(go.Scatter(
            x=df_pd['fico_range_high'], y=df_pd['int_rate'],
            mode='markers', marker=dict(size=3, opacity=0.4), name=name
        ), row=row, col=col)

        # Add regression line; a line fit needs at least two points.
        if len(df_pd) >= 2:
            trendline = px.scatter(df_pd, x='fico_range_high', y='int_rate', trendline="ols").data[1]
            fig.add_trace(trendline, row=row, col=col)

    fig.update_layout(
        title_text="<b>Exhibit A-1.1: Visual Diagnostic of FICO vs. Interest Rate Linearity</b><br><sup>(Per-Cohort Analysis)</sup>",
        height=800, width=1200, showlegend=False
    )
    fig.update_xaxes(title_text="FICO Score (High Range)")
    fig.update_yaxes(title_text="Interest Rate (%)")
    fig.show()

In [None]:
# --- Execute Step 1 ---
# One scatter panel (with trendline) per entry of cohort_definitions.
create_diagnostic_scatter_plots(cohort_definitions)

In [None]:
# ------------------------------------------------------------------------------
# STEP 2 & 3: CORE ANALYSIS - R-squared and RMSE Calculation
# ------------------------------------------------------------------------------

def run_pricing_analysis_per_cohort(cohorts: dict):
    """Regress interest rate on FICO for each cohort and collect fit metrics.

    For every cohort a single-feature linear regression
    (``int_rate ~ fico_range_high``) is fitted on the rows where both columns
    are present. ``LinearRegression`` and ``mean_squared_error`` are expected
    to be in scope from an earlier cell.

    Args:
        cohorts: Mapping of cohort name -> DataFrame exposing the columns
            'fico_range_high' and 'int_rate' and a ``to_pandas()`` method.

    Returns:
        pandas.DataFrame with one row per analysed cohort and the columns
        'Cohort', 'Sample Size (n)', 'R-squared' and 'RMSE (int_rate %)'.
        Cohorts with fewer than two complete rows are skipped. The columns are
        present even when no cohort qualifies, so callers can sort or plot the
        result without a KeyError.
    """
    print("\nExecuting Steps 2 & 3: Calculating R-squared and RMSE for each cohort...")

    result_columns = ["Cohort", "Sample Size (n)", "R-squared", "RMSE (int_rate %)"]
    results = []
    for name, df_cudf in cohorts.items():
        # Convert only necessary columns to pandas to save memory, and drop nulls
        df_pd = df_cudf[['fico_range_high', 'int_rate']].dropna().to_pandas()

        # A line cannot be fitted through fewer than two points.
        if len(df_pd) < 2:
            print(f"  -> Skipping cohort '{name}' due to insufficient data.")
            continue

        X = df_pd[['fico_range_high']]
        y = df_pd['int_rate']

        # Fit the linear regression model
        model = LinearRegression()
        model.fit(X, y)

        # In-sample goodness of fit: R-squared and RMSE in interest-rate points.
        r_squared = model.score(X, y)
        predictions = model.predict(X)
        rmse = np.sqrt(mean_squared_error(y, predictions))

        results.append({
            "Cohort": name,
            "Sample Size (n)": len(df_pd),
            "R-squared": r_squared,
            "RMSE (int_rate %)": rmse
        })
        print(f"  -> Analysis complete for cohort: {name}")

    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=result_columns)

In [None]:
# --- Execute Steps 2 & 3 ---
# Fit the per-cohort regressions, then show the cohorts ranked best fit first.
analysis_results_df = run_pricing_analysis_per_cohort(cohort_definitions)

print("\n--- Forensic Summary Table ---")
ranked_results_df = analysis_results_df.sort_values(by="R-squared", ascending=False)
display(ranked_results_df.reset_index(drop=True))

In [None]:
# ------------------------------------------------------------------------------
# STEP 4: FINAL VISUALIZATION - R-squared Comparison
# ------------------------------------------------------------------------------

def create_r_squared_bar_chart(results_df: pd.DataFrame):
    """Draw the summary bar chart of per-cohort R-squared values.

    Args:
        results_df: Frame with at least the columns 'Cohort' and 'R-squared'.

    Returns:
        None. Bars are ordered from highest to lowest R-squared, labelled with
        the value to three decimals, and the figure is shown immediately.
    """
    print("\nExecuting Step 4: Generating final R-squared comparison chart...")

    ranked_df = results_df.sort_values(by="R-squared", ascending=False)

    def _three_decimals(value):
        return f'{value:.3f}'

    bar_labels = ranked_df['R-squared'].apply(_three_decimals)

    fig = px.bar(
        ranked_df,
        x='Cohort',
        y='R-squared',
        text=bar_labels,
        color='R-squared',
        color_continuous_scale='Blues',
        labels={'R-squared': 'R-squared (FICO vs. Interest Rate)'},
    )

    # R-squared is shown on a fixed 0-1 axis so charts are comparable.
    fig.update_layout(
        title_text="<b>Exhibit A-1.2: FICO's Explanatory Power on Interest Rate Pricing</b><br><sup>(A Higher R-squared Implies a More FICO-driven Pricing Model)</sup>",
        yaxis_range=[0, 1],
    )
    fig.update_traces(textposition='outside')
    fig.show()

In [None]:
# --- Execute Step 4 ---
# Plot the R-squared ranking from the per-cohort results computed in the
# Steps 2 & 3 cell (`analysis_results_df`).
create_r_squared_bar_chart(analysis_results_df)

# 1A DONE


---
---



---
---
# 2A


In [None]:
# ==============================================================================
# EXHIBIT A-2: VINTAGE ANALYSIS FORENSICS
# Lending Club Operational Forensics Project
# ==============================================================================
# Objective: Investigate the impact of the 2012 "Great Data Enrichment" by
# comparing the underwriting inputs and performance outcomes of two distinct
# loan vintages from the "Core Individual Cohort".
#
# Key Assumption: Date of Default = Last Payment Date + 1 month
# ==============================================================================

# --- Import necessary libraries ---
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Display settings: show every column and print floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Section banner (five lines, the last one blank).
print(
    "=" * 80,
    "EXHIBIT A-2: VINTAGE ANALYSIS FORENSICS",
    "Assessing the Impact of the 2012 'Great Data Enrichment'",
    "=" * 80,
    "",
    sep="\n",
)

In [None]:
# ==============================================================================
# STEP 1: CONSTRUCT THE "CORE INDIVIDUAL COHORT"
# ==============================================================================
print("STEP 1: Constructing the 'Core Individual Cohort'")
print("-" * 60)

# Individual-loan sections taken from `data_sections` (joint/hardship/settlement
# sections are excluded):
#   4     -> Individual (Enriched Data)
#   5     -> Individual (Bankcard Data)
#   6     -> Individual (Legacy Data)
#   7-10  -> additional individual sections
# `df_remaining` (remaining unclassified individual loans) is appended last.
core_individual_cohort_pd = pd.concat(
    [data_sections[section_idx]['section_df'].to_pandas() for section_idx in range(4, 11)]
    + [df_remaining.to_pandas()],
    ignore_index=True,
)

In [None]:
# Parse the 'Mon-YYYY' date strings right after concatenation; values that do
# not match the format become NaT (errors='coerce').
print("Converting date columns...")
for date_col in ('issue_d', 'last_pymnt_d'):
    core_individual_cohort_pd[date_col] = pd.to_datetime(
        core_individual_cohort_pd[date_col], format='%b-%Y', errors='coerce'
    )

print("✓ Core Individual Cohort constructed")
print(f"  • Total samples: {len(core_individual_cohort_pd):,}")
# Ignore NaT entries when reporting the issue-date range.
valid_issue_dates = core_individual_cohort_pd['issue_d'].dropna()
if valid_issue_dates.empty:
    print("  • Date range: No valid issue dates found after conversion.")
else:
    first_issue, last_issue = valid_issue_dates.min(), valid_issue_dates.max()
    print(f"  • Date range: {first_issue.strftime('%b-%Y')} to {last_issue.strftime('%b-%Y')}")

print()

In [None]:
# ==============================================================================
# STEP 2: DATA PREPARATION & VINTAGE DEFINITION
# ==============================================================================
print("STEP 2: Data Preparation & Vintage Definition")
print("-" * 60)

# Ensure both date columns are datetime ('Mon-YYYY' strings -> Timestamp,
# unparseable values -> NaT).
print("Converting date columns...")
for date_col in ('issue_d', 'last_pymnt_d'):
    core_individual_cohort_pd[date_col] = pd.to_datetime(
        core_individual_cohort_pd[date_col], format='%b-%Y', errors='coerce'
    )

# Data snapshot date (end of dataset)
snapshot_date = pd.Timestamp('2018-12-31')

# Loan statuses counted as a default in the vintage analysis
default_statuses = ['Charged Off', 'Default']

In [None]:
# ==============================================================================
# CALCULATE DEFAULT TIMING WITH OUR ASSUMPTION
# ==============================================================================
# Adds three columns to `core_individual_cohort_pd`:
#   default_date       - last_pymnt_d + 1 month        (defaulted rows only)
#   months_to_default  - rounded months issue -> default (defaulted rows only)
#   observation_months - rounded months issue -> snapshot (all other rows)
# Rows outside the respective mask are not assigned in this cell.
print("\nCalculating default timing (Assumption: Default = Last Payment + 1 month)...")

# Rows whose loan_status is one of `default_statuses`
defaulted_mask = core_individual_cohort_pd['loan_status'].isin(default_statuses)

# For defaulted loans: default_date = last_payment_date + 1 month
# NOTE(review): a defaulted loan with a missing (NaT) last_pymnt_d gets a NaT
# default_date and a NaN months_to_default, so later `months_to_default <= N`
# checks evaluate False for it - confirm this exclusion is intended.
core_individual_cohort_pd.loc[defaulted_mask, 'default_date'] = (
    core_individual_cohort_pd.loc[defaulted_mask, 'last_pymnt_d'] + pd.DateOffset(months=1)
)

# Calculate months to default for defaulted loans
# (day difference divided by 30.44 days per month, then rounded)
core_individual_cohort_pd.loc[defaulted_mask, 'months_to_default'] = (
    (core_individual_cohort_pd.loc[defaulted_mask, 'default_date'] -
     core_individual_cohort_pd.loc[defaulted_mask, 'issue_d']).dt.days / 30.44
).round()

# For non-defaulted loans, calculate observation period
# (months elapsed between issue date and `snapshot_date`, same 30.44 divisor)
core_individual_cohort_pd.loc[~defaulted_mask, 'observation_months'] = (
    (snapshot_date - core_individual_cohort_pd.loc[~defaulted_mask, 'issue_d']).dt.days / 30.44
).round()

In [None]:
# ==============================================================================
# FILTER FOR 36-MONTH OBSERVATION WINDOW
# ==============================================================================
print("\nFiltering for 36-month observation window...")

# A loan needs 36 months of history by the end of 2018, i.e. it must have been
# issued before 2016-01-01. Rows with a NaT issue date fail the comparison and
# are dropped as well.
cutoff_date = pd.Timestamp('2016-01-01')
issued_before_cutoff = core_individual_cohort_pd['issue_d'].lt(cutoff_date)
analysis_sample_df = core_individual_cohort_pd.loc[issued_before_cutoff].copy()

print("✓ Filtered to loans with 36+ month observation period")
print(f"  • Samples after filtering: {len(analysis_sample_df):,}")
print(f"  • Excluded recent loans: {len(core_individual_cohort_pd) - len(analysis_sample_df):,}")

In [None]:
# ==============================================================================
# DEFINE VINTAGES (CORRECTED ERA SPLIT)
# ==============================================================================
print("\nDefining vintages based on temporal analysis...")

# Era split: issue year <= 2011 is the crisis vintage, everything later is the
# expansion vintage (which ends in 2015 because of the 36-month filter).
issued_through_2011 = analysis_sample_df['issue_d'].dt.year.le(2011)
analysis_sample_df['era'] = np.where(
    issued_through_2011,
    'Crisis-Era (2007-2011)',
    'Expansion Era (2012-2015)'
)

# 1 when the loan has a default status AND defaulted within 36 months, else 0.
has_default_status = analysis_sample_df['loan_status'].isin(default_statuses)
defaulted_by_month_36 = analysis_sample_df['months_to_default'].le(36)
analysis_sample_df['defaulted_within_36m'] = (has_default_status & defaulted_by_month_36).astype(int)

# Report how many loans fall into each vintage.
vintage_dist = analysis_sample_df['era'].value_counts()
print("\n✓ Vintages defined:")
for era, count in vintage_dist.items():
    print(f"  • {era}: {count:,} loans")
print()

In [None]:
# ==============================================================================
# STEP 3: COMPARATIVE ANALYSIS & STATISTICAL TESTING
# ==============================================================================
print("STEP 3: Comparative Analysis & Statistical Testing")
print("-" * 60)

# One summary record per vintage, in the order listed in `eras`.
vintage_results = []
eras = ['Crisis-Era (2007-2011)', 'Expansion Era (2012-2015)']

for era in eras:
    era_df = analysis_sample_df.loc[analysis_sample_df['era'].eq(era)]
    fico_scores = era_df['fico_range_high']
    dti_values = era_df['dti']
    loan_count = len(era_df)
    default_count = era_df['defaulted_within_36m'].sum()

    results = {
        'Era': era,
        'Sample Size': loan_count,

        # FICO score distribution
        'Avg FICO Score': fico_scores.mean(),
        'FICO Std Dev': fico_scores.std(),
        'FICO 25th Pct': fico_scores.quantile(0.25),
        'FICO Median': fico_scores.quantile(0.50),
        'FICO 75th Pct': fico_scores.quantile(0.75),

        # Debt-to-income distribution
        'Avg DTI': dti_values.mean(),
        'DTI Std Dev': dti_values.std(),
        'DTI Median': dti_values.quantile(0.50),

        # Pricing and 36-month performance
        'Avg Interest Rate': era_df['int_rate'].mean(),
        '36m Default Rate (%)': (default_count / loan_count) * 100,
        'Total Defaults': default_count,
    }
    vintage_results.append(results)

# Metrics as rows, eras as columns.
summary_df = pd.DataFrame(vintage_results).set_index('Era').T

In [None]:
# ==============================================================================
# STATISTICAL SIGNIFICANCE TESTING
# ==============================================================================
print("\nPerforming statistical significance tests...")

crisis_df = analysis_sample_df[analysis_sample_df['era'] == eras[0]]
expansion_df = analysis_sample_df[analysis_sample_df['era'] == eras[1]]


def _welch_p_value(column):
    """Two-sided p-value comparing `column` between the two eras (nulls dropped).

    Welch's t-test (equal_var=False) is used because the two vintages are not
    assumed to share a variance and may differ greatly in size; the pooled
    Student test is unreliable in that situation.
    """
    _, p_value = ttest_ind(
        crisis_df[column].dropna(),
        expansion_df[column].dropna(),
        equal_var=False
    )
    return p_value


# T-tests for continuous variables
p_val_fico = _welch_p_value('fico_range_high')
p_val_dti = _welch_p_value('dti')
p_val_int_rate = _welch_p_value('int_rate')

# Chi-square test of independence between era and the 36-month default flag
contingency_table = pd.crosstab(
    analysis_sample_df['era'],
    analysis_sample_df['defaulted_within_36m']
)
_, p_val_default, _, _ = chi2_contingency(contingency_table)

# p-values keyed by the summary-table row they annotate
p_values = {
    'Avg FICO Score': p_val_fico,
    'Avg DTI': p_val_dti,
    'Avg Interest Rate': p_val_int_rate,
    '36m Default Rate (%)': p_val_default
}

In [None]:
# ==============================================================================
# DISPLAY RESULTS TABLE
# ==============================================================================
print("\n" + "=" * 80)
print("FORENSIC SUMMARY TABLE: Vintage Comparison")
print("=" * 80)

crisis_col = 'Crisis-Era (2007-2011)'
expansion_col = 'Expansion Era (2012-2015)'

# Work on a copy so `summary_df` keeps its numeric values for the maths below.
display_df = summary_df.copy()
# Significance markers turn cells into strings. Cast the column to object up
# front instead of relying on pandas silently upcasting a float column when a
# string is assigned (deprecated behaviour, hidden here by the warnings filter).
display_df[expansion_col] = display_df[expansion_col].astype(object)

# Append significance stars to the expansion-era value of each tested metric.
for metric, p_val in p_values.items():
    if metric in display_df.index:
        sig_marker = '***' if p_val < 0.001 else '**' if p_val < 0.01 else '*' if p_val < 0.05 else ''
        if sig_marker:
            display_df.loc[metric, expansion_col] = f"{display_df.loc[metric, expansion_col]:.2f}{sig_marker}"

print(display_df)
print("\n* p<0.05, ** p<0.01, *** p<0.001")
print()

# Print key findings
print("KEY FINDINGS:")
print("-" * 40)


def _era_change(metric):
    """Expansion-era value minus crisis-era value for one summary metric."""
    return summary_df.loc[metric, expansion_col] - summary_df.loc[metric, crisis_col]


fico_diff = _era_change('Avg FICO Score')
dti_diff = _era_change('Avg DTI')
rate_diff = _era_change('Avg Interest Rate')
default_diff = _era_change('36m Default Rate (%)')

print(f"• FICO Score Change: {fico_diff:+.1f} points")
print(f"• DTI Change: {dti_diff:+.1f} percentage points")
print(f"• Interest Rate Change: {rate_diff:+.1f} percentage points")
print(f"• Default Rate Change: {default_diff:+.2f} percentage points")
print()

In [None]:
# ==============================================================================
# STEP 4: VISUALIZATION
# ==============================================================================
print("\n STEP 4: Generating Forensic Visualizations")
print("-" * 60)

# One row per era, one column per plotted metric.
plot_metrics = ['Avg FICO Score', 'Avg DTI', 'Avg Interest Rate', '36m Default Rate (%)']
plot_data = (
    summary_df.loc[plot_metrics]
    .T
    .reset_index()
    .rename(columns={'index': 'Era'})
)

# US Unemployment Rate Data (for macroeconomic context), keyed by year
unemployment_data = {
    2007: 4.6, 2008: 5.8, 2009: 9.3, 2010: 9.6, 2011: 8.9,
    2012: 8.1, 2013: 7.4, 2014: 6.2, 2015: 5.3, 2016: 4.9, 2017: 4.4
}
unemployment_df = pd.DataFrame({
    'Year': list(unemployment_data.keys()),
    'Unemployment Rate (%)': list(unemployment_data.values()),
})

# Two stacked panels: era comparison bars (70%) above the unemployment line (30%).
fig = make_subplots(
    rows=2, cols=1,
    row_heights=[0.7, 0.3],
    vertical_spacing=0.12,
    subplot_titles=[
        "<b>Vintage Performance: Crisis-Era vs. Expansion Era</b>",
        "<b>Macroeconomic Context: U.S. Unemployment Rate</b>",
    ],
)

# Color scheme
colors = {
    'Avg FICO Score': '#1f77b4',
    'Avg DTI': '#ff7f0e',
    'Avg Interest Rate': '#2ca02c',
    '36m Default Rate (%)': '#d62728'
}

# Panel 1: Comparative Bar Chart
x_positions = np.arange(len(plot_metrics))
width = 0.35

for i, era in enumerate(plot_data['Era']):
    # Metric values for this era, in `plot_metrics` order.
    values = plot_data.loc[plot_data['Era'].eq(era), plot_metrics].to_numpy()[0]
    # Two decimals for small values, whole numbers from 100 upwards.
    bar_text = [f'{v:.2f}' if v < 100 else f'{v:.0f}' for v in values]
    bar_color = '#1f77b4' if 'Crisis' in era else '#ff7f0e'

    era_bars = go.Bar(
        name=era,
        x=plot_metrics,
        y=values,
        text=bar_text,
        textposition='outside',
        marker_color=bar_color,
        offsetgroup=i,
    )
    fig.add_trace(era_bars, row=1, col=1)

# Panel 2: Unemployment Rate Timeline
unemployment_trace = go.Scatter(
    x=unemployment_df['Year'],
    y=unemployment_df['Unemployment Rate (%)'],
    mode='lines+markers',
    name='Unemployment Rate',
    line={'color': '#7f7f7f', 'width': 2},
    marker={'size': 8},
    showlegend=False,
)
fig.add_trace(unemployment_trace, row=2, col=1)

# Dashed marker at 2012 (Great Data Enrichment)
fig.add_vline(
    x=2012,
    line_width=2,
    line_dash="dash",
    line_color="red",
    annotation_text="2012 Data Enrichment",
    row=2, col=1,
)

# Shade the two eras: (start, end, fill colour)
era_bands = (
    (2007, 2011.5, "blue"),
    (2011.5, 2017.5, "orange"),
)
for band_start, band_end, band_color in era_bands:
    fig.add_vrect(
        x0=band_start, x1=band_end,
        fillcolor=band_color, opacity=0.1,
        line_width=0,
        row=2, col=1,
    )

# Figure-level layout: title, grouped bars, horizontal legend above the plot.
fig.update_layout(
    height=800,
    title_text="<b>EXHIBIT A-2: FORENSIC ANALYSIS OF UNDERWRITING ERAS</b><br><sup>Impact Assessment of the 2012 'Great Data Enrichment' Initiative</sup>",
    title_font_size=16,
    barmode='group',
    showlegend=True,
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "right",
        'x': 1,
    },
    hovermode='x unified',
)

# Axis titles per panel row: (x title, y title)
axis_titles = {
    1: ("Metrics", "Value"),
    2: ("Year", "Unemployment Rate (%)"),
}
for panel_row, (x_title, y_title) in axis_titles.items():
    fig.update_xaxes(title_text=x_title, row=panel_row, col=1)
    fig.update_yaxes(title_text=y_title, row=panel_row, col=1)

fig.show()

In [None]:
# ==============================================================================
# STEP 5: ADDITIONAL ANALYSIS - DISTRIBUTION COMPARISON
# ==============================================================================
print("\n STEP 5: Distribution Analysis")
print("-" * 60)

# Create distribution comparison plots
fig_dist = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'FICO Score Distribution',
        'DTI Distribution',
        'Interest Rate Distribution',
        'Default Rate by Loan Grade'
    )
)

# One fixed colour per era so an era looks the same in every panel. Without
# this, plotly cycles a new colour for each of the eight traces.
era_colors = dict(zip(eras, ('#1f77b4', '#ff7f0e')))

# Histogram panels: (column, subplot row, subplot col)
histogram_panels = [
    ('fico_range_high', 1, 1),   # FICO Distribution
    ('dti', 1, 2),               # DTI Distribution
    ('int_rate', 2, 1),          # Interest Rate Distribution
]

for panel_idx, (column, panel_row, panel_col) in enumerate(histogram_panels):
    for era in eras:
        era_data = analysis_sample_df.loc[analysis_sample_df['era'] == era, column].dropna()
        fig_dist.add_trace(
            go.Histogram(
                x=era_data,
                name=era.split(' ')[0],  # Shortened name
                legendgroup=era,
                # Only the first panel contributes legend entries, so each era
                # is listed once instead of once per panel.
                showlegend=(panel_idx == 0),
                marker_color=era_colors.get(era),
                opacity=0.6,
                nbinsx=30
            ),
            row=panel_row, col=panel_col
        )

# Default Rate by Grade
grade_defaults = analysis_sample_df.groupby(['era', 'grade'])['defaulted_within_36m'].agg(['mean', 'count'])
grade_defaults['mean'] = grade_defaults['mean'] * 100  # Convert to percentage

eras_with_grades = grade_defaults.index.get_level_values('era')
for era in eras:
    # An era without any rows has no entry in the grouped index.
    if era not in eras_with_grades:
        continue
    era_grades = grade_defaults.loc[era]
    era_grades = era_grades[era_grades['count'] > 100]  # Filter for sufficient sample size
    fig_dist.add_trace(
        go.Bar(
            x=era_grades.index,
            y=era_grades['mean'],
            name=era.split(' ')[0],
            legendgroup=era,
            showlegend=False,
            marker_color=era_colors.get(era),
            text=[f'{v:.1f}%' for v in era_grades['mean']],
            textposition='outside'
        ),
        row=2, col=2
    )

# Update layout
fig_dist.update_layout(
    height=700,
    title_text="<b>Distribution Comparison: Crisis-Era vs. Expansion Era</b>",
    showlegend=True,
    barmode='group'
)

# Update axes labels
fig_dist.update_xaxes(title_text="FICO Score", row=1, col=1)
fig_dist.update_xaxes(title_text="DTI (%)", row=1, col=2)
fig_dist.update_xaxes(title_text="Interest Rate (%)", row=2, col=1)
fig_dist.update_xaxes(title_text="Loan Grade", row=2, col=2)
fig_dist.update_yaxes(title_text="Count", row=1, col=1)
fig_dist.update_yaxes(title_text="Count", row=1, col=2)
fig_dist.update_yaxes(title_text="Count", row=2, col=1)
fig_dist.update_yaxes(title_text="Default Rate (%)", row=2, col=2)

fig_dist.show()

In [None]:
# ==============================================================================
# STEP 6: ROBUSTNESS CHECK - ALTERNATIVE OBSERVATION WINDOWS
# ==============================================================================
print("\n STEP 6: Robustness Check - Alternative Observation Windows")
print("-" * 60)

# Default rate per era for 24-, 36- and 48-month windows.
robustness_results = []

for window in [24, 36, 48]:
    # A loan can only be judged over `window` months if it was issued at least
    # that long before the data snapshot. `analysis_sample_df` guarantees only
    # 36 months, so the 48-month rate would otherwise count loans that could
    # not yet have been observed for the full window as non-defaults.
    latest_issue_date = snapshot_date - pd.DateOffset(months=window)
    temp_df = analysis_sample_df[analysis_sample_df['issue_d'] <= latest_issue_date].copy()

    # Calculate defaults within window
    temp_df[f'defaulted_within_{window}m'] = (
        (temp_df['loan_status'].isin(default_statuses)) &
        (temp_df['months_to_default'] <= window)
    ).astype(int)

    # Calculate default rates by era
    for era in eras:
        era_df = temp_df[temp_df['era'] == era]
        if len(era_df):
            default_rate = (era_df[f'defaulted_within_{window}m'].sum() / len(era_df)) * 100
        else:
            # No loan of this era is old enough for the window.
            default_rate = float('nan')

        robustness_results.append({
            'Window': f'{window} months',
            'Era': era,
            'Default Rate (%)': default_rate,
            'Sample Size': len(era_df)
        })

robustness_long_df = pd.DataFrame(robustness_results)
robustness_df = robustness_long_df.pivot(
    index='Window',
    columns='Era',
    values='Default Rate (%)'
)
# Denominators differ by window now, so report them next to the rates.
robustness_n_df = robustness_long_df.pivot(
    index='Window',
    columns='Era',
    values='Sample Size'
)

print("\nDefault Rates Across Different Observation Windows:")
print(robustness_df)
print("\nLoans observable for the full window (sample sizes):")
print(robustness_n_df)
print()