Imports and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

# Ensure the 'data/processed' directory exists. exist_ok=True means no error is
# raised when it is already there, so this cell can be re-run safely.
# NOTE(review): presumably later cells write their output here - not visible
# in this section.
os.makedirs('data/processed', exist_ok=True)

# Plot formatting: apply seaborn's "whitegrid" theme to subsequent figures and
# use the IPython magic that renders matplotlib figures inline in cell output.
sns.set_theme(style="whitegrid")
%matplotlib inline

Load and Initial Filter

In [None]:
# Read the full raw complaints export. low_memory=False turns off pandas'
# chunked parsing so each column's dtype is inferred from the whole column.
df = pd.read_csv('data/raw/complaints.csv', low_memory=False)

# Raw `Product` label -> short CrediTrust category name.
product_mapping = {
    'Credit card or prepaid card': 'Credit Card',
    'Payday loan, title loan, or personal loan': 'Personal Loan',
    'Checking or savings account': 'Savings Account',
    'Money transfer, virtual currency, or money service': 'Money Transfer'
}

# Keep only rows whose `Product` is one of the mapped labels, and attach the
# short category name as a new `product_category` column on the filtered copy.
is_core_product = df['Product'].isin(list(product_mapping))
df_filtered = df.loc[is_core_product].assign(
    product_category=lambda frame: frame['Product'].map(product_mapping)
)

# Report how many rows survived the product filter.
print(f"Total Rows: {len(df)}")
print(f"Relevant Rows (CrediTrust Products): {len(df_filtered)}")

Narrative Quality Analysis

In [None]:
# Count the filtered complaints that have no free-text narrative.
null_counts = df_filtered['Consumer complaint narrative'].isnull().sum()
print(f"Complaints without text narratives: {null_counts}")

# Keep only complaints that have a narrative. .copy() gives an independent
# frame so the column added below does not touch df_filtered.
df_cleaned = df_filtered.dropna(subset=['Consumer complaint narrative']).copy()

# Word count = number of whitespace-separated tokens in the narrative.
df_cleaned['word_count'] = df_cleaned['Consumer complaint narrative'].apply(lambda x: len(str(x).split()))

# Upper bound (in words) of the plotted range. Narratives longer than this are
# left out of the histogram bars but still count towards the mean printed below.
MAX_WORDS_SHOWN = 1000

# Histogram of narrative length.
# binrange pins the 100 bins to the visible 0..MAX_WORDS_SHOWN window. Without
# it the bin edges span the full data range, so any very long narratives make
# the bars inside the visible window much coarser than intended.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(
    df_cleaned['word_count'],
    bins=100,
    binrange=(0, MAX_WORDS_SHOWN),
    color='blue',
    kde=True,
    ax=ax,
)
ax.set_title('Distribution of Complaint Length (Word Count)')
ax.set_xlabel('Words per narrative')
ax.set_ylabel('Number of complaints')
ax.set_xlim(0, MAX_WORDS_SHOWN)
plt.show()

print(f"Average word count: {df_cleaned['word_count'].mean():.2f}")