In [None]:
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# NOTE(review): `df` is not created in any cell visible above this one --
# confirm that a data-loading step runs before this cell on a fresh kernel.
# Print column dtypes and non-null counts for every column.
df.info(verbose=True, show_counts=True)

# Count nulls per column, largest first, and chart only the columns that
# actually have gaps.
null_counts = df.isna().sum()
missing = null_counts.sort_values(ascending=False)
columns_with_gaps = missing.loc[missing > 0]
columns_with_gaps.plot(kind='barh', title='Missing Values by Column')
plt.show()

In [None]:
# Number of complaints per product category, most frequent first.
product_counts = df['Product'].value_counts()

# Horizontal bar chart of the category counts on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10,6))
product_counts.plot(kind='barh', ax=ax)
ax.set_title('Complaints by Product Category')
ax.set_xlabel('Count')
ax.set_ylabel('Product')
plt.show()

In [None]:
# Product categories retained for the rest of the notebook.
target_products = [
    'Credit card',
    'Personal loan',
    'Payday loan',  # original author's note: often used for BNPL
    'Money transfer',
    'Bank account or service',  # relabeled to 'Savings account' in a later cell
]

# Boolean mask of in-scope rows; copy so later column assignments do not
# touch (or warn about) the source frame.
in_scope = df['Product'].isin(target_products)
filtered = df.loc[in_scope].copy()
print(f"Filtered shape: {filtered.shape}")

In [None]:
# Flag the rows whose free-text narrative is present (i.e. not null).
narrative_column = filtered['Consumer complaint narrative']
has_narrative = ~narrative_column.isna()

# Report both the absolute count and the share of rows with a narrative.
with_text_count = has_narrative.sum()
with_text_share = has_narrative.mean()
print(f"Complaints with narratives: {with_text_count} ({with_text_share:.1%})")

In [None]:
# Drop the rows without a narrative, using the boolean mask computed from
# this same frame; copy so the result is independent of the previous frame.
filtered = filtered.loc[has_narrative].copy()
final_shape = filtered.shape
print(f"Final shape with narratives: {final_shape}")


In [None]:
# Word count per narrative: split on whitespace, then count the tokens.
word_counts = filtered['Consumer complaint narrative'].str.split().str.len()
filtered['narrative_length'] = word_counts

# Histogram of the word counts with a dashed black line at the median.
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(filtered['narrative_length'], bins=50, ax=ax)
median_length = filtered['narrative_length'].median()
ax.axvline(median_length, color='k', linestyle='--')
ax.set_title('Distribution of Complaint Narrative Lengths (Words)')
ax.set_xlabel('Word Count')
plt.show()


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the word counts.
length_summary = filtered['narrative_length'].describe()
length_summary


In [None]:
def clean_complaint_text(text):
    """Normalize a complaint narrative before it is embedded.

    The text is lower-cased, a fixed set of patterns is stripped out, and
    runs of whitespace are collapsed to single spaces.

    Removed, in this order:
      1. Masked dates of the form ``xx/xx/<4 digits>``.
      2. Runs of 10 or more digits.
      3. The words "dear", "sincerely", "regards" or "thank you" together
         with everything up to and including the next ``. , ; : ! ?``.
         Note this also fires when those words occur mid-sentence.
      4. Any character that is not a word character, whitespace, or one of
         ``. , ; : ! ?``.

    Parameters
    ----------
    text : object
        The raw narrative. Anything that is not a ``str`` (e.g. NaN/None)
        is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # All patterns are written in lower case because they are applied AFTER
    # the text has been lower-cased. The date pattern must stay first: the
    # final pattern strips '/', which would otherwise destroy the date shape.
    patterns_to_remove = [
        r'xx/xx/\d{4}',
        r'\d{10,}',
        r'\b(?:dear|sincerely|regards|thank you)\b[^\.,;:!?]*[.,;:!?]',
        r'[^\w\s.,;:!?]',
    ]

    text = text.lower()
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text)

    # Collapse any whitespace left behind by the removals.
    return ' '.join(text.split())

In [None]:
# Clean every narrative. Plain `.apply` is used here: `.progress_apply` is only
# available after `tqdm.pandas()` has been called, and no visible cell imports
# or registers tqdm, so the previous call would fail with AttributeError.
filtered['clean_text'] = filtered['Consumer complaint narrative'].apply(clean_complaint_text)

# Spot-check the first row before/after cleaning (skipped when no rows remain).
if not filtered.empty:
    print("Original:\n", filtered.iloc[0]['Consumer complaint narrative'])
    print("\nCleaned:\n", filtered.iloc[0]['clean_text'])

# Rename the 'Bank account or service' category to 'Savings account'.
# NOTE(review): this relabels EVERY row in that category; nothing here narrows
# it to savings accounts specifically -- confirm this is the intended behavior.
filtered.loc[filtered['Product'] == 'Bank account or service', 'Product'] = 'Savings account'

In [None]:
# Destination for the processed dataset, relative to the notebook's working directory.
output_path = '../data/processed/filtered_complaints.csv'

# `to_csv` does not create missing folders, so make sure the target directory
# exists first (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Write without the index so the CSV contains only the data columns.
filtered.to_csv(output_path, index=False)
print(f"Saved processed data to {output_path}")

# %%
# Final dataset summary: row count, per-product totals, and word-count statistics.
record_total = len(filtered)
per_product = filtered['Product'].value_counts()
length_stats = filtered['narrative_length'].describe()

print("=== Final Dataset Summary ===")
print(f"Records: {record_total}")
print("\nProduct Distribution:")
print(per_product)
print("\nText Length Stats:")
print(length_stats)