In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# --- Configuration ---
# Put the parent of the current working directory on the import path so the
# `src` package imported below resolves. The entry is only appended when it is
# not already present, so re-running this cell does not add duplicates.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.complaint_analyst.data_processing import load_complaints, preprocess_data

# --- Constants ---
# Notebook-relative file locations: the raw complaints CSV that is loaded, and
# the filtered/cleaned CSV that the last cell writes.
RAW_DATA_PATH = '../data/raw/complaints.csv'
PROCESSED_DATA_PATH = '../data/processed/filtered_complaints.csv'

# Raw `Product` labels of interest, exactly as they are spelled in the data.
# NOTE(review): the cells visible here filter on standardized names instead of
# this list -- confirm whether it is still needed.
PRODUCTS_TO_INCLUDE = [
    'Credit card or prepaid card',
    'Personal loan',
    # Collapsed into the standardized 'Personal Loan' category later on.
    'Payday loan, title loan, or personal loan',
    'Checking or savings account',
    'Money transfer, virtual currency, or money service',
]

# --- Plotting Style ---
# Apply seaborn's "whitegrid" theme first, then set the default figure size
# used by any plot that does not pass its own `figsize`.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Load the raw complaints file through the project helper and print a
# column/dtype/non-null summary of the result.
# NOTE(review): assumes load_complaints returns a pandas DataFrame -- confirm
# against src.complaint_analyst.data_processing.
df_raw = load_complaints(RAW_DATA_PATH)
df_raw.info()

In [None]:
# Complaint volume for the 20 most frequent `Product` values.
print("Top 20 products by complaint volume:")
product_counts = df_raw['Product'].value_counts().nlargest(20)
print(product_counts)

# Horizontal bar chart of the same counts, one bar per product.
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` -- confirm against the seaborn version this project pins.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=product_counts.values, y=product_counts.index, palette='viridis', ax=ax)
ax.set(
    title='Top 20 Products by Complaint Volume',
    xlabel='Number of Complaints',
    ylabel='Product',
)
plt.show()

In [None]:
# Count complaints with (True) and without (False) a free-text narrative.
narrative_counts = df_raw['Consumer complaint narrative'].notna().value_counts()
print("Complaints with/without narratives:")
print(narrative_counts)

# value_counts() orders its result by frequency, not by value, so the position
# of True/False in the index depends on the data. Derive each wedge label from
# the index entry it belongs to instead of assuming True comes first; this also
# keeps the label count correct when only one of the two classes is present.
narrative_labels = [
    'With Narrative' if has_narrative else 'Without Narrative'
    for has_narrative in narrative_counts.index
]

narrative_counts.plot(kind='pie', labels=narrative_labels, autopct='%1.1f%%',
                      title='Proportion of Complaints With Narratives', legend=False)
plt.ylabel('') # Hide the y-label
plt.show()

# Keep only complaints that have a narrative, then measure each narrative's
# length as the number of whitespace-separated tokens.
has_narrative_mask = df_raw['Consumer complaint narrative'].notna()
df_with_narratives = df_raw[has_narrative_mask]
narrative_tokens = df_with_narratives['Consumer complaint narrative'].str.split()
narrative_lengths = narrative_tokens.str.len()

# Histogram (with KDE overlay) of the word counts; the x-axis is clipped to
# 0-1000 words so the bulk of the distribution is readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(narrative_lengths, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Complaint Narrative Length (Word Count)')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
ax.set_xlim(0, 1000)
plt.show()

# Summary statistics over the full, unclipped word-count series.
print("\nDescriptive statistics for narrative length:")
print(narrative_lengths.describe())

In [None]:
# Standardized category -> raw `Product` labels that collapse into it.
standardized_products = {
    'Credit Card': [
        'Credit card or prepaid card',
        'Credit card',
        'Prepaid card',
    ],
    'Personal Loan': [
        'Personal loan',
        'Payday loan, title loan, or personal loan',
        'Payday loan',
    ],
    'Savings Account': [
        'Checking or savings account',
    ],
    'Money Transfers': [
        'Money transfer, virtual currency, or money service',
        'Virtual currency',
    ],
}

# Flatten to the raw-label -> standardized-label lookup used by `.map`.
product_mapping = {
    raw_label: standard_label
    for standard_label, raw_labels in standardized_products.items()
    for raw_label in raw_labels
}

# Work on a copy so `df_raw` keeps its original `Product` values. Labels with
# no entry in the mapping come back as NaN from `.map` and are restored to
# their original value by `.fillna`.
df_to_process = df_raw.copy()
original_products = df_to_process['Product']
df_to_process['Product'] = original_products.map(product_mapping).fillna(original_products)

# Standardized product names handed to preprocess_data for filtering.
PRODUCTS_TO_INCLUDE_STANDARDIZED = list(standardized_products)


# Run the project preprocessing step on the standardized frame.
# NOTE(review): presumably this filters to the listed products and adds the
# cleaned-narrative columns used below -- confirm against
# src.complaint_analyst.data_processing.preprocess_data.
df_processed = preprocess_data(df_to_process, PRODUCTS_TO_INCLUDE_STANDARDIZED)

print("\nProcessed DataFrame columns and head:")
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the summary.
df_processed.info()
print(df_processed.head())

In [None]:
# Show one full, untruncated raw/cleaned narrative pair. The column-width limit
# is lifted only inside this block so the display option does not leak into
# the rest of the kernel session.
# NOTE(review): assumes preprocess_data produced the 'narrative' and
# 'narrative_cleaned' columns -- confirm.
print("Example of a raw vs. cleaned narrative:")
with pd.option_context('display.max_colwidth', None):
    print(df_processed[['narrative', 'narrative_cleaned']].head(1))

In [None]:
# Persist the processed frame as CSV (without the index column), creating the
# destination directory first if it does not exist yet.
print(f"Saving cleaned data to {PROCESSED_DATA_PATH}...")
output_dir = os.path.dirname(PROCESSED_DATA_PATH)
os.makedirs(output_dir, exist_ok=True)
df_processed.to_csv(PROCESSED_DATA_PATH, index=False)
print("Done.")