In [2]:
# notebooks/01_eda_preprocessing.ipynb

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK 'stopwords' and 'punkt' packages
nltk.download("stopwords")
nltk.download("punkt")

# --- 1. Load the dataset ---
print("Loading dataset...")
# The raw CFPB export is expected at 'data/complaints.csv'
try:
    df = pd.read_csv("data/complaints.csv", encoding="utf-8")
except FileNotFoundError:
    # None is the sentinel the rest of the cell checks before doing any work
    df = None
    print("Please place your raw CFPB dataset as 'complaints.csv' in the 'data/' directory.")

if df is not None:
    print("Dataset loaded successfully.")
    print("\nInitial Data Info:")
    df.info()

    # --- 2. Initial EDA ---
    print("\n--- Initial EDA ---")

    # Analyze complaint distribution by product
    product_counts = df['Product'].value_counts()
    print("\nTop 10 Products by Complaint Count:")
    print(product_counts.head(10))

    plt.figure(figsize=(10, 6))
    product_counts.head(10).plot(kind='bar')
    plt.title('Distribution of Complaints by Product (Top 10)')
    plt.xlabel('Product')
    plt.ylabel('Number of Complaints')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Analyze narrative length.
    # Missing narratives are NaN; fill them with '' first so they count as
    # 0 words. A bare astype(str) turns NaN into the string 'nan', which
    # would report one word for every complaint that has no narrative.
    df['narrative_word_count'] = (
        df['Consumer complaint narrative']
        .fillna('')
        .astype(str)
        .apply(lambda narrative: len(narrative.split()))
    )
    print("\nNarrative Word Count Statistics:")
    print(df['narrative_word_count'].describe())

    plt.figure(figsize=(10, 6))
    sns.histplot(df['narrative_word_count'], bins=50, kde=True)
    plt.title('Distribution of Narrative Word Count')
    plt.xlabel('Word Count')
    plt.ylabel('Frequency')
    plt.show()

    # Identify complaints with and without narratives
    narrative_present = df['Consumer complaint narrative'].notna().sum()
    narrative_missing = df['Consumer complaint narrative'].isna().sum()
    print(f"\nComplaints with narratives: {narrative_present}")
    print(f"Complaints without narratives: {narrative_missing}")

    # --- 3. Data Filtering ---
    print("\n--- Filtering Data ---")

    # Define the list of target products.
    # NOTE(review): isin() below matches these labels exactly (case and
    # punctuation included). Confirm they agree with the 'Product' values
    # printed in the top-10 listing above; a mismatch silently yields 0 rows.
    target_products = [
        'Credit card',
        'Personal loan',
        'Buy Now, Pay Later (BNPL)',
        'Savings account',
        'Money transfers'
    ]

    # Filter for target products (copy so later column assignments do not
    # write into a view of df)
    filtered_df = df[df['Product'].isin(target_products)].copy()
    print(f"Dataset filtered to {len(filtered_df)} records for target products.")

    # Remove records with empty (NaN) narratives
    filtered_df = filtered_df.dropna(subset=['Consumer complaint narrative']).copy()
    print(f"Dataset after removing empty narratives: {len(filtered_df)} records.")

    # --- 4. Text Cleaning ---
    print("\n--- Cleaning Text Narratives ---")

    # A set gives O(1) membership tests during stopword removal
    stop_words = set(stopwords.words('english'))

    def clean_text(text, stopword_set=None):
        """Normalize one complaint narrative for downstream text processing.

        The text is lowercased, every run of non-letter characters (digits,
        punctuation, whitespace) is collapsed to a single space, the
        boilerplate opening phrase is removed, and stopwords are dropped.

        Args:
            text: Raw narrative. Any value that is not a ``str`` (for
                example NaN) yields an empty string.
            stopword_set: Optional collection of lowercase words to drop.
                When omitted, the ``stop_words`` set defined alongside this
                function is used.

        Returns:
            The remaining tokens joined by single spaces; may be ``""``.
        """
        if not isinstance(text, str):
            return ""
        if stopword_set is None:
            stopword_set = stop_words

        text = text.lower()
        # Replace non-letters with a space instead of deleting them, so that
        # "card/loan" becomes "card loan" (not "cardloan") and contractions
        # split into pieces ("don't" -> "don t") that a stopword list can
        # match. Whitespace runs collapse to one space in the same pass.
        text = re.sub(r'[^a-z]+', ' ', text)
        # Remove common boilerplate phrases (example). Runs after the
        # whitespace collapse so line breaks or double spaces inside the
        # phrase do not prevent the match; \b keeps it to whole words.
        text = re.sub(r'\bi am writing to file a complaint\b', ' ', text)
        # Tokenize and remove stopwords
        kept_tokens = [word for word in text.split() if word not in stopword_set]
        return ' '.join(kept_tokens)

    # Clean every narrative; the raw text column is kept next to the result
    filtered_df['cleaned_narrative'] = filtered_df['Consumer complaint narrative'].apply(clean_text)

    # Drop rows whose narrative cleaned down to nothing (e.g. only digits,
    # punctuation or stopwords). An empty string is written to CSV as an
    # empty field, which pandas.read_csv loads back as NaN by default.
    filtered_df = filtered_df[filtered_df['cleaned_narrative'] != ''].copy()
    print(f"Dataset after removing narratives left empty by cleaning: {len(filtered_df)} records.")

    # --- 5. Save the cleaned dataset ---
    output_path = 'data/filtered_complaints.csv'
    # index=False: the row index carries no information worth persisting
    filtered_df.to_csv(output_path, index=False)
    print(f"\nCleaned and filtered dataset saved to {output_path}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eep\AppData\Roaming\nltk_data...


Loading dataset...
Please place your raw CFPB dataset as 'complaints.csv' in the 'data/' directory.


[nltk_data]   Package punkt is already up-to-date!
