In [None]:
# Load the cleaned data from Task 1
df_cleaned = pd.read_csv('data/processed/filtered_complaints.csv')

# Target number of complaints in the stratified sample (summed over all products)
sample_size = 15000


def stratified_sample(df, by, total, random_state=42):
    """Draw a proportional sample of about `total` rows, stratified on column `by`.

    Each group contributes int(len(group) / len(df) * total) rows, so the
    sample keeps the per-group proportions of `df`. Because of the integer
    truncation the result can hold slightly fewer than `total` rows.

    Parameters:
        df: source DataFrame.
        by: name of the column to stratify on.
        total: desired overall sample size.
        random_state: seed passed to every per-group `sample` call.

    Returns:
        A DataFrame with the sampled rows (original index and all columns kept),
        ordered group by group. Empty if `df` has no rows with a non-null `by`.
    """
    parts = []
    # Iterate the groups explicitly instead of using groupby(...).apply():
    # apply() on a frame that still contains the grouping column is deprecated
    # (pandas 2.2+), and the explicit loop keeps `by` in the result.
    for _, group in df.groupby(by):
        wanted = int(len(group) / len(df) * total)
        # Clamp to the group size: when `total` exceeds len(df) the proportional
        # share is larger than the group and sample() would raise ValueError.
        parts.append(group.sample(n=min(wanted, len(group)), random_state=random_state))

    if not parts:
        # No groups at all (empty frame or only null keys): nothing to sample.
        return df.iloc[0:0].copy()
    return pd.concat(parts)


# Create a stratified sample of (up to) 15,000 complaints
df_sample = stratified_sample(df_cleaned, 'product_category', sample_size)

print("Sample distribution per product:")
print(df_sample['product_category'].value_counts())

Text Chunking Strategy

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Chunks are at most 500 characters long with a 50-character overlap.
# The overlap means text near a chunk boundary appears in both neighbouring
# chunks, so a sentence cut at the boundary keeps some of its context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)

chunks = []
skipped_rows = 0
for _, row in df_sample.iterrows():
    narrative = row['cleaned_narrative']

    # A narrative that was cleaned down to an empty string is read back from
    # the CSV as NaN (a float), which split_text cannot handle. Skip those rows
    # and whitespace-only strings; neither can yield a usable chunk.
    if not isinstance(narrative, str) or not narrative.strip():
        skipped_rows += 1
        continue

    # Split the individual complaint and wrap every piece in a Document whose
    # metadata lets a retrieved chunk be traced back to its complaint.
    chunks.extend(
        Document(
            page_content=piece,
            metadata={
                "complaint_id": row.get('Complaint ID', 'N/A'),
                "product": row['product_category'],
                "issue": row.get('Issue', 'N/A'),
                "chunk_index": position
            }
        )
        for position, piece in enumerate(text_splitter.split_text(narrative))
    )

print(f"Created {len(chunks)} chunks from {len(df_sample)} complaints.")
if skipped_rows:
    print(f"Skipped {skipped_rows} complaints with a missing or empty narrative.")

Embedding and Vector Store (ChromaDB)

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Initialize the embedding model
# all-MiniLM-L6-v2 is fast, lightweight (~80MB), and produces 384-dimensional vectors
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create and persist the vector store
vector_db_path = "vector_store/chroma_db"

# Give every chunk a deterministic id based on its position in `chunks`.
# Without explicit ids each run gets fresh random ids, so re-running this cell
# against the same persist directory would add every chunk a second time and
# retrieval would return duplicates. With stable ids a re-run overwrites the
# same records instead.
# NOTE: if the chunk list shrinks between runs, records with higher positions
# from the earlier run remain; delete the directory for a clean rebuild.
chunk_ids = [f"chunk-{position}" for position in range(len(chunks))]

print("Indexing chunks... this may take 5-10 minutes depending on your CPU.")
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=vector_db_path
)

print(f"Vector store saved to {vector_db_path}")

Test Retrieval

In [None]:
# Smoke test: fetch the three chunks closest to a sample question and show
# which product each came from plus the first 200 characters of its text.
query = "I am having trouble with unauthorized charges on my credit card"
docs = vector_db.similarity_search(query, k=3)

print(f"Query: {query}\n")
for rank, hit in enumerate(docs, start=1):
    product = hit.metadata['product']
    preview = hit.page_content[:200]
    print(f"--- Result {rank} (Product: {product}) ---")
    print(f"{preview}...\n")