In [1]:
import langchain

# Record which LangChain release this notebook was run against.
langchain_version = langchain.__version__
print(langchain_version)


0.3.26


In [4]:
import pandas as pd

# Load the complaints CSV and show its first rows as a quick sanity check.
complaints_csv = '../data/filtered_complaints.csv'
df = pd.read_csv(complaints_csv)

preview = df.head()
print(preview)


  Date received      Product                                 Sub-product  \
0    2025-06-13  Credit card                           Store credit card   
1    2025-06-12  Credit card  General-purpose credit card or charge card   
2    2025-06-12  Credit card  General-purpose credit card or charge card   
3    2025-06-09  Credit card  General-purpose credit card or charge card   
4    2025-06-09  Credit card  General-purpose credit card or charge card   

                                             Issue  \
0                            Getting a credit card   
1               Other features, terms, or problems   
2             Incorrect information on your report   
3  Problem with a purchase shown on your statement   
4                     Problem when making payments   

                                           Sub-issue  \
0        Card opened without my consent or knowledge   
1                                      Other problem   
2                      Account information incorre

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration.
# NOTE: RecursiveCharacterTextSplitter measures length with len() by default,
# so both values below are counted in CHARACTERS, not words.
chunk_size = 200  # maximum characters per chunk
chunk_overlap = 50  # characters shared by neighbouring chunks to keep context

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# One record per chunk, carrying the metadata needed to trace a chunk back to
# the complaint it came from.
chunks = []

# Walk only the three columns that are needed; this avoids the per-row Series
# construction that df.iterrows() performs.
for doc, complaint_id, product in zip(
    df['Consumer complaint narrative'],
    df['Complaint ID'],
    df['Product'],
):
    # pandas loads empty CSV cells as NaN (a float); split_text only accepts
    # strings, so rows without a narrative are skipped instead of crashing.
    if not isinstance(doc, str):
        continue

    for chunk in text_splitter.split_text(doc):
        chunks.append({
            'complaint_id': complaint_id,
            'product': product,
            'text': chunk
        })

# Convert to DataFrame for easy handling. Naming the columns explicitly keeps
# the schema stable even when no chunks were produced.
chunks_df = pd.DataFrame(chunks, columns=['complaint_id', 'product', 'text'])
print(chunks_df.head())


   complaint_id      product  \
0      14069121  Credit card   
1      14069121  Credit card   
2      14069121  Credit card   
3      14047085  Credit card   
4      14047085  Credit card   

                                                text  
0  A XXXX XXXX card was opened under my name by a...  
1  out to XXXX XXXX to state that this activity w...  
2  However, they have failed to remove this from ...  
3  Dear CFPB, I have a secured credit card with c...  
4  secured amount {$2500.00} for rhat credit card...  


In [7]:
from sentence_transformers import SentenceTransformer

# Embedding model choice: a popular small model, picked because it is
# lightweight, fast, and strong for short texts.
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [8]:
# Embed every chunk's text with the sentence-transformers model loaded above.
texts = chunks_df['text'].tolist()
embeddings = model.encode(texts, show_progress_bar=True)

# Add to DataFrame: one vector per row.
# The rows are stored as the array's own row vectors rather than via
# embeddings.tolist(): converting to nested Python lists boxes every value as
# a Python float, which costs far more memory and makes a later np.vstack
# produce a float64 matrix instead of keeping the encoder's dtype.
chunks_df['embedding'] = list(embeddings)


Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 20185/20185 [1:21:35<00:00,  4.12it/s]


In [9]:
import faiss
import numpy as np
import os

# Make sure the output directory exists; faiss.write_index and to_csv do not
# create missing parent directories.
os.makedirs('../vector_store', exist_ok=True)

# Convert the per-row embeddings to a single (n_chunks, dimension) matrix.
# FAISS operates on C-contiguous float32 data, so the dtype is fixed here
# rather than relying on whatever the DataFrame column happens to hold.
embedding_matrix = np.ascontiguousarray(
    np.asarray(chunks_df['embedding'].tolist(), dtype=np.float32)
)
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
    raise ValueError(
        "Expected a non-empty 2-D embedding matrix, got shape "
        f"{embedding_matrix.shape}"
    )

# Build FAISS index (exact L2 search over all vectors)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Save FAISS index + metadata
faiss.write_index(index, '../vector_store/complaints_index.faiss')

# Save metadata for lookup. Rows are written in the same order the vectors
# were added, so row i of the CSV describes vector i of the index.
chunks_df[['complaint_id', 'product', 'text']].to_csv('../vector_store/chunks_metadata.csv', index=False)
