In [None]:
!pip install chromadb

In [2]:
import pandas as pd
from pathlib import Path
import chromadb
from sentence_transformers import SentenceTransformer
import torch
import gc
import pickle
from google.colab import drive

In [3]:
# Mount Google Drive
drive.mount('/content/drive')

# Set paths
DATA_DIR = Path("/content/drive/MyDrive/filter")
VECTOR_STORE_DIR = Path("/content/drive/MyDrive/vector_store")
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)
dataset_path = DATA_DIR / "filtered_complaints.csv"
progress_path = VECTOR_STORE_DIR / "progress.pkl"
vector_store_path = VECTOR_STORE_DIR / "chroma_db"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Simple text chunking function (renamed to avoid conflicts)
def chunk_narrative_text(text, chunk_size=500, overlap=100):
    if not text or pd.isna(text):
        return []
    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

# Initialize embedding model with GPU
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')
print(f"Embedding model initialized on cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embedding model initialized on cuda


In [5]:
# Initialize ChromaDB client
client = chromadb.PersistentClient(path=str(vector_store_path))
try:
    client.delete_collection("cfpb_complaints")
except:
    pass
collection = client.create_collection("cfpb_complaints")
print(f"Created ChromaDB collection: cfpb_complaints")

Created ChromaDB collection: cfpb_complaints


In [6]:
# Load progress if exists
if progress_path.exists():
    with open(progress_path, 'rb') as f:
        completed_chunks = pickle.load(f)
else:
    completed_chunks = set()

# Process subset of dataset (50,000 rows)
CHUNK_SIZE = 1000
SUBSET_ROWS = 50000
BATCH_SIZE = 64
total_chunks = 0
chunk_distribution = {}

print(f"Loading subset of dataset from {dataset_path} (first {SUBSET_ROWS} rows)")

Loading subset of dataset from /content/drive/MyDrive/filter/filtered_complaints.csv (first 50000 rows)


In [7]:
try:
    for chunk_idx, chunk in enumerate(pd.read_csv(dataset_path, chunksize=CHUNK_SIZE, nrows=SUBSET_ROWS)):
        if chunk_idx in completed_chunks:
            print(f"⏭️ Skipping chunk {chunk_idx + 1}")
            continue

        chunk = chunk.dropna(subset=['cleaned_narrative'])

        all_documents = []
        all_metadatas = []
        all_ids = []

        for row_idx, row in chunk.iterrows():
            narrative = row['cleaned_narrative']
            chunks = chunk_narrative_text(narrative)
            metadata = {
                'complaint_id': str(row['Complaint ID']),
                'product_category': str(row['ProductCategory']),
                'issue': str(row['Issue'])[:100],
                'company': str(row['Company']),
                'date_received': str(row['Date received']),
                'source': 'cfpb'
            }

            chunk_distribution[metadata['product_category']] = chunk_distribution.get(metadata['product_category'], 0) + len(chunks)

            for i, chunk_text in enumerate(chunks):
                all_documents.append(chunk_text)
                all_metadatas.append(metadata)
                all_ids.append(f"{row['Complaint ID']}_chunk_{i}")

        # Encode all documents in this chunk at once
        if all_documents:
            all_embeddings = model.encode(all_documents, batch_size=BATCH_SIZE, show_progress_bar=True)
            collection.add(
                embeddings=all_embeddings.tolist(),
                documents=all_documents,
                metadatas=all_metadatas,
                ids=all_ids
            )
            total_chunks += len(all_documents)

        print(f"Chunk {chunk_idx + 1}: ✅ Added {total_chunks} chunks total")

        completed_chunks.add(chunk_idx)
        with open(progress_path, 'wb') as f:
            pickle.dump(completed_chunks, f)

        # Clean up
        gc.collect()
        torch.cuda.empty_cache()

except Exception as e:
    print(f"❌ Error at chunk {chunk_idx + 1}: {e}")
    with open(progress_path, 'wb') as f:
        pickle.dump(completed_chunks, f)
    raise

# Final save
with open(VECTOR_STORE_DIR / 'chunk_distribution.csv', 'w') as f:
    for k, v in chunk_distribution.items():
        f.write(f"{k},{v}\n")

print(f"\n✅ Completed: Total chunks created and stored = {total_chunks}")
print("📊 Chunk Distribution by Product Category:", chunk_distribution)

⏭️ Skipping chunk 1
⏭️ Skipping chunk 2
⏭️ Skipping chunk 3
⏭️ Skipping chunk 4
⏭️ Skipping chunk 5
⏭️ Skipping chunk 6
⏭️ Skipping chunk 7
⏭️ Skipping chunk 8
⏭️ Skipping chunk 9
⏭️ Skipping chunk 10
⏭️ Skipping chunk 11
⏭️ Skipping chunk 12
⏭️ Skipping chunk 13
⏭️ Skipping chunk 14
⏭️ Skipping chunk 15
⏭️ Skipping chunk 16
⏭️ Skipping chunk 17
⏭️ Skipping chunk 18
⏭️ Skipping chunk 19
⏭️ Skipping chunk 20
⏭️ Skipping chunk 21
⏭️ Skipping chunk 22
⏭️ Skipping chunk 23
⏭️ Skipping chunk 24


Batches:   0%|          | 0/48 [00:00<?, ?it/s]

Chunk 25: ✅ Added 3014 chunks total


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Chunk 26: ✅ Added 5685 chunks total


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

Chunk 27: ✅ Added 8594 chunks total


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Chunk 28: ✅ Added 11143 chunks total


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Chunk 29: ✅ Added 13811 chunks total


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Chunk 30: ✅ Added 16409 chunks total


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Chunk 31: ✅ Added 18940 chunks total


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Chunk 32: ✅ Added 21296 chunks total


Batches:   0%|          | 0/38 [00:00<?, ?it/s]

Chunk 33: ✅ Added 23713 chunks total


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Chunk 34: ✅ Added 26302 chunks total


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Chunk 35: ✅ Added 28743 chunks total


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

Chunk 36: ✅ Added 31675 chunks total


Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Chunk 37: ✅ Added 34392 chunks total


Batches:   0%|          | 0/49 [00:00<?, ?it/s]

Chunk 38: ✅ Added 37483 chunks total


Batches:   0%|          | 0/48 [00:00<?, ?it/s]

Chunk 39: ✅ Added 40546 chunks total


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Chunk 40: ✅ Added 43213 chunks total


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Chunk 41: ✅ Added 45821 chunks total


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Chunk 42: ✅ Added 48806 chunks total


Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Chunk 43: ✅ Added 51546 chunks total


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Chunk 44: ✅ Added 54219 chunks total


Batches:   0%|          | 0/43 [00:00<?, ?it/s]

Chunk 45: ✅ Added 56936 chunks total


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Chunk 46: ✅ Added 59402 chunks total


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Chunk 47: ✅ Added 61934 chunks total


Batches:   0%|          | 0/38 [00:00<?, ?it/s]

Chunk 48: ✅ Added 64311 chunks total


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Chunk 49: ✅ Added 66657 chunks total


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Chunk 50: ✅ Added 68915 chunks total

✅ Completed: Total chunks created and stored = 68915
📊 Chunk Distribution by Product Category: {'Savings Accounts': 21034, 'Credit Cards': 19257, 'Money Transfers': 25649, 'Personal Loans': 2975}
