# KCC QA Embedding Indexer
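This notebook loads question–answer pairs from `queries.csv`, repairs text-encoding glitches with `ftfy`, embeds each combined question + answer with the `all-mpnet-base-v2` SentenceTransformer (MPNet) model, and upserts the vectors into a local ChromaDB collection in batches.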

In [1]:
import pandas as pd
from ftfy import fix_text
from sentence_transformers import SentenceTransformer
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


## Load and Clean Data

In [2]:
# Load the raw query/answer export.
df = pd.read_csv('queries.csv', encoding='utf-8')

# Clean text fields.
# Empty CSV cells are read by pandas as NaN (a float). fix_text only handles
# str, and `str + NaN` yields NaN, which would leave non-string entries in
# `texts`. Normalise missing values to '' and coerce to str before cleaning.
for column in ('QueryText', 'KccAns'):
    df[column] = df[column].fillna('').astype(str).apply(fix_text)

# Combine question + answer into one string per row for embedding.
texts = (df['QueryText'] + " " + df['KccAns']).tolist()

## Generate Embeddings using MPNet

In [3]:
# Checkpoint identifier of the sentence-embedding model, kept in one place so
# it is easy to spot and swap.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

# Instantiate the embedding model from the checkpoint identifier.
print("Loading MPNet embedder...")
embedder = SentenceTransformer(MODEL_NAME)

# Encode every combined question + answer string; the progress bar reports
# per-batch progress while this runs.
print("Computing embeddings...")
embeddings = embedder.encode(
    texts,
    show_progress_bar=True,
)

Loading MPNet embedder...
Computing embeddings...


Batches: 100%|██████████| 292/292 [1:43:05<00:00, 21.18s/it]  


## Store Embeddings in ChromaDB

In [5]:
# Where, and under what name, the index is stored.
# NOTE(review): no visible cell defines `collection` or `ids` (execution
# counts jump from 3 to 5), so a Restart & Run All failed here with NameError.
# The storage path, collection name and id scheme below are placeholders --
# confirm them against the existing index before re-running.
CHROMA_PATH = 'chroma_db'
COLLECTION_NAME = 'kcc_qa'

client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# One deterministic string id per row, so re-running this cell overwrites the
# same records (upsert) instead of adding duplicates.
ids = [str(row) for row in range(len(texts))]

# The parallel sequences are sliced with the same offsets below, so they must
# line up row-for-row.
assert len(ids) == len(texts) == len(embeddings) == len(df), \
    "ids, texts, embeddings and df must all have the same length"

# Split the data into batches to avoid hitting the ChromaDB limit
BATCH_SIZE = 5000  # safe value under the 5461 limit

print("Upserting embeddings to ChromaDB in batches...")

for start in range(0, len(embeddings), BATCH_SIZE):
    stop = start + BATCH_SIZE

    # .iloc makes the positional slicing explicit, independent of df's index.
    questions = df['QueryText'].iloc[start:stop]
    answers = df['KccAns'].iloc[start:stop]
    batch_metadata = [
        {'question': q, 'answer': a}
        for q, a in zip(questions, answers)
    ]

    collection.upsert(
        documents=texts[start:stop],
        embeddings=embeddings[start:stop].tolist(),
        ids=ids[start:stop],
        metadatas=batch_metadata
    )

print("✅ All data indexed successfully!")


Upserting embeddings to ChromaDB in batches...
✅ All data indexed successfully!
