In [4]:
import chromadb
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from pathlib import Path
import numpy as np
import faiss
import pickle
import os

In [5]:
# Rebuild the language-aware chunks here (mirroring chunking.ipynb, per the
# original note) so the vector stores below have text to index.
py_file = Path("./docs/gmail_pytest_suite.py")
source = py_file.read_text(encoding="utf-8")

# Python-aware recursive splitter, configured with the same size/overlap values
splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=50,
    chunk_size=500,
    language=Language.PYTHON,
)

# Split the source into chunks; `documents` is the name the vector-store cells
# read and refers to the very same list object as `lan_chunks`.
lan_chunks = splitter.split_text(source)
documents = lan_chunks

# Embedding model shared by the ChromaDB and FAISS cells
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print(f"   chunks (lan_chunks): {len(lan_chunks)}")

  from .autonotebook import tqdm as notebook_tqdm


   chunks (lan_chunks): 150


# ChromaDB Implementation

ChromaDB provides an easy-to-use vector database that integrates seamlessly with LangChain. It is well suited to prototyping and local development.

In [8]:


# Build a fresh, persistent ChromaDB collection holding the language-aware chunks.
# Reads `documents` and `embeddings` from the chunking cell. Defines `vectorstore`
# (queried by the next cell) and `metadatas` (also stored by the FAISS cell).

# Create data directory
os.makedirs("./data/chromadb", exist_ok=True)

# Initialize ChromaDB client with persistent storage
chroma_client = chromadb.PersistentClient(path="./data/chromadb")

# Create or get collection for language-aware chunks
collection_name = "language_chunks"
try:
    # Drop any collection left over from an earlier run so re-running this cell
    # starts from an empty collection instead of colliding with the old one.
    chroma_client.delete_collection(name=collection_name)
except Exception:
    # Best-effort cleanup: on a first run there is nothing to delete.
    # `Exception` (not a bare `except`) lets KeyboardInterrupt / SystemExit
    # propagate, so the cell can still be interrupted. If the delete failed
    # for some other reason and the collection still exists, the
    # create_collection call below is expected to raise and surface it.
    pass

# Create new collection
collection = chroma_client.create_collection(name=collection_name)

# Create LangChain wrapper for seamless integration
vectorstore = Chroma(
    client=chroma_client,
    collection_name=collection_name,
    embedding_function=embeddings,
)

print(f"   Collection: {collection_name}")

# One metadata record per chunk, index-aligned with `documents`
metadatas = [
    {
        "chunk_id": i,
        "chunk_size": len(doc),
        "chunking_method": "language_aware",
        "source": "gmail_pytest_suite.py",
        "chunk_type": "python_code",
    }
    for i, doc in enumerate(documents)
]

# Add documents to the vector store; ids are derived from the chunk position
vectorstore.add_texts(
    texts=documents,
    metadatas=metadatas,
    ids=[f"lang_chunk_{i}" for i in range(len(documents))]
)

print(f"✅ Successfully stored {len(documents)} language-aware chunks in ChromaDB!")


   Collection: language_chunks
✅ Successfully stored 150 language-aware chunks in ChromaDB!
✅ Successfully stored 150 language-aware chunks in ChromaDB!


In [1]:


# Query the ChromaDB store. This cell needs `vectorstore` from the ChromaDB
# setup cell; running it on a fresh kernel before that cell raises NameError.

# Test similarity search
print("\n🔍 Testing ChromaDB similarity search...")
query = "test login with valid credentials"
results = vectorstore.similarity_search(query, k=3)

print(f"Query: '{query}'")
print(f"Found {len(results)} similar documents:")
for i, result in enumerate(results, 1):
    print(f"\n--- ChromaDB Result {i} ---")
    print(f"Content preview: {result.page_content[:150]}...")
    print(f"Metadata: {result.metadata}")

# Same query, this time with the per-hit score. The value returned by
# similarity_search_with_score is a distance, so a LOWER number means a closer
# match; the labels below say so explicitly to avoid reading it as a similarity.
print("\n📊 ChromaDB similarity search with scores...")
results_with_scores = vectorstore.similarity_search_with_score(query, k=3)

print("Results with distance scores (lower = more similar):")
for i, (doc, score) in enumerate(results_with_scores, 1):
    print(f"\n--- Result {i} (Distance: {score:.4f}) ---")
    print(f"Preview: {doc.page_content[:500]}...")
    print(f"Chunk size: {doc.metadata['chunk_size']} chars")


🔍 Testing ChromaDB similarity search...


NameError: name 'vectorstore' is not defined

# FAISS Implementation

FAISS (Facebook AI Similarity Search) provides high-performance vector similarity search. It is an excellent fit for production use cases that require fast retrieval.

In [10]:


# Build a flat L2 FAISS index over the chunk embeddings and persist it next to
# the chunk texts and metadata. Reads `documents`, `metadatas` and `embeddings`
# from earlier cells; defines `index`, which the search cell queries.

# Output directory for the index and its companion pickle
faiss_data_dir = "./data/faiss"
os.makedirs(faiss_data_dir, exist_ok=True)

# Embed every chunk and pack the vectors into a float32 array (one row per chunk)
document_embeddings = embeddings.embed_documents(documents)
embeddings_array = np.array(document_embeddings, dtype='float32')

# The index dimension is taken from the embedding width; vectors are added in
# chunk order so row i of the index corresponds to documents[i]
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array)

print(f"   Total vectors: {index.ntotal}")
print(f"   Index dimension: {dimension}")

# Persist the index itself
faiss.write_index(index, f"{faiss_data_dir}/language_chunks_index.faiss")

# Persist the texts, metadata and raw vectors the index rows map back to
faiss_data = dict(
    documents=documents,
    metadatas=metadatas,
    embeddings_array=embeddings_array,
    chunking_method='language_aware',
)

with open(f"{faiss_data_dir}/language_chunks_data.pkl", "wb") as f:
    pickle.dump(faiss_data, f)


   Generated 150 embeddings
   Embedding dimension: 384
   Total vectors: 150
   Index dimension: 384


In [None]:
# FAISS nearest-neighbour lookup for one query. Reads `embeddings`, `index`,
# `documents` and `metadatas` from earlier cells; the cell's displayed value is
# the list of hit records built below.
query = "test login with valid credentials"

# Embed the query and wrap it as a single-row float32 matrix for index.search
query_embedding = embeddings.embed_query(query)
query_vector = np.asarray([query_embedding], dtype='float32')

# Ask for the 3 closest vectors; row 0 holds the results for our only query
distances, indices = index.search(query_vector, 3)

# One record per hit. Negative indices are filtered out: used as a Python list
# index they would silently wrap around to the end of `documents`.
# "score" maps the distance into (0, 1], reaching 1.0 at distance 0.
faiss_results = [
    {
        "index": int(hit),
        "distance": float(gap),
        "score": 1.0 / (1.0 + float(gap)),
        "preview": documents[hit][:200],
        "chunk_size": metadatas[hit].get("chunk_size"),
    }
    for gap, hit in zip(distances[0], indices[0])
    if hit >= 0
]

faiss_results

