In [1]:
import json
import chromadb
from sentence_transformers import SentenceTransformer
import os
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load Chunked Data

First, let's load the chunks created by the ETL process.

In [2]:
# Load the chunks from the JSONL file
chunks_file = "../chunks.jsonl"


def load_chunks(path):
    """Read a JSONL file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example an empty line at the end of the file)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Path of the JSONL file to read (opened as UTF-8 text).

    Returns:
        A list holding one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank / trailing empty lines
                records.append(json.loads(line))
    return records


# Stays empty when the file is missing; the message below tells the reader
# what to run first.
chunks = []

if os.path.exists(chunks_file):
    chunks = load_chunks(chunks_file)
    print(f"Loaded {len(chunks)} chunks from {chunks_file}")
else:
    print(f"Chunks file not found: {chunks_file}")
    print("Please run the ETL notebook first to create the chunks.")

Loaded 18 chunks from ../chunks.jsonl


## Initialize Embedding Model

We'll use the BAAI/bge-base-en-v1.5 model for creating embeddings.

In [3]:
# Build the sentence-transformers model that turns text into embedding vectors.
# The model identifier is kept in one place so it is easy to spot and change.
model_id = 'BAAI/bge-base-en-v1.5'

print("Loading embedding model (this may take a few minutes on first run)...")
embedding_model = SentenceTransformer(model_id)
print(" Embedding model loaded successfully")

Loading embedding model (this may take a few minutes on first run)...


 Embedding model loaded successfully


## Setup ChromaDB

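Initialize a persistent ChromaDB client and create a collection for our Shakespeare chunks. Note that if the collection already exists and contains data, it is deleted and rebuilt from scratch.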

In [4]:
# Initialize ChromaDB client (data is persisted on disk under db_path)
db_path = "../chroma_db"
client = chromadb.PersistentClient(path=db_path)

# Create or get collection
collection_name = "shakespeare_collection"

# Only the lookup is wrapped in try/except, so that errors raised while
# counting or deleting are not misreported as "collection not found".
# `Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
# SystemExit propagate. A narrower type is not used because the exception
# raised for a missing collection differs between chromadb releases.
try:
    existing_collection = client.get_collection(name=collection_name)
except Exception:
    existing_collection = None

if existing_collection is None:
    print(f"Collection {collection_name} not found, creating new one...")
else:
    print(f"Found existing collection: {collection_name}")

    # Check if collection has data
    count = existing_collection.count()
    print(f"Collection currently has {count} documents")

    if count > 0:
        print("Collection already contains data. Deleting and recreating...")
    else:
        print("Collection is empty. Deleting and recreating...")

    # Always drop the existing collection, even when it is empty: otherwise
    # create_collection() below fails because the name is already taken.
    client.delete_collection(name=collection_name)

# Create fresh collection
collection = client.create_collection(
    name=collection_name,
    metadata={"description": "Shakespeare Julius Caesar text chunks with embeddings"}
)
print(f" Created collection: {collection_name}")

Found existing collection: shakespeare_collection
Collection currently has 18 documents
Collection already contains data. Deleting and recreating...
 Created collection: shakespeare_collection


## Create Embeddings and Index

Now we'll create embeddings for each chunk and add them to ChromaDB.

In [5]:
# Build the parallel lists (documents, metadatas, ids) that are later handed
# to the vector store. Each chunk is expected to provide the keys 'text',
# 'act' and 'scene'.
batch_size = 10  # NOTE(review): not used anywhere in this cell
documents = []
metadatas = []
ids = []
seen_ids = {}  # base ID -> how many chunks have produced that ID so far

print("Creating embeddings and indexing chunks...")

for chunk in tqdm(chunks, desc="Processing chunks"):
    # Extract text for embedding
    text = chunk['text']

    # Create metadata
    metadata = {
        'act': chunk['act'],
        'scene': chunk['scene'],
        'source': f"Act {chunk['act']}, Scene {chunk['scene']}"
    }

    # Create unique ID. The base ID is derived from act and scene only, so
    # two chunks from the same scene would otherwise share an ID and be
    # rejected as duplicates when added to the collection. The first chunk
    # keeps the plain ID; later ones get a deterministic "_part_N" suffix.
    base_id = f"act_{chunk['act']}_scene_{chunk['scene']}"
    occurrence = seen_ids.get(base_id, 0) + 1
    seen_ids[base_id] = occurrence
    chunk_id = base_id if occurrence == 1 else f"{base_id}_part_{occurrence}"

    documents.append(text)
    metadatas.append(metadata)
    ids.append(chunk_id)

print(f"Prepared {len(documents)} documents for indexing")

Creating embeddings and indexing chunks...



Processing chunks:   0%|          | 0/18 [00:00<?, ?it/s]


Processing chunks: 100%|██████████| 18/18 [00:00<00:00, 114737.80it/s]

Prepared 18 documents for indexing





In [6]:
# Create embeddings for all documents and store them in the collection.

# Fail early with an actionable message when there is nothing to index
# (e.g. the chunks file was missing, so the earlier cells produced empty
# lists) instead of surfacing a less obvious error from the embedding or
# add calls below.
if not documents:
    raise ValueError(
        "No documents to index: `documents` is empty. "
        "Make sure the chunks were loaded and prepared before running this cell."
    )

print("Generating embeddings...")
embeddings = embedding_model.encode(documents, show_progress_bar=True)
print(f"Generated {len(embeddings)} embeddings")

# Add to ChromaDB collection. The embeddings are passed explicitly (converted
# to plain Python lists) together with the matching texts, metadata and IDs.
print("Adding documents to ChromaDB...")
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=embeddings.tolist()
)

print(f" Successfully indexed {len(documents)} chunks in ChromaDB")
print(f"Collection now contains {collection.count()} documents")

Generating embeddings...



Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.21it/s]

Generated 18 embeddings
Adding documents to ChromaDB...
 Successfully indexed 18 chunks in ChromaDB
Collection now contains 18 documents





## Test Retrieval

Let's test our indexed vector store with some sample queries.

In [7]:
# Smoke-test the index: run a handful of sample queries and show the top hits.
test_queries = [
    "What does Caesar say?",
    "Brutus and the conspiracy",
    "Antony's speech",
    "Beware the Ides of March"
]
top_k = 2  # number of hits requested per query

print("Testing retrieval with sample queries...")

for query in test_queries:
    print(f"\n Query: '{query}'")

    # Embed the query with the same model that embedded the documents
    query_vector = embedding_model.encode([query])
    query_embedding = query_vector.tolist()

    # Look up the nearest stored chunks for this embedding
    results = collection.query(query_embeddings=query_embedding, n_results=top_k)

    # A single query embedding is sent, so index 0 selects its hits
    hit_documents = results['documents'][0]
    hit_metadatas = results['metadatas'][0]

    # Print each hit's source label and the first 200 characters of its text
    for rank, (doc, metadata) in enumerate(zip(hit_documents, hit_metadatas), start=1):
        preview = doc[:200]
        print(f"  Result {rank}: {metadata['source']}")
        print(f"  Preview: {preview}...")
        print()

Testing retrieval with sample queries...

 Query: 'What does Caesar say?'
  Result 1: Act 3, Scene 1
  Preview: ACT 3
SCENE 1
Flourish. Enter Caesar, Antony, Lepidus; Brutus, Cassius,
Casca, Decius, Metellus, Trebonius, Cinna; Publius,
Popilius, Artemidorus, the Soothsayer, and other
Senators and Petitioners.
C...

  Result 2: Act 1, Scene 1
  Preview: ACT 1
SCENE 1
Enter Flavius, Marullus, and certain Commoners,
including a Carpenter and a Cobbler, over the stage.
FLAVIUS
Hence! Home, you idle creatures, get you home!
Is this a holiday? What, know ...


 Query: 'Brutus and the conspiracy'
  Result 1: Act 4, Scene 2
  Preview: ACT 4
SCENE 2
Drum. Enter Brutus, Lucilius, Lucius, and the Army.
Titinius and Pindarus meet them.
BRUTUS Stand ho!
LUCILIUS Give the word, ho, and stand!
BRUTUS
What now, Lucilius, is Cassius near?
L...

  Result 2: Act 3, Scene 2
  Preview: ACT 3
SCENE 2
Enter Brutus and Cassius with the Plebeians.
PLEBEIANS
We will be satisfied! Let us be satisfied!
BRUTUS
The

In [8]:
# Wrap-up: report how many documents the collection holds and where it lives.
print(" Indexing Complete!")

indexed_total = collection.count()
print(f" Total documents indexed: {indexed_total}")

print(
    f" Database location: {db_path}",
    f" Collection name: {collection_name}",
    sep="\n",
)

# Blank separator line followed by the closing message
print()
print(" Your vector store is ready for the RAG system!")

 Indexing Complete!
 Total documents indexed: 18
 Database location: ../chroma_db
 Collection name: shakespeare_collection

 Your vector store is ready for the RAG system!
