# Lab 2: RAG

## We will build and evaluate a Question Answering Expert for a fictional company: Insurellm!

### BEFORE WE BEGIN:

Look at the `knowledge-base` folder - it plays the role of the company's shared drive.

### For those new to RAG:

Does one of the Experts want to give an explanation?

We will be figuring out ways to insert relevant background information into the prompt.

Today will be more intense - please ask me lots of questions and ask for clarification whenever you need it.

In [None]:
import os
import glob
import tiktoken
import numpy as np
from IPython.display import Markdown, display

from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Chat model name handed to ChatOpenAI when the RAG chain is built further down
MODEL: str = "gpt-4.1-nano"
# Directory used as Chroma's persist_directory for the vector store
db_name: str = "vector_db"

In [None]:
# Read every markdown file under knowledge-base/ (at any depth) into a single
# string so we can measure how big the whole knowledge base is.
knowledge_base_path = "knowledge-base/**/*.md"
files = glob.glob(knowledge_base_path, recursive=True)
print(f"Found {len(files)} files in the knowledge base")

# Collect each file's text and join once at the end: repeated `+=` on an
# ever-growing string can degrade to quadratic time.
file_texts = []
for file_path in files:
    with open(file_path, 'r', encoding='utf-8') as f:
        file_texts.append(f.read())

# Every file, including the last one, is followed by a blank-line separator
entire_knowledge_base = "".join(text + "\n\n" for text in file_texts)

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

In [None]:
# Count how many tokens the whole knowledge base takes up, using the encoding
# that tiktoken associates with gpt-4.1-mini.
# NOTE(review): MODEL is set to "gpt-4.1-nano" earlier while this cell hard-codes
# "gpt-4.1-mini" - confirm whether the difference is intentional.
encoding = tiktoken.encoding_for_model("gpt-4.1-mini")
tokens = encoding.encode(entire_knowledge_base)
token_count = len(tokens)
print(f"Total tokens for gpt-4.1-mini: {token_count:,}")

In [None]:
# Load every markdown file, one sub-folder of knowledge-base/ at a time, and tag
# each document with the name of the folder it came from.
#
# The glob also matches plain files sitting directly inside knowledge-base/, and
# DirectoryLoader refuses a path that is not a directory, so keep directories
# only. Sorting makes the load order (and so the chunk order) reproducible.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))

documents = []
for folder in folders:
    # The folder name (e.g. the last path component) becomes the document type
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={'encoding': 'utf-8'})
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

print(f"Loaded {len(documents)} documents")

In [None]:
# Split the loaded documents into overlapping chunks, configured with
# chunk_size=1000 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Divided into {len(chunks)} chunks")

In [None]:
# Embedding model used to vectorize the chunks; the commented-out line below is
# the OpenAI alternative.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# embeddings = OpenAIEmbeddings()

# If the persist directory already exists, delete its collection before
# rebuilding - presumably so that re-running this cell does not store the same
# chunks twice.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Embed every chunk and persist the resulting store under db_name
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# NOTE: _collection is a private attribute of the Chroma wrapper
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Render the text of the second chunk (index 1) as Markdown to see what a chunk looks like
display(Markdown(chunks[1].page_content))

In [None]:
# Inspect the collection behind the vector store: how many vectors it holds
# and how many dimensions each one has.
collection = vectorstore._collection
count = collection.count()

# Fetch a single stored embedding just to measure its length
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# Gather the vectors, documents and metadata

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: from here on `documents` holds the chunk texts returned by the collection,
# replacing the list of loaded documents built earlier in the notebook.
documents = result['documents']
metadatas = result['metadatas']

# Prefer the doc_type tag written when the documents were loaded. Fall back to the
# second component of the source path for entries without the tag, normalising
# Windows backslashes first so the split does not come back as a single element.
doc_types = [
    metadata.get('doc_type') or metadata['source'].replace('\\', '/').split('/')[1]
    for metadata in metadatas
]

# One colour per document type; an unexpected type is drawn in gray rather than
# aborting the cell with a ValueError.
color_by_type = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [color_by_type.get(t, 'gray') for t in doc_types]

In [None]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    # Hover label: the document type plus the first 100 characters of the chunk
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# `scene` only configures the axes of 3D plots, so for this 2D figure the axis
# titles are set directly on the layout with xaxis_title / yaxis_title.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Let's try 3D!

# Same reduction as before, but keeping three t-SNE components
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label for each point: document type plus the first 100 characters of its text
hover_labels = [
    f"Type: {kind}<br>Text: {text[:100]}..."
    for kind, text in zip(doc_types, documents)
]

# Build the 3D scatter trace, one marker per stored vector
scatter_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[scatter_3d])

# Title, axis labels for the 3D scene, and figure size / margins
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [None]:
# Create the chat model, using the model name held in MODEL
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# How many chunks to provide in each prompt (k=3)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Simple prompt: the context goes into the system message via {context} and the
# question into the human message via {input}. No chat history is included.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the question based on the context:\n{context}"),
    ("human", "{input}")
])

# Create the chain: a document-stuffing QA chain wrapped in a retrieval chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Invoke it and print the "answer" entry of the result
# NOTE: this rebinds the notebook-level name `result` used by the earlier cells
query = "Please explain what Insurellm is in a couple of sentences"
result = rag_chain.invoke({"input": query})
print(result["answer"])

## Now check out ingest.py

Then run at the terminal:

`uv run ingest.py`

In [None]:
# IPython shell escape: runs `uv run ingest.py` from inside the notebook
!uv run ingest.py

## Now check out answer.py

In [None]:
# Both helpers are imported from the answer module (answer.py, see the heading above)
from answer import fetch_context, answer_question

# As the last expression in the cell, the value returned by fetch_context is displayed
fetch_context("Who is Avery?")

In [None]:
# answer_question is awaited and its result unpacked into two values; the first
# is rendered as Markdown below.
# NOTE: this rebinds the notebook-level names `result` and `chunks` used by earlier cells.
result, chunks = await answer_question("Who is Avery?")
display(Markdown(result))

## Now check out app.py

In [None]:
# IPython shell escape: runs `uv run app.py` from inside the notebook
!uv run app.py

## OK - Now it's time to EVALUATE!

### First check out tests.jsonl for all the questions

And see how it's loaded in test.py


In [None]:
# load_tests comes from the test module (test.py, see the heading above)
from test import load_tests

test_data = load_tests()

# Show how many tests were loaded, then two sample entries (indexes 0 and 10)
print(len(test_data))
print(test_data[0])
print(test_data[10])



In [None]:
# Print the distinct values of the `category` attribute across all loaded tests
print({case.category for case in test_data})


## Now take a look at eval.py

In [None]:
# Both evaluators are imported from the eval module (eval.py, see the heading above)
from eval import evaluate_retrieval, evaluate_answer

# Evaluate retrieval for a single test (index 1); the returned value is displayed by the cell
evaluate_retrieval(test_data[1])

In [None]:
# evaluate_answer is awaited here, on the same test (index 1) as the retrieval evaluation
await evaluate_answer(test_data[1])

## AND FINALLY - it all comes together in a UI

In [None]:
# IPython shell escape: runs `uv run evaluator.py` from inside the notebook
!uv run evaluator.py

## Ideas for your experiments

### Quick wins

- Experiment with the encoder
- Experiment with chunking strategies

### Big change ideas

1. Pre-processing - use an LLM to rewrite (a) the chunks and/or (b) the questions / conversation history
2. Hierarchical RAG - summarize at different levels and do RAG over summaries
3. Tools!