In [None]:
# RAG Implementation - First Steps

This notebook demonstrates the foundational concepts of Retrieval-Augmented Generation (RAG):
1. Setting up embeddings and vector stores
2. Splitting documents into chunks
3. Storing and retrieving similar documents
4. Building a RAG chain with an LLM

## Step 1: Install Required Packages

In [None]:
# Install all required LangChain packages
# Uses the kernel's own interpreter (sys.executable) so packages land in the
# environment this notebook is actually running in.
import subprocess
import sys

packages = [
    "langchain",
    "langchain-chroma",
    "langchain-openai",
    "langchain-core",
    # Imported directly below (langchain_text_splitters); list it explicitly
    # rather than relying on it arriving as a transitive dependency.
    "langchain-text-splitters",
    "python-dotenv",
    "chromadb"
]

print("Installing required packages...\n")

for package in packages:
    print(f"Installing {package}...")
    # check_call raises CalledProcessError if pip exits non-zero, stopping the cell
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
    print(f"✓ {package} installed successfully")

print("\n✓ All packages installed successfully!")
print("\nInstalled packages:")
for package in packages:
    print(f"  - {package}")

## Step 2: Import Required Libraries

In [None]:
# imports
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter

# load .env file
# load_dotenv returns False when the file is missing or defines no variables.
# Warn here (without stopping the notebook) so a missing key is noticed now
# rather than as an authentication error in a later cell.
if not load_dotenv('../.env'):
    print("⚠ No variables were loaded from '../.env' - make sure OPENAI_API_KEY is already set in your environment")

print("✓ All imports successful")

## Step 3: Initialize Embeddings & ChromaDB Vector Store

ChromaDB is an embedded vector database that stores document embeddings.
We'll use OpenAI's text-embedding-3-large model to create embeddings.

In [None]:
# Names used below; defined once so the status messages cannot drift from
# the values actually passed to the constructors.
EMBEDDING_MODEL = "text-embedding-3-large"
COLLECTION_NAME = "test_collection"

# Get Embeddings Model
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Initialize ChromaDB as Vector Store
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings
)

print(f"✓ Embeddings model initialized: {EMBEDDING_MODEL}")
print(f"✓ ChromaDB vector store created: {COLLECTION_NAME}")

## Step 4: Load and Split Documents

We'll read a document and split it into chunks.
Each chunk will be embedded and stored in the vector database.

In [None]:
# Chunking parameters, defined once so the splitter and the printed summary agree
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Read in State of the Union Address File
# Explicit UTF-8 so the read does not depend on the platform's default encoding
with open("2024_state_of_the_union.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

print("✓ Document loaded")
print(f"  Total length: {len(state_of_the_union)} characters")

# Initialize Text Splitter
text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len
)

# Create Documents (Chunks) From File
texts = text_splitter.create_documents([state_of_the_union])

print(f"✓ Document split into {len(texts)} chunks")
print(f"  Chunk size: {CHUNK_SIZE} characters")
print(f"  Overlap: {CHUNK_OVERLAP} characters")

# Save Document Chunks to Vector Store
# Position-based ids make this cell safe to re-run: the same chunk is written
# under the same id instead of being stored again under a new random id.
chunk_ids = [f"sotu-2024-chunk-{index}" for index in range(len(texts))]
ids = vector_store.add_documents(texts, ids=chunk_ids)

print(f"✓ {len(ids)} document chunks added to vector store")

## Step 5: Test Semantic Similarity Search

Now we can search the vector store for semantically similar documents.

In [None]:
# Query the Vector Store
# The query and result count are defined once so the printed query is always
# the one that is actually searched for.
search_query = 'Who invaded Ukraine?'
top_k = 2

print(f"Search Query: '{search_query}'\n")
results = vector_store.similarity_search(
    search_query,
    k=top_k
)

print(f"✓ Retrieved {len(results)} most relevant chunks:\n")

# Print Resulting Chunks
for i, res in enumerate(results, 1):
    print(f"--- Result {i} ---")
    print(f"{res.page_content}")
    print(f"Metadata: {res.metadata}\n")

## Step 6: Build the RAG Pipeline

Now we create the complete RAG chain:
1. **Retriever**: Gets relevant documents from vector store
2. **Formatter**: Joins the retrieved documents into a single string
3. **Prompt**: Templates the context and question
4. **LLM**: Generates answer based on context
5. **Output Parser**: Extracts the text response

In [None]:
# Step 1: Create Document Parsing Function
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in input order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    chunk_texts = [doc.page_content for doc in docs]
    return "\n\n".join(chunk_texts)

print("✓ Document formatter created")

In [None]:
# Step 2: Set Chroma as the Retriever
# Wrap the vector store in a retriever (default settings) so it can be
# composed into the chain built further down.
retriever = vector_store.as_retriever()

print("✓ Retriever created from vector store")

In [None]:
# Step 3: Initialize the LLM
# Model name defined once so the status message always matches the model in use
LLM_MODEL = "gpt-4o-mini"
llm = ChatOpenAI(model=LLM_MODEL)

print(f"✓ LLM initialized: {LLM_MODEL}")

In [None]:
# Step 4: Create the Prompt Template
# Two placeholders are filled at run time: {context} and {query}.
# The instructions tell the model to apologise and say it does not know
# when the supplied context does not contain the answer.
prompt_template = """Use the context provided to answer the user's question below. If you do not know the answer based on the context provided, tell the user that you do not know the answer to their question based on the context provided and that you are sorry.

context: {context}

question: {query}

answer: """

# Build the prompt object from the raw template text
custom_rag_prompt = PromptTemplate.from_template(template=prompt_template)

print("✓ Prompt template created")

In [None]:
# Step 5: Create the RAG Chain
# Flow: input -> {context, query} mapping -> prompt -> LLM -> output parser
rag_inputs = {
    # the input goes through the retriever, then format_docs joins the hits
    "context": retriever | format_docs,
    # the same input is passed through untouched for the {query} placeholder
    "query": RunnablePassthrough(),
}
rag_chain = rag_inputs | custom_rag_prompt | llm | StrOutputParser()

print("✓ RAG chain created successfully!")
print("\nChain structure:")
for stage_line in (
    "  1. Retriever: Gets relevant docs from vector store",
    "  2. Formatter: Converts docs to string",
    "  3. Prompt: Combines context and query",
    "  4. LLM: Generates answer",
    "  5. Parser: Extracts text output",
):
    print(stage_line)

## Step 7: Test the RAG Chain

Now test the RAG chain with queries related to the document content.

In [None]:
# Test Query 1: Question about document content
# The query is defined once so the text shown is exactly the text sent to the chain
query1 = "According to the 2024 state of the union address, Who invaded Ukraine?"

print("=" * 70)
print("TEST 1: Query Based on Document Content")
print("=" * 70)
print(f"\nQuery: {query1}\n")

response1 = rag_chain.invoke(query1)

print("Answer:")
print(response1)

In [None]:
# Test Query 2: Question NOT in document (tests "I don't know" behavior)
# The query is defined once so the text shown is exactly the text sent to the chain
query2 = "What is the purpose of life?"

print("\n" + "=" * 70)
print("TEST 2: Query NOT Based on Document Content")
print("=" * 70)
print(f"\nQuery: {query2}\n")

response2 = rag_chain.invoke(query2)

print("Answer:")
print(response2)