In [None]:
# RAG Implementation - First Steps

This notebook demonstrates the foundational concepts of Retrieval-Augmented Generation (RAG):
1. Setting up embeddings and vector stores
2. Splitting documents into chunks
3. Storing and retrieving similar documents
4. Building a RAG chain with an LLM

## Step 1: Install Required Packages

In [None]:
# Install all required LangChain packages
import subprocess
import sys

# Every third-party distribution imported by this notebook.
# "langchain-text-splitters" is listed explicitly because the imports cell
# uses `langchain_text_splitters`; relying on it arriving as a transitive
# dependency of another package is fragile.
packages = [
    "langchain",
    "langchain-chroma",
    "langchain-openai",
    "langchain-core",
    "langchain-text-splitters",
    "python-dotenv",
    "chromadb"
]

print("Installing required packages...\n")

# One pip invocation for the whole list so the resolver sees all requirements
# together; sys.executable targets the interpreter running this kernel.
# check_call raises CalledProcessError if pip exits with a non-zero status.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *packages])

print("\n✓ All packages installed successfully!")
print("\nInstalled packages:")
for package in packages:
    print(f"  - {package}")

## Step 2: Import Required Libraries

In [None]:
# imports
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter

# load .env file
# The path is resolved relative to the kernel's working directory.
# load_dotenv() does not raise when the file is missing; it returns False if
# no variable was set, so report that here rather than failing later at the
# first OpenAI call.
env_loaded = load_dotenv('../.env')
if not env_loaded:
    print("⚠ '../.env' was not loaded - make sure OPENAI_API_KEY is already set in the environment")

print("✓ All imports successful")

## Step 3: Initialize Embeddings & ChromaDB Vector Store

ChromaDB is an embedded vector database that stores document embeddings.
We'll use OpenAI's text-embedding-3-large model to create embeddings.

In [None]:
# Embedding model that vectorizes both the stored chunks and later queries
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Chroma collection backed by the embedding model above.
# NOTE(review): no persist_directory is passed, so the collection presumably
# lives only in this kernel process - confirm if persistence is expected.
vector_store = Chroma(embedding_function=embeddings, collection_name="test_collection")

# Status summary, one line per component
print(
    "✓ Embeddings model initialized: text-embedding-3-large",
    "✓ ChromaDB vector store created: test_collection",
    sep="\n",
)

## Step 4: Load and Split Documents

We'll read a document and split it into chunks.
Each chunk will be embedded and stored in the vector database.

In [None]:
# Splitter settings, named once so the summary printed below cannot drift
# away from the values actually passed to the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Read in State of the Union Address File
# The encoding is explicit because open() otherwise uses the platform default
# (e.g. cp1252 on Windows). Assumes the file is UTF-8 - TODO confirm.
with open("2024_state_of_the_union.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

print("✓ Document loaded")
print(f"  Total length: {len(state_of_the_union)} characters")

# Initialize Text Splitter
text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len
)

# Create Documents (Chunks) From File
texts = text_splitter.create_documents([state_of_the_union])

print(f"✓ Document split into {len(texts)} chunks")
print(f"  Chunk size: {CHUNK_SIZE} characters")
print(f"  Overlap: {CHUNK_OVERLAP} characters")

# Save Document Chunks to Vector Store
# Position-based ids make this cell safe to re-run: without explicit ids a
# fresh random id is generated per chunk, so every re-run would append a
# duplicate copy of each chunk to the collection.
# NOTE(review): relies on the store upserting on an existing id - confirm
# against the installed langchain-chroma version.
chunk_ids = [f"state_of_the_union-{index}" for index in range(len(texts))]
ids = vector_store.add_documents(texts, ids=chunk_ids)

print(f"✓ {len(ids)} document chunks added to vector store")