### Pick a document

The source can be a file (CSV, PDF, TXT, etc.) or a website.

#### PDF

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the PDF to ingest, relative to the notebook's working directory.
pdf_path = "context/Weller et al. - 2025 - On the Theoretical Limitations of Embedding-Based Retrieval.pdf"

# mode="single" requests the whole PDF as one document instead of one per page.
loader = PyPDFLoader(
    file_path=pdf_path,
    mode="single",
)
docs = loader.load()

print(f"Loaded PDF as {len(docs):d} document(s)")
print("-"*50)
print("Metadata:")
for k, v in docs[0].metadata.items():
    print(f"{k}: {v}")
print("-"*50)
print("Page content (preview):")
# Append the ellipsis only when the text was actually cut off, matching the
# preview logic of the other loader cells.
print(docs[0].page_content if len(docs[0].page_content) < 500 else docs[0].page_content[:500] + "...")

#### Wikipedia

In [None]:
from langchain_community.document_loaders import WikipediaLoader

# Query Wikipedia with a free-text string; load_max_docs caps how many
# documents the loader returns.
wiki_query = "What is the capital of France?"
loader = WikipediaLoader(query=wiki_query, load_max_docs=10)
docs = loader.load()

print("Loaded {:d} document(s)".format(len(docs)))
print("-" * 50)
# One metadata dict per loaded document.
for doc in docs:
    print(doc.metadata)

#### Custom Document

In [None]:
from langchain_core.documents import Document

page_content = "Hello, world!"

# Wrap the raw text in a Document and keep it in a one-element list, so that
# `docs` has the same shape as in the other loader cells.
doc = Document(
    page_content=page_content,
    metadata={
        "source": "My custom document",
        "title": "Title of my custom document",
    },
)
docs = [doc]

print("-" * 50)
print("Metadata:")
for key, value in docs[0].metadata.items():
    print(f"{key}: {value}")
print("-" * 50)
print("Page content (preview):")
# Show the text in full when it is shorter than 500 characters; otherwise show
# the first 500 characters followed by an ellipsis.
text = docs[0].page_content
if len(text) < 500:
    print(text)
else:
    print(text[:500] + "...")

#### Website

Caveat: Different websites have different structures and require different parsing strategies.

In [None]:
import os
import re

# langchain_community resolves its default User-Agent header from this
# environment variable when the web loader module is first imported, so it must
# be set BEFORE the WebBaseLoader import below (otherwise the setting is ignored
# and a "USER_AGENT environment variable not set" warning is logged).
os.environ['USER_AGENT'] = ("Demo")

import bs4
from langchain_community.document_loaders import WebBaseLoader

# Alternative source (different site, same parsing strategy):
# loader = WebBaseLoader(
#     web_paths=("https://www.pff.com/news/nfl-scores-and-recaps-for-every-week-3-game",),
#     bs_kwargs=dict(
#         parse_only=bs4.SoupStrainer("article")
#     ),
# )

# Only parse the <article> element(s) of the page; navigation, ads, footers etc.
# are skipped by the SoupStrainer.
loader = WebBaseLoader(
    web_paths=("https://www.cbssports.com/nfl/news/nfl-week-3-grades-scores-results-highlights-browns-packers-vikings-bengals/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer("article")
    ),
)

docs = loader.load()

# Clean and condense the page content of every loaded document (one per URL in
# web_paths, so a single document here).
for doc in docs:
    # Trim every line, drop the blank ones and join the rest into a single line.
    stripped_lines = (line.strip() for line in doc.page_content.splitlines())
    condensed = " ".join(line for line in stripped_lines if line)
    # Collapse runs of two or more whitespace characters into a single space.
    doc.page_content = re.sub(r'\s{2,}', ' ', condensed)

print(f"Loaded Website as {len(docs):d} document(s)")
print("-"*50)
print("Metadata:")
for k, v in docs[0].metadata.items():
    print(f"{k}: {v}")
print("-"*50)
print("Page content (preview):")
# Preview at most 10000 characters; the ellipsis marks truncation.
print(docs[0].page_content if len(docs[0].page_content) < 10000 else docs[0].page_content[:10000] + "...")

### Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: chunk size and overlap, plus add_start_index so that each
# chunk's metadata records where the chunk starts in its source document.
splitter_options = dict(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks for the vector store.
doc_chunks = text_splitter.split_documents(docs)

print("Document split into {:d} chunks.".format(len(doc_chunks)))

# Preview the first three chunks as a sanity check.
for chunk_number, chunk in enumerate(doc_chunks[:3], start=1):
    print(f"\n\nChunk {chunk_number}")
    print("=" * 50)
    print("Metadata:")
    for key, value in chunk.metadata.items():
        print(f"{key}: {value}")
    print("-" * 50)
    print("Page content:")
    body = chunk.page_content
    print(body if len(body) < 500 else body[:500] + "...")

### Embed documents

In [None]:
# Pick exactly one embedding model by leaving a single line uncommented.
#embedding_model_name = "intfloat/multilingual-e5-base"
#embedding_model_name = "Qwen/Qwen2.5-0.5B"
#embedding_model_name = "Qwen/Qwen3-Embedding-0.6B"
embedding_model_name = "Qwen/Qwen3-Embedding-4B"
#embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

from langchain_huggingface import HuggingFaceEmbeddings

# Embedding function shared by every vector store in this notebook.
embedding_function = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    # NOTE(review): the device is hard-coded to "cuda"; presumably this cell
    # fails on a machine without a CUDA GPU — switch to "cpu" there.
    model_kwargs={"device": "cuda"},
    # Ask the encoder for normalized embeddings.
    encode_kwargs={"normalize_embeddings": True},
)

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

# Create an in-memory store backed by `embedding_function`, then add every
# chunk to it; add_documents returns the IDs assigned to the stored chunks.
vector_store = InMemoryVectorStore(embedding=embedding_function)
document_chunk_ids = vector_store.add_documents(documents=doc_chunks)

print("Added {:d} documents to the vector store".format(len(document_chunk_ids)))

In [None]:
# Inspect vector store: print the first n_chunks entries of the in-memory store.
n_chunks = 5

# `vector_store.store` maps chunk IDs to entries that are read below via the
# keys 'vector', 'metadata' and 'text'. Slicing up front replaces the manual
# counter/break, and `chunk_id` avoids shadowing the builtin `id`.
first_entries = list(vector_store.store.items())[:n_chunks]

for position, (chunk_id, entry) in enumerate(first_entries, start=1):
    print(f"Chunk {position}")
    print("-"*50)
    print(f"id: {chunk_id}")
    print(f"vector (length: {len(entry['vector'])}): {entry['vector']}")
    print(f"metadata: {entry['metadata']}")
    # Preview at most 100 characters of the chunk text.
    print(f"text:\n{entry['text'] if len(entry['text']) < 100 else entry['text'][:100] + '...'}\n\n")

In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Directory the FAISS store is written to and read back from.
vector_store_path = "vector_store"

# Embed a throwaway query once, only to learn the embedding dimensionality
# that the FAISS index has to be created with.
probe_vector = embedding_function.embed_query("test")
embedding_dim = len(probe_vector)

# Start from an empty flat L2 index and an empty docstore / ID mapping.
index = faiss.IndexFlatL2(embedding_dim)
vector_store = FAISS(
    embedding_function=embedding_function,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Embed and store all chunks.
document_chunk_ids = vector_store.add_documents(documents=doc_chunks)
print("Added {:d} documents to the vector store".format(len(document_chunk_ids)))

# Round-trip through disk: persist the store, then load it again.
vector_store.save_local(vector_store_path)

# NOTE: allow_dangerous_deserialization opts in to pickle-based loading. That is
# acceptable here only because the files were written by the save_local call
# directly above; never enable it for files from an untrusted source.
vector_store = FAISS.load_local(
    vector_store_path,
    embeddings=embedding_function,
    allow_dangerous_deserialization=True,
)

In [None]:
# Inspect FAISS vector store: summary figures plus a preview of the first chunks.
n_chunks = 5
print(f"Total documents in FAISS vector store: {vector_store.index.ntotal}")
print(f"Vector dimension: {vector_store.index.d}")
print("-"*50)

# Get the first n_chunks document IDs from the index_to_docstore_id mapping.
# The slice already bounds the loop below, so no extra break is needed.
docstore_ids = list(vector_store.index_to_docstore_id.values())[:n_chunks]

for chunk_number, doc_id in enumerate(docstore_ids, start=1):
    # Look the stored document up by its docstore ID.
    doc = vector_store.docstore.search(doc_id)
    print(f"Chunk {chunk_number}")
    print("-"*50)
    print(f"Document ID: {doc_id}")
    print(f"Metadata: {doc.metadata}")
    # Preview at most 100 characters of the chunk text.
    print(f"Text:\n{doc.page_content if len(doc.page_content) < 100 else doc.page_content[:100] + '...'}\n\n")

### Test vector store

In [None]:
# Test vector store with a sample query
query = "Who was the Vikings' starting quarterback in week 3?"

top_k = 5
# Returns up to top_k (Document, score) pairs.
# NOTE(review): with the FAISS store created from faiss.IndexFlatL2 the score is
# presumably an L2 distance (lower = more similar) — confirm if another vector
# store is active when this cell runs.
similar_document_chunks = vector_store.similarity_search_with_score(query, k=top_k)

print(f"List of {len(similar_document_chunks):d} most similar document chunks for query: '{query:s}'")

# Each hit is a Document object (attribute access, not dict-style indexing) and
# carries no embedding vector, so print the score, metadata and a text preview.
for rank, (doc, score) in enumerate(similar_document_chunks, start=1):
    print(f"Chunk {rank}")
    print("-"*50)
    print(f"score: {score}")
    print(f"metadata: {doc.metadata}")
    print(f"text:\n{doc.page_content if len(doc.page_content) < 100 else doc.page_content[:100] + '...'}\n\n")

### Set up RAG query

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# System message: fixes the assistant's role.
system_message = """
    You are an AI assistant that answers questions based on provided context documents. 
    """

# Human message: the answering rules plus the two template variables,
# {context} and {input}, that are filled in when the prompt is used.
human_message = """
    Answer the question based on the context.

    CRITICAL RULES:
    - Answer concisely
    - Use information from the provided context
    - If the context doesn't contain enough information, state this clearly
    - Cite specific details from the context when possible

    CONTEXT:
    {context}

    QUESTION: {input}

    ANSWER:
    """

# Prompt template for the RAG system, assembled from the two messages above.
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_message),
])

### Load LLM

In [None]:
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "google/gemma-3-1b-it"
print(f"Loading llm <{model_name:s}>")

# Load the tokenizer and the causal-LM weights for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create text generation pipeline
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,  # Return only the generated text, not the prompt
    max_new_tokens=250,      # Limit answer length for concise RAG responses
    # temperature / top_p only apply when sampling is enabled. Setting do_sample
    # explicitly keeps the behaviour independent of the checkpoint's default
    # generation config (which may or may not enable sampling).
    do_sample=True,
    temperature=0.2,         # Low randomness, mostly deterministic
    top_p=0.95,              # Sample from top 95% probable tokens
    repetition_penalty=1.2,  # Penalize repeated content to improve answer quality
)

# Wrap pipeline for LangChain
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

### The stuff documents chain

The stuff documents chain works in three steps:

1. It takes your `dummy_docs` (for example, one document with Peter's first name and one with his last name) and smooshes them all together into one big text.

2. It takes your `chat_prompt` (the template that says "Hey AI, here's some context...") and fills in the template with the smooshed-together documents.

3. It sends everything to the `llm` (your AI friend) and gets back a nice, smart answer.

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
# Imported here as well so this cell also works when the optional
# "Custom Document" cell (the only other place that imports it) was skipped.
from langchain_core.documents import Document

# Chain that puts the given documents into the prompt's {context} slot and
# sends the filled-in prompt to the LLM.
combine_docs_chain = create_stuff_documents_chain(llm, chat_prompt)

# Retriever over the current vector store, returning up to 5 chunks per query.
# NOTE(review): LangChain documents `score_threshold` for
# search_type="similarity_score_threshold"; with search_type="similarity" it is
# not a 0-1 relevance cutoff (it may be ignored or applied to the raw distance,
# depending on the store). Confirm which behaviour is intended.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 5,
        "score_threshold": 0.7,
    }
)

# Two tiny hand-written documents for testing the chain without retrieval.
dummy_docs = [
    Document(
        page_content="My first name is Peter.",
        metadata={"doc-nr.": "1"},
    ),
    Document(
        page_content="My last name is Parker.",
        metadata={"doc-nr.": "2"},
    ),
]

In [None]:
# Run the stuff-documents chain directly on the two hand-written documents;
# the response is formatted as a string below.
chain_inputs = {
    "context": dummy_docs,
    "input": "What is the person's full name?",
}
response = combine_docs_chain.invoke(chain_inputs)

print("Response:")
print("-" * 50)
print("{:s}".format(response))

### Retrieval chain

We don't want to pass ALL of the context to the LLM, just the most relevant parts.

So we use the retriever to fetch the most relevant chunks, and then the stuff documents chain to answer the question from them.

In [None]:
from langchain.chains import create_retrieval_chain

# Combine the retriever with the stuff-documents chain into one retrieval chain.
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Ask a generic question; the result is read via its 'answer' and 'context'
# keys in the next cell.
overview_question = "What is this document about?"
response = retrieval_chain.invoke({"input": overview_question})

In [None]:
# Show the generated answer first.
print("ANSWER")
print("-" * 50)
print("{:s}".format(response['answer']))
print("\n\n")

# Then list the documents stored under the response's 'context' key.
print("CONTEXT")
print("-" * 50)
for doc_number, doc in enumerate(response['context'], start=1):
    print(f"Document {doc_number}:")
    for key, value in doc.metadata.items():
        print(f"\t{key}: {value}")
    # Preview at most 300 characters of each document.
    content = doc.page_content
    excerpt = content if len(content) < 300 else content[:300] + '...'
    print(f"\tPage content: {excerpt}")
    print("\n")
print("\n\n")

### Compare LLM vs RAG

In [None]:
question = "Who was the Vikings' starting quarterback in week 3?"
print(f"Question: {question}")
print("\n")

# 1) Plain LLM: the question is sent as-is, without any retrieved context.
print("LLM without context:")
print("-" * 50)
llm_response = llm.invoke(question)
print(f"Answer: {llm_response}")
print("\n\n")

# 2) RAG: the same question goes through the retrieval chain.
print("RAG system with context:")
print("-" * 50)
rag_response = retrieval_chain.invoke({"input": question})
print(f"Answer: {rag_response['answer']}")

# List the documents stored under the RAG response's 'context' key.
print("Context:")
for doc_number, doc in enumerate(rag_response['context'], start=1):
    print(f"Document {doc_number}:")
    for key, value in doc.metadata.items():
        print(f"\t{key}: {value}")
    # Preview at most 300 characters of each document.
    content = doc.page_content
    excerpt = content if len(content) < 300 else content[:300] + '...'
    print(f"\tPage content: {excerpt}")
    print("\n")
print("\n\n")