# 04. Building Intelligent RAG Systems

## 0. 安装依赖

In [93]:
%uv pip install jq~=1.10 langchain-chroma~=0.2 langchain-community~=0.3 langchain-core~=0.3 langchain-openai~=0.3.0 python-dotenv transformers~=4.56

[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K[2mResolved [1m118 packages[0m [2min 334ms[0m[0m                                       [0m
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m19 packages[0m [2min 51ms[0m[0m=0.3.30                        [0m
 [32m+[39m [1maiohappyeyeballs[0m[2m==2.6.1[0m
 [32m+[39m [1maiohttp[0m[2m==3.12.15[0m
 [32m+[39m [1maiosignal[0m[2m==1.4.0[0m
 [32m+[39m [1mdataclasses-json[0m[2m==0.6.7[0m
 [32m+[39m [1mfrozenlist[0m[2m==1.7.0[0m
 [32m+[39m [1mgreenlet[0m[2m==3.2.4[0m
 [32m+[39m [1mhttpx-sse[0m[2m==0.4.1[0m
 [32m+[39m [1mjq[0m[2m==1.10.0[0m
 [32m+[39m [1mlangchain[0m[2m==0.3.27[0m
 [32m+[39m [1mlangchain-community[0m[2m==0.3.30[0m
 [32m+[39m [1mlangchain-text-splitters[0m[2m==0.3.11[0m
 [32m+[39m [1mmarshmallow[0m[2m==3.26.1[0m
 [32m+[39m [1mmultidict[0m[2m==6.6.4[0m
 [32m+[39m [1mmypy-extensions[0m[2m==1.1.0[0m
 [32m+

工具类

In [None]:
import os

import dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


class Config:
    """Settings read from the environment plus factories for OpenAI-compatible clients.

    ``OPENAI_API_KEY``, ``OPENAI_API_BASE_URL`` and ``OPENAI_MODEL`` are
    mandatory. ``OPENAI_VL_MODEL`` and ``OPENAI_EMBEDDINGS_MODEL`` are optional
    here and are only validated by the factory method that needs them.
    ``HF_PRETRAINED_EMBEDDINGS_MODEL`` falls back to ``Qwen/Qwen3-Embedding-8B``.
    """

    def __init__(self):
        """Load the nearest ``.env`` file and capture the settings.

        Raises:
            ValueError: If no ``.env`` file is found, or if one of the
                mandatory variables is missing or empty.
        """
        # The search starts in the current working directory (usecwd=True) and
        # walks upwards. load_dotenv does not override variables that are
        # already present in the environment.
        env_file = dotenv.find_dotenv(usecwd=True)
        if not env_file:
            raise ValueError("No .env file found")
        dotenv.load_dotenv(dotenv_path=env_file)

        # Mandatory settings, checked one after another.
        self.api_key = os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OPENAI_API_KEY is not set")

        self.base_url = os.getenv("OPENAI_API_BASE_URL")
        if not self.base_url:
            raise ValueError("OPENAI_API_BASE_URL is not set")

        self.model = os.getenv("OPENAI_MODEL")
        if not self.model:
            raise ValueError("OPENAI_MODEL is not set")

        # Optional settings; the factories below validate them on demand.
        self.vl_model = os.getenv("OPENAI_VL_MODEL")
        self.embeddings_model = os.getenv("OPENAI_EMBEDDINGS_MODEL")
        self.hf_pretrained_embeddings_model = (
            os.getenv("HF_PRETRAINED_EMBEDDINGS_MODEL") or 'Qwen/Qwen3-Embedding-8B'
        )

    def _chat_model(self, model_name, /, **kwargs) -> ChatOpenAI:
        """Build a ChatOpenAI client for ``model_name`` with the shared key and base URL."""
        # Reference: https://bailian.console.aliyun.com/?tab=api#/api/?type=model&url=2587654
        # Reference: https://help.aliyun.com/zh/model-studio/models
        # ChatOpenAI docs: https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html#langchain_openai.chat_models.base.ChatOpenAI
        return ChatOpenAI(
            api_key=self.api_key, base_url=self.base_url, model=model_name, **kwargs
        )

    def new_openai_like(self, **kwargs) -> ChatOpenAI:
        """Return a chat client for ``OPENAI_MODEL``; ``kwargs`` go to ``ChatOpenAI``."""
        return self._chat_model(self.model, **kwargs)

    def new_openai_like_embeddings(self, **kwargs) -> OpenAIEmbeddings:
        """Return an embeddings client for ``OPENAI_EMBEDDINGS_MODEL``.

        Raises:
            ValueError: If ``OPENAI_EMBEDDINGS_MODEL`` is not set.
        """
        model_name = self.embeddings_model
        if not model_name:
            raise ValueError("OPENAI_EMBEDDINGS_MODEL is not set")

        # OpenAIEmbeddings docs: https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings
        return OpenAIEmbeddings(
            api_key=self.api_key,
            base_url=self.base_url,
            model=model_name,
            # https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings.tiktoken_enabled
            # For implementations that are not the official OpenAI one, set
            # this to False. Token handling then falls back to the
            # HuggingFace transformers AutoTokenizer.
            tiktoken_enabled=False,
            # https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings.model
            # According to the Yuanbao assistant, Jina's embedding model
            # https://huggingface.co/jinaai/jina-embeddings-v4 is the closest
            # to text-embedding-ada-002. Qwen/Qwen3-Embedding-8B is the default
            # here out of personal preference.
            # tiktoken_model_name='Qwen/Qwen3-Embedding-8B',
            tiktoken_model_name=self.hf_pretrained_embeddings_model,
            **kwargs
        )

    def new_openai_like_vl(self, **kwargs) -> ChatOpenAI:
        """Return a chat client for ``OPENAI_VL_MODEL``.

        Raises:
            ValueError: If ``OPENAI_VL_MODEL`` is not set.
        """
        if not self.vl_model:
            raise ValueError("OPENAI_VL_MODEL is not set")

        return self._chat_model(self.vl_model, **kwargs)

## From indexes to intelligent retrieval

## Components of a RAG system

### When to implement RAG

## From embeddings to search

### Embeddings

In [None]:
# Build the embeddings client from the environment-backed configuration
embeddings_model = Config().new_openai_like_embeddings()

# Three example sentences: the first two are paraphrases, the third is unrelated
text1 = "The cat sat on the mat"
text2 = "A feline rested on the carpet"
text3 = "Python is a programming language"

# Embed all three sentences in one batch call through LangChain
embeddings = embeddings_model.embed_documents([text1, text2, text3])

# One vector per sentence, in the same order as the inputs
# (text1 and text2 are expected to end up close to each other)
embedding1, embedding2, embedding3 = embeddings[0], embeddings[1], embeddings[2]

# Report how many vectors came back and how long each one is
print(f"Number of documents: {len(embeddings)}")
print(f"Dimensions per embedding: {len(embeddings[0])}")
# The dimension depends on the configured embeddings model
# (1536 for OpenAI's text-embedding-ada-002)

### Vector stores

#### Vector stores comparison

#### Hardware considerations for vector stores

#### Vector store interface in LangChain

In [None]:
# from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

# Initialize with an embedding model
# embeddings = OpenAIEmbeddings()
embeddings = Config().new_openai_like_embeddings()

# Create some sample documents; each one carries its ID in the metadata
docs = [
    Document(page_content="Content about language models", metadata={"id": "doc_1"}),
    Document(page_content="Information about vector databases", metadata={"id": "doc_2"}),
    Document(page_content="Details about retrieval systems", metadata={"id": "doc_3"})
]

# Create the vector store
vector_store = Chroma(embedding_function=embeddings)

# Add documents with explicit IDs. The IDs have to be passed via `ids=`;
# an "id" key in the metadata is stored as ordinary metadata only.
vector_store.add_documents(docs, ids=[doc.metadata["id"] for doc in docs])

# Similarity Search with appropriate k value
results = vector_store.similarity_search("How do language models work?", k=2)

# Keep k small here: the store only holds the three sample documents
found_docs = vector_store.similarity_search("retrieval", k=1)
print(f"Found documents: {len(found_docs)}")

### Vector indexing strategies

faiss 库不支持 python3.12。google 的 ScaNN 库没找到接口文档。
TODO：用 ScaNN 复现书中代码。

## Breaking down the RAG pipeline

1. Load documents

In [None]:
from langchain_community.document_loaders import JSONLoader

# Load a json file
# The result is bound to `documents`, which later cells index, split and retrieve from.
loader = JSONLoader(
    file_path="static/knowledge_base.json",
    jq_schema=".[].content",  # This extracts the content field from each array item
    text_content=True
)
documents = loader.load()

# Show the loaded documents
print(documents)

2. Make embedding model

In [None]:
# Embeddings client passed to the vector database in the next step
embedder = Config().new_openai_like_embeddings()

3. Store in vector database

In [None]:
from langchain_community.vectorstores import ScaNN

# Build a ScaNN vector store from the loaded `documents`, using `embedder` for the vectors
vector_db = ScaNN.from_documents(documents, embedder)

4. Retrieve similar docs

In [None]:
# Natural-language question to look up in `vector_db`
query = "What are the effects of climate change?"

# Last expression of the cell, so the notebook displays the returned documents
vector_db.similarity_search(query)

### Document processing

In [None]:
from langchain_community.document_loaders import JSONLoader

# Load a json file
# Same loader configuration as the "1. Load documents" cell; it rebinds `loader` and `documents`.
loader = JSONLoader(
    file_path="static/knowledge_base.json",
    jq_schema=".[].content",  # This extracts the content field from each array item
    text_content=True
)
documents = loader.load()

# Show the loaded documents
print(documents)

#### Chunking strategies

##### Fixed-size chunking

In [None]:
%uv pip install langchain-text-splitters~=0.3

In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Fixed-size chunking: split on the separator with a target chunk size of 200
# and an overlap of 20 between neighbouring chunks
text_splitter = CharacterTextSplitter(
    separator=" ", # Split on spaces to avoid breaking words
    chunk_size=200,
    chunk_overlap=20
)

# Split the loaded `documents` and report how many chunks were produced
chunks = text_splitter.split_documents(documents)
print(f"Generated {len(chunks)} chunks from document")

##### Recursive character chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive chunking: the separators are listed from coarsest (blank line)
# to finest (empty string, i.e. between any two characters)
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=150,
    chunk_overlap=20
)

# Small Markdown sample used as splitter input
document = """# Introduction to RAG
Retrieval-Augmented Generation (RAG) combines retrieval systems with generative AI models.

It helps address hallucinations by grounding responses in retrieved information.

## Key Components
RAG consists of several components:
1. Document processing
2. Vector embedding
3. Retrieval
4. Augmentation
5. Generation

### Document Processing
This step involves loading and chunking documents appropriately.
"""

# Last expression of the cell, so the notebook displays the resulting chunks
text_splitter.split_text(document)

##### Document-specific chunking

##### Semantic chunking

In [None]:
%uv pip install langchain-experimental~=0.3

In [None]:
from langchain_experimental.text_splitter import SemanticChunker

# SemanticChunker is given an embeddings model to work with
embeddings = Config().new_openai_like_embeddings()
text_splitter = SemanticChunker(
    embeddings=embeddings,
    add_start_index=True  # Include position metadata
)

# Uses the Markdown `document` string defined in the recursive-chunking cell
text_splitter.split_text(document)

##### Agent-based chunking

##### Multi-modal chunking

##### Choosing the right chunking strategy

#### Retrieval

##### LangChain retrievers

##### Vector store retrievers

In [None]:
from langchain_community.retrievers import KNNRetriever

embeddings = Config().new_openai_like_embeddings()

# Build a KNN retriever over the current `documents` list
retriever = KNNRetriever.from_documents(documents, embeddings)
# "query" is a placeholder search string
retriever.invoke("query")

In [None]:
%uv pip install xmltodict~=1.0

In [None]:
from langchain_community.retrievers.pubmed import PubMedRetriever

# PubMed retriever created with its default settings; rebinds `retriever`
retriever = PubMedRetriever()
# FIXME: this call has not been run successfully yet
results = retriever.invoke("chatgpt")

### Advanced RAG techniques

#### Hybrid retrieval: Combining semantic and keyword search

In [None]:
%uv pip install rank-bm25~=0.2

In [None]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# Setup semantic retriever
# NOTE(review): `vector_store` is whatever an earlier cell last bound to that
# name, while BM25 below indexes `documents` - confirm that both retrievers
# are meant to search the same corpus.
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# Setup lexical retriever
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 5

# Combine retrievers
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.7, 0.3],  # Weight semantic search higher than keyword search
)

# `invoke` replaces the deprecated `get_relevant_documents` and matches how
# the other retrievers in this notebook are called.
hybrid_retriever.invoke("climate change impacts")

#### Re-ranking

#### Query transformation: Improving retrieval through better queries

In [None]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt asking for three rewordings of the question; the trailing "1." starts the list
expansion_template = """Given the user question: {question}
Generate three alternative versions that express the same information need but with different wording:
1."""

expansion_prompt = PromptTemplate(
    template=expansion_template,
    input_variables=["question"],
)

# Pipeline: prompt -> chat model (temperature 0.7) -> plain string
llm = Config().new_openai_like(temperature=0.7)
expansion_chain = (
    expansion_prompt
    | llm
    | StrOutputParser()
)

# Generate the alternative queries and show the raw reply
original_query = "What are the effects of climate change?"
reply = expansion_chain.invoke(original_query)
print(reply)

##### Hypothetical Document Embeddings (HyDE)

In [None]:
from langchain.prompts import PromptTemplate
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Prompt asking the model to write a passage that could answer the question
hyde_template = """Based on the question: {question}
Write a passage that could contain the answer to this question:"""

hyde_prompt = PromptTemplate(
    template=hyde_template,
    input_variables=["question"],
)

# Pipeline: prompt -> chat model (temperature 0.2) -> plain string
llm = Config().new_openai_like(temperature=0.2)
hyde_chain = (
    hyde_prompt
    | llm
    | StrOutputParser()
)

# Let the model write the hypothetical document for this query
query = "What dietary changes can reduce carbon footprint?"
hypothetical_doc = hyde_chain.invoke(query)

# Search `vector_db` with the embedding of the hypothetical document
# instead of the embedding of the query itself
embeddings = Config().new_openai_like_embeddings()
embedded_query = embeddings.embed_query(hypothetical_doc)
vector_db.similarity_search_by_vector(embedded_query, k=3)

#### Context processing: maximizing retrieved information value

##### Contextual compression

In [None]:
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
# from langchain_openai import ChatOpenAI

# The compressor is built from a temperature-0 chat model
llm = Config().new_openai_like(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

# Create a basic retriever from the vector store
base_retriever = vector_db.as_retriever(search_kwargs={"k": 3})

# Wrap the base retriever together with the compressor
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)

# Last expression of the cell, so the notebook displays the result
compression_retriever.invoke("How do transformers work?")

##### Maximum marginal relevance

In [None]:
from langchain_community.vectorstores import ScaNN

from langchain_chroma import Chroma

# ScaNN does not implement max_marginal_relevance_search, so this demo uses
# Chroma, which does. The dedicated collection name keeps these documents
# apart from any store created under Chroma's default collection name.
vector_store = Chroma.from_documents(
    documents,
    embedding=embeddings,
    collection_name="mmr_demo",
)

vector_store.max_marginal_relevance_search(
    query="What are transformer models?",
    k=5, # Number of documents to return
    fetch_k=20, # Number of documents to initially fetch
    lambda_mult=0.5 # Diversity parameter (0 = max diversity, 1 = max relevance)
)

#### Response enhancement: Improving generator output

In [None]:
from langchain_core.documents import Document

# Example documents
# Each one carries "source" and "page" metadata, which the citation formatter further down reads.
# Note: this rebinds `documents`, which earlier cells used for the loaded knowledge base.
documents = [
    Document(
        page_content="The transformer architecture was introduced in the paper 'Attention is All You Need' by Vaswani et al. in 2017.",
        metadata={"source": "Neural Network Review 2021", "page": 42}
    ),
    Document(
        page_content="BERT uses bidirectional training of the Transformer, masked language modeling, and next sentence prediction tasks.",
        metadata={"source": "Introduction to NLP", "page": 137}
    ),
    Document(
        page_content="GPT models are autoregressive transformers that predict the next token based on previous tokens.",
        metadata={"source": "Large Language Models Survey", "page": 89}
    )
]

##### Source attribution

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import ScaNN


# Create a vector store and retriever
# (expects `documents` to be the example list from the preceding cell)
embeddings = Config().new_openai_like_embeddings()
vector_store = ScaNN.from_documents(documents, embeddings)
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Source attribution prompt template
# Placeholders: {question} is the user question, {sources} the formatted source list
attribution_prompt = ChatPromptTemplate.from_template("""
You are a precise AI assistant that provides well-sourced information.
Answer the following question based ONLY on the provided sources. For each fact or claim in your answer,
include a citation using [1], [2], etc. that refers to the source. Include a numbered reference list at the end.

Question: {question}

Sources:
{sources}

Your answer:
""")

In [None]:
from langchain_core.output_parsers import StrOutputParser


# Create a source-formatted string from documents
def format_sources_with_citations(docs):
    """Render documents as a numbered source list for a citation prompt.

    Each entry has the form ``[n] <source>[, page <page>]`` followed by the
    document text on the next line. Entries are separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``metadata`` dict and a
            ``page_content`` string.

    Returns:
        The formatted sources as one string; empty when ``docs`` is empty.
    """
    formatted_sources = []
    for i, doc in enumerate(docs, 1):
        source_info = f"[{i}] {doc.metadata.get('source', 'Unknown source')}"
        page = doc.metadata.get('page')
        # Test for "missing" explicitly instead of by truthiness, so that a
        # page number of 0 (zero-based page numbering) is still cited.
        if page not in (None, ""):
            source_info += f", page {page}"
        formatted_sources.append(f"{source_info}\n{doc.page_content}")
    return "\n\n".join(formatted_sources)

# Build the RAG chain with source attribution
def generate_attributed_response(question):
    """Answer ``question`` from retrieved sources, with numbered citations.

    Relies on the notebook-level ``retriever`` and ``attribution_prompt``.

    Args:
        question: The user question, used both for retrieval and in the prompt.

    Returns:
        The model's reply as a string.
    """
    # Fetch the documents and render them as a numbered source list
    sources = format_sources_with_citations(retriever.invoke(question))

    # LCEL pipeline: prompt -> chat model (temperature 0) -> plain string
    chain = attribution_prompt | Config().new_openai_like(temperature=0) | StrOutputParser()

    return chain.invoke({"question": question, "sources": sources})

In [None]:
# Example usage
# Runs retrieval and generation for one question and prints the cited answer
question = "How do transformer models work and what are some examples?"
attributed_answer = generate_attributed_response(question)
print(attributed_answer)

##### Self-consistency checking: ensuring factual accuracy

In [None]:
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI


def verify_response_accuracy(
    retrieved_docs: list[Document],
    generated_answer: str,
    llm: ChatOpenAI | None = None
) -> str:
    """
    Ask a language model to fact-check an answer against its source documents.

    Args:
        retrieved_docs: Documents whose ``page_content`` forms the reference context
        generated_answer: The answer text to be checked
        llm: Chat model that performs the check; when ``None``, one is built
            with ``Config().new_openai_like(temperature=0)``
    Returns:
        The model's reply as a plain string. The prompt asks for JSON, but the
        text is returned as-is, without being parsed.
    """
    # Default model; alternative: ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    checker = Config().new_openai_like(temperature=0) if llm is None else llm

    # Join the page contents, separated by blank lines, into one context block
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # Doubled braces in the JSON example are literal braces, not template variables
    verification_prompt = ChatPromptTemplate.from_template("""
    As a fact-checking assistant, verify whether the following answer is fully supported
    by the provided context. Identify any statements that are not supported or contradict the context.
    
    Context:
    {context}
    
    Answer to verify:
    {answer}
    
    Perform a detailed analysis with the following structure:
    1. List any factual claims in the answer
    2. For each claim, indicate whether it is:
       - Fully supported (provide the supporting text from context)
       - Partially supported (explain what parts lack support)
       - Contradicted (identify the contradiction)
       - Not mentioned in context
    3. Overall assessment: Is the answer fully grounded in the context?
    
    Return your analysis in JSON format with the following structure:
    {{
      "claims": [
        {{
          "claim": "The factual claim",
          "status": "fully_supported|partially_supported|contradicted|not_mentioned",
          "evidence": "Supporting or contradicting text from context",
          "explanation": "Your explanation"
        }}
      ],
      "fully_grounded": true|false,
      "issues_identified": ["List any specific issues"]
    }}
    """)

    # LCEL pipeline: fill the prompt, call the model, reduce the reply to a string
    verification_chain = verification_prompt | checker | StrOutputParser()

    return verification_chain.invoke({"context": context, "answer": generated_answer})

In [None]:
# Example usage
retrieved_docs = [
    Document(page_content="The transformer architecture was introduced in the paper 'Attention Is All You Need' by Vaswani et al. in 2017. It relies on self-attention mechanisms instead of recurrent or convolutional neural networks."),
    Document(page_content="BERT is a transformer-based model developed by Google that uses masked language modeling and next sentence prediction as pre-training objectives.")
]

generated_answer = "The transformer architecture was introduced by OpenAI in 2018 and uses recurrent neural networks. BERT is a transformer model developed by Google."

verification_result = verify_response_accuracy(retrieved_docs, generated_answer)
print(verification_result)

#### Corrective RAG
缺失完整示例代码

#### Agentic RAG

#### Choosing the right techniques

## Developing a corporate documentation chatbot

源码参见 src/chapter04/developing-a-corporate-documentation-chatbot 目录。

In [95]:
%uv pip install langgraph~=0.6 streamlit~=1.50

[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K[2mResolved [1m60 packages[0m [2min 87ms[0m[0m                                         [0m
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m6 packages[0m [2min 6ms[0m[0m                                 [0m
 [32m+[39m [1mlanggraph[0m[2m==0.6.7[0m
 [32m+[39m [1mlanggraph-checkpoint[0m[2m==2.1.1[0m
 [32m+[39m [1mlanggraph-prebuilt[0m[2m==0.6.4[0m
 [32m+[39m [1mlanggraph-sdk[0m[2m==0.2.9[0m
 [32m+[39m [1mormsgpack[0m[2m==1.10.0[0m
 [32m+[39m [1mxxhash[0m[2m==3.5.0[0m
Note: you may need to restart the kernel to use updated packages.


In [96]:
!.venv/bin/streamlit run src/chapter04/developing-a-corporate-documentation-chatbot/streamlit_app.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.17.0.8:8501[0m
[0m
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
  _warn_about_sha1_encoder()
[34m  Stopping...[0m
^C


## Troubleshooting RAG systems