#Prepared by Tamal Acharya

In [None]:
# This tutorial demonstrates various Retrieval Augmented Generation (RAG) patterns
# using Python and open-source libraries.

# Ensure necessary libraries are installed
!pip install -q langchain-community langchain langchain-core chromadb transformers torch datasets

import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# --- Setup ---

# 1. Create a dummy text file for demonstration
dummy_text = """
Retrieval Augmented Generation (RAG) is a technique that enhances
the ability of language models to generate more accurate and informative
responses. It combines the power of pre-trained language models with
external knowledge retrieval systems.

The core idea is to first retrieve relevant documents or information chunks
from a knowledge base based on the user's query. These retrieved pieces of
information are then used as context for the language model to generate a
response. This approach helps mitigate the limitations of large language models,
such as hallucination and outdated information, by grounding the generation in
real-world data.

RAG has several benefits. It can provide more accurate and factual answers,
reduce hallucinations, and allow the model to access and utilize information
that was not present in its training data. It is particularly useful for
tasks requiring up-to-date information or domain-specific knowledge.

Common RAG patterns include Naive RAG, Advanced RAG (like Recursive Retrieval or
HyDE), and Agentic RAG. Each pattern offers different strategies for retrieval
and integration with the language model.
"""

# Write the sample corpus to disk. The encoding is pinned so that the write and
# the load below agree regardless of the platform's default locale.
with open("rag_info.txt", "w", encoding="utf-8") as f:
    f.write(dummy_text)

# 2. Load the document
loader = TextLoader("rag_info.txt", encoding="utf-8")
documents = loader.load()

# 3. Split the document into chunks
# CharacterTextSplitter only cuts on its separator ("\n\n" by default), so any
# paragraph longer than chunk_size comes back whole and chunk_size/chunk_overlap
# have no effect on it. RecursiveCharacterTextSplitter falls back to finer
# separators, so the 200/50 settings are actually honoured.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

# 4. Initialize the Embedding Model
# We'll use a common sentence transformer model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 5. Initialize the Vector Store (ChromaDB in this case)
# This stores the embedded chunks and allows for efficient similarity search
db = Chroma.from_documents(texts, embeddings, persist_directory="./chroma_db")

# 6. Initialize the Language Model (using HuggingFace transformers pipeline)
# We'll use a small causal language model for demonstration purposes
model_name = "distilbert/distilgpt2" # A smaller model for faster execution
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the pad token, matching the later setup cell of this notebook,
# so generation runs with an explicit pad id.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device=0 if torch.cuda.is_available() else -1, # Use GPU if available
    max_new_tokens=100, # Limit generated tokens for faster results
    do_sample=True,
    top_k=50,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id, # Set pad token id for pipeline
)

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# --- RAG Pattern 1: Naive RAG ---
# The most basic pattern: retrieve top-k documents and pass them directly
# as context to the language model.

print("\n--- Demonstrating Naive RAG ---")

# Create a retriever from the vector store
retriever = db.as_retriever(search_kwargs={"k": 2}) # Retrieve top 2 chunks

# Define the prompt template for RAG
# The prompt includes a placeholder for the retrieved context
template = """Use the following pieces of context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use a maximum of three sentences.

Context:
{context}

Question:
{question}

Helpful Answer:"""

prompt = ChatPromptTemplate.from_template(template)

# Create the RAG chain
# This chain combines retrieval, prompting, and language model inference.
# The chain is invoked with {"question": ...}, so the question string must be
# pulled out of that dict *before* it reaches the retriever; handing the
# retriever the whole dict would make it try to embed a dict. The retrieved
# documents are then joined into plain text so the prompt contains their
# content rather than the repr of a list of Document objects.
from operator import itemgetter

rag_chain = (
    {
        "context": itemgetter("question")
        | retriever
        | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": itemgetter("question"),
    }
    | prompt
    | llm
)

# Ask a question
question = "What is Retrieval Augmented Generation?"
print(f"\nQuestion: {question}")

# Invoke the RAG chain
response = rag_chain.invoke({"question": question})

# The HuggingFace pipeline often includes the prompt in the output,
# we need to extract the generated text.
# This part might need adjustment based on the specific LLM and pipeline output format.
# A common pattern is to split by the 'Helpful Answer:' part.
generated_text = response.split("Helpful Answer:")[-1].strip()
print(f"Generated Answer: {generated_text}")

# --- RAG Pattern 2: Hypothetical Document Embeddings (HyDE) ---
# An Advanced RAG pattern. Instead of embedding the query directly,
# HyDE first generates a hypothetical answer to the query. Then, it
# embeds this hypothetical answer and uses it for retrieval.

print("\n--- Demonstrating HyDE RAG ---")

# Create a simpler LLM chain to generate the hypothetical answer
# The template's only placeholder is {question}, so the chain is invoked with a
# dict carrying a "question" key. It ends with the "Hypothetical Answer:" marker,
# which HydeRetriever below splits on to discard any echoed prompt text.
hyde_prompt_template = """Please write a concise, hypothetical answer to the following question:

Question: {question}

Hypothetical Answer:"""
hyde_prompt = ChatPromptTemplate.from_template(hyde_prompt_template)
# Prompt piped straight into the LLM; the result is used as a string downstream.
hyde_chain = hyde_prompt | llm

# Define a custom retriever that uses HyDE
class HydeRetriever:
    """Retrieve documents by searching with an LLM-written hypothetical answer (HyDE).

    The collaborators are duck-typed: ``llm_chain`` must expose
    ``invoke({"question": ...})`` returning a string, and ``vectorstore`` must
    expose ``similarity_search(text, k=...)``.
    """

    def __init__(self, llm_chain, vectorstore, k=4):
        """Store the answer-generating chain, the vector store and the result count ``k``."""
        self.llm_chain = llm_chain
        self.vectorstore = vectorstore
        self.k = k

    def get_relevant_documents(self, query):
        """Return the ``k`` documents most similar to a hypothetical answer for ``query``.

        Args:
            query: The user question, either as a plain string or as a dict with
                a ``"question"`` key (the shape a chain invoked with
                ``{"question": ...}`` passes along).

        Returns:
            Whatever ``vectorstore.similarity_search`` returns for the search text.
        """
        # Accept the chain's input dict as well as a bare string, so the question
        # text (not the dict's repr) is what gets sent to the LLM.
        question = query["question"] if isinstance(query, dict) else query

        # Generate hypothetical answer
        raw_output = self.llm_chain.invoke({"question": question})
        # Keep only what follows the last marker, in case the prompt is echoed back.
        hypothetical_answer = raw_output.split("Hypothetical Answer:")[-1].strip()
        print(f"\nGenerated Hypothetical Answer for HyDE: {hypothetical_answer}")

        # An empty generation carries no signal to search with; fall back to the
        # original question instead of searching for an empty string.
        search_text = hypothetical_answer or question

        # Retrieve documents based on the hypothetical answer
        return self.vectorstore.similarity_search(search_text, k=self.k)

# Initialize the HyDE retriever
hyde_retriever = HydeRetriever(hyde_chain, db, k=2)

# Create the RAG chain using the HyDE retriever.
# The chain input is {"question": ...}; only the question string is handed to
# the retriever, and the retrieved documents are joined into plain text so the
# prompt receives their content rather than a list repr.
hyde_rag_chain = (
    {
        "context": lambda x: "\n\n".join(
            doc.page_content
            for doc in hyde_retriever.get_relevant_documents(x["question"])
        ),
        "question": lambda x: x["question"],
    }
    | prompt # Use the same answer generation prompt as Naive RAG
    | llm
)

# Ask a different question than the Naive RAG demo
question_hyde = "What are the benefits of RAG?"
print(f"\nQuestion: {question_hyde}")

# Invoke the HyDE RAG chain
response_hyde = hyde_rag_chain.invoke({"question": question_hyde})

# Extract the generated text (everything after the last answer marker)
generated_text_hyde = response_hyde.split("Helpful Answer:")[-1].strip()
print(f"Generated Answer (HyDE): {generated_text_hyde}")

# --- RAG Pattern 3: Agentic RAG (Conceptual) ---
# Agentic RAG involves an AI agent that dynamically decides whether to retrieve,
# which retrieval strategy to use, and how to integrate the retrieved information.
# Implementing a full agent is complex and often requires more sophisticated
# orchestrator frameworks (like LangGraph). This section provides a conceptual
# outline and a simplified example using Langchain's agent capabilities.

print("\n--- Conceptual Demonstrating Agentic RAG ---")

# For a full Agentic RAG implementation, you would typically:
# 1. Define tools the agent can use (e.g., a retrieval tool, a search engine tool, a calculator).
# 2. Define the agent's personality/instructions.
# 3. Use an agent executor framework (like Langchain's Agent Executor).

# Simplified conceptual example using a Langchain Agent (requires additional tools)
# !pip install -q wikipedia # Example tool if you want to try
# from langchain.agents import initialize_agent, AgentType, Tool

# We won't run this fully without more setup and tools, but here's the idea:
# agent_tools = [
#     Tool(
#         name="Vector Store Retriever",
#         func=db.as_retriever().get_relevant_documents,
#         description="Useful for retrieving information about RAG from the local knowledge base."
#     ),
#     # Add other tools like a general search tool, calculator, etc.
# ]

# agent = initialize_agent(
#     agent_tools,
#     llm,
#     agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
#     verbose=True # Set to True to see the agent's thought process
# )

# print("\nQuestion for Agentic RAG (Conceptual): What are common RAG patterns?")
# try:
#     agent_response = agent.run("What are common RAG patterns?")
#     print(f"Agent's Answer: {agent_response}")
# except Exception as e:
#     print(f"Could not run Agentic RAG example without full tool setup: {e}")
#     print("Agentic RAG involves a more complex orchestration layer where an agent decides how to answer the query, potentially using multiple tools including retrieval.")


print("\n--- Tutorial Complete ---")
print("This tutorial covered Naive RAG and a conceptual HyDE RAG implementation.")
print("Agentic RAG is a more advanced pattern often requiring AI agent frameworks.")

# --- Clean up ---
# Remove the dummy file and chroma db directory
# NOTE(review): the following notebook cell still passes `db` (persisted in
# ./chroma_db) to ParentDocumentRetriever and calls os.remove("rag_info.txt")
# again. Running this clean-up first removes both resources out from under it;
# confirm the intended cell order.
import shutil
os.remove("rag_info.txt")
if os.path.exists("./chroma_db"):
    shutil.rmtree("./chroma_db")
print("\nCleaned up generated files and directories.")



In [None]:
# --- RAG Pattern 4: Parent Document Retriever ---
# An Advanced RAG pattern. Instead of retrieving small chunks directly,
# this retriever retrieves larger "parent" documents based on the query's
# similarity to smaller "child" chunks. The language model then receives
# the full parent document as context, which can provide better context
# and coherence.

print("\n--- Demonstrating Parent Document Retriever RAG ---")

from operator import itemgetter

from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter

# We need to create smaller "child" chunks for embedding and retrieval.
# A recursive splitter is used because CharacterTextSplitter only cuts on
# "\n\n"; with it the 100-character children would come out identical to their
# parents, which defeats the pattern.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
child_texts = child_text_splitter.split_documents(documents)

# We need a document store to hold the parent documents
store = InMemoryStore()

# Dedicated in-memory collection for the child chunks. Reusing `db` would mix
# the children with the chunks indexed during setup (which are not linked to
# any parent), and the ./chroma_db directory backing `db` is deleted by the
# clean-up at the end of the previous cell.
child_vectorstore = Chroma(
    collection_name="parent_document_children",
    embedding_function=embeddings,
)

# Initialize the Parent Document Retriever
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=child_vectorstore,      # The vector store for searching child chunks
    docstore=store,                     # The store for retrieving parent documents
    child_splitter=child_text_splitter, # The splitter used for creating child chunks
    parent_splitter=text_splitter,      # The splitter used for creating parent documents (same as initial splitter)
)

# Add the documents to the retriever's stores
# This processes the documents, splits them into children, embeds children,
# stores children in the vectorstore, and stores parents in the docstore.
parent_document_retriever.add_documents(documents)

# Create the RAG chain using the Parent Document Retriever.
# The chain input is {"question": ...}; the question string is extracted before
# it reaches the retriever, and the parent documents are joined into plain text
# for the prompt.
parent_rag_chain = (
    {
        "context": itemgetter("question")
        | parent_document_retriever
        | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": itemgetter("question"),
    }
    | prompt # Use the same answer generation prompt
    | llm
)

# Ask a question
question_parent = "What is the core idea behind RAG?"
print(f"\nQuestion: {question_parent}")

# Invoke the Parent Document Retriever RAG chain
response_parent = parent_rag_chain.invoke({"question": question_parent})

# Extract the generated text
generated_text_parent = response_parent.split("Helpful Answer:")[-1].strip()
print(f"Generated Answer (Parent Document Retriever): {generated_text_parent}")

# --- RAG Pattern 5: Ensemble Retriever (Conceptual) ---
# An Advanced RAG pattern that combines the results of multiple different
# retrieval methods (e.g., vector search, keyword search, graph search).
# The results are then re-ranked or merged to get the final set of documents.
# This is typically done using a re-ranker or a fusion algorithm (like Reciprocal Rank Fusion).

print("\n--- Conceptual Demonstrating Ensemble Retriever RAG ---")

# Implementing a full Ensemble Retriever requires setting up multiple retrievers
# and a re-ranker or fusion mechanism.

# Conceptual Example:
# from langchain.retrievers import EnsembleRetriever
# from langchain.retrievers import BM25Retriever # Example of a keyword retriever

# # Initialize a keyword retriever
# bm25_retriever = BM25Retriever.from_documents(texts) # Uses the same text chunks as vectorstore
# bm25_retriever.k = 2 # Retrieve top 2

# # Initialize the vector store retriever
# vector_retriever = db.as_retriever(search_kwargs={"k": 2})

# # Initialize the Ensemble Retriever (without re-ranking for simplicity)
# ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, vector_retriever], weights=[0.5, 0.5])

# # Create the RAG chain using the Ensemble Retriever
# ensemble_rag_chain = (
#     {"context": ensemble_retriever.get_relevant_documents, "question": lambda x: x["question"]}
#     | prompt
#     | llm
# )

# print("\nQuestion for Ensemble RAG (Conceptual): What are the benefits of RAG?")
# try:
#     ensemble_response = ensemble_rag_chain.invoke({"question": "What are the benefits of RAG?"})
#     generated_text_ensemble = ensemble_response.split("Helpful Answer:")[-1].strip()
#     print(f"Generated Answer (Ensemble): {generated_text_ensemble}")
# except Exception as e:
#     print(f"Could not run Ensemble RAG example without full setup: {e}")
#     print("Ensemble RAG combines different retrieval methods to potentially improve recall.")


print("\n--- Tutorial Complete ---")
print("This tutorial covered Naive RAG, a conceptual HyDE RAG, Parent Document Retriever RAG, and conceptual Ensemble RAG.")
print("Agentic RAG and full Ensemble RAG implementations are more advanced patterns often requiring additional components or frameworks.")

# --- Clean up ---
# Remove the dummy file and chroma db directory.
# The clean-up at the end of the previous cell already deletes both, so each
# removal is guarded; an unconditional os.remove would raise FileNotFoundError.
if os.path.exists("rag_info.txt"):
    os.remove("rag_info.txt")
if os.path.exists("./chroma_db"):
    shutil.rmtree("./chroma_db")
print("\nCleaned up generated files and directories.")


In the context of RAG (Retrieval-Augmented Generation) systems, “RAG patterns” typically refer to the architectural or design patterns used to implement RAG pipelines. These patterns govern how the retrieval and generation steps are organized and interact with each other.

Here are the most common RAG patterns:

1. Retrieve-then-Read (Standard RAG)
Pattern:

Retrieve documents first based on the user query.

Feed them to a generative model (like GPT or LLaMA) to produce an answer.

Use Case: Most common and scalable RAG implementation.

2. Read-Retrieve-Read
Pattern:

First “read” the query to rewrite or enrich it (e.g., query expansion or rephrasing).

Then retrieve documents using the refined query.

Finally generate an answer based on retrieved content.

Use Case: Helps improve retrieval quality when original queries are ambiguous or under-specified.

3. Retrieve-Rank-Read
Pattern:

Retrieve a large set of candidate documents.

Rank them using a more sophisticated scorer (e.g., BERT-based cross-encoder).

Select top-k documents and pass to the generator.

Use Case: When retrieval quality is critical. Adds a second step to boost precision.

4. Multi-step RAG (Iterative Retrieval)
Pattern:

Run multiple rounds of retrieval and generation.

Each round refines the query based on the previous response.

Use Case: Complex tasks like multi-hop QA, research agents, or chain-of-thought reasoning.

5. Fusion-in-Decoder (FiD)
Pattern:

All retrieved documents are encoded independently, then fused at the decoder level (e.g., T5-based models).

Use Case: Used in models like FiD where the decoder integrates evidence from multiple sources simultaneously.

6. Hybrid RAG (Dense + Sparse Retrieval)
Pattern:

Combines both dense retrieval (e.g., using vector similarity) and sparse retrieval (e.g., BM25).

Merges or re-ranks results before feeding to the generation model.

Use Case: Improves coverage and recall of relevant documents.

7. Agentic RAG / Tool-augmented RAG
Pattern:

Uses an agent that can retrieve, reason, and plan.

The agent might invoke multiple retrievals, tools, or even external APIs before generating a final response.

Use Case: Complex task-solving, especially in enterprise AI assistants or copilots.

8. Retriever-Augmented Planning
Pattern:

Planner decomposes tasks into sub-tasks.

Each sub-task may involve retrieval + generation.

Use Case: Long-document QA, multi-turn conversations, or coding agents.

In [None]:
# Retrieve-then-Read (Standard RAG), Read-Retrieve-Read, Retrieve-Rank-Read,
# Multi-step RAG (Iterative Retrieval), Fusion-in-Decoder (FiD), Hybrid RAG (Dense + Sparse Retrieval)

import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import shutil
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_core.output_parsers import StrOutputParser
from typing import List
from langchain_core.documents import Document

# This tutorial demonstrates various Retrieval Augmented Generation (RAG) patterns
# using Python and open-source libraries.

# Ensure necessary libraries are installed
!pip install -q langchain-community langchain langchain-core chromadb transformers torch datasets rank_bm25

# --- Setup ---

# 1. Create a dummy text file for demonstration
dummy_text = """
Retrieval Augmented Generation (RAG) is a technique that enhances
the ability of language models to generate more accurate and informative
responses. It combines the power of pre-trained language models with
external knowledge retrieval systems.

The core idea is to first retrieve relevant documents or information chunks
from a knowledge base based on the user's query. These retrieved pieces of
information are then used as context for the language model to generate a
response. This approach helps mitigate the limitations of large language models,
such as hallucination and outdated information, by grounding the generation in
real-world data.

RAG has several benefits. It can provide more accurate and factual answers,
reduce hallucinations, and allow the model to access and utilize information
that was not present in its training data. It is particularly useful for
tasks requiring up-to-date information or domain-specific knowledge.

Common RAG patterns include Naive RAG, Advanced RAG (like Recursive Retrieval or
HyDE), and Agentic RAG. Other patterns include Retrieve-Rank-Read, Multi-step
or Iterative Retrieval, Fusion-in-Decoder (FiD), and Hybrid RAG combining
dense and sparse retrieval. Each pattern offers different strategies for retrieval
and integration with the language model.
"""

# Write the sample corpus to disk. The encoding is pinned so that the write and
# the load below agree regardless of the platform's default locale.
with open("rag_info.txt", "w", encoding="utf-8") as f:
    f.write(dummy_text)

# 2. Load the document
loader = TextLoader("rag_info.txt", encoding="utf-8")
documents = loader.load()

# 3. Split the document into chunks
# CharacterTextSplitter only cuts on its separator ("\n\n" by default), so any
# paragraph longer than chunk_size comes back whole and chunk_size/chunk_overlap
# have no effect on it. RecursiveCharacterTextSplitter falls back to finer
# separators, so the 200/50 settings are actually honoured.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

# 4. Initialize the Embedding Model
# We'll use a common sentence transformer model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 5. Initialize the Vector Store (ChromaDB in this case)
# This stores the embedded chunks and allows for efficient similarity search
db = Chroma.from_documents(texts, embeddings, persist_directory="./chroma_db")

# 6. Initialize the Language Model (using HuggingFace transformers pipeline)
# We'll use a small causal language model for demonstration purposes
model_name = "distilbert/distilgpt2" # A smaller model for faster execution
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token # Needed for some models with pipeline
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device=0 if torch.cuda.is_available() else -1, # Use GPU if available
    max_new_tokens=100, # Limit generated tokens for faster results
    do_sample=True,
    top_k=50,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id # Set pad token id for pipeline
)

llm = HuggingFacePipeline(pipeline=pipe)

# Define the standard RAG prompt template
template = """Use the following pieces of context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use a maximum of three sentences.

Context:
{context}

Question:
{question}

Helpful Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# Helper function to extract text from pipeline response
def extract_generated_text(response):
    """Pull the answer portion out of a raw text-generation response.

    If the response contains the "Helpful Answer:" marker, everything after the
    last occurrence is returned, stripped. Otherwise the response is scanned
    from the bottom up and the last run of non-empty lines that do not start
    with "Context:" or "Question:" is returned, each line stripped and the run
    joined with newlines. Skippable lines below that run are ignored.
    """
    marker = "Helpful Answer:"
    if marker in response:
        return response.rpartition(marker)[2].strip()

    # No marker: walk backwards and collect the trailing block of answer lines.
    collected = []
    for raw_line in reversed(response.strip().split("\n")):
        candidate = raw_line.strip()
        looks_like_answer = bool(candidate) and not candidate.startswith(
            ("Context:", "Question:")
        )
        if looks_like_answer:
            collected.append(candidate)
            continue
        if collected:
            # First blank/prompt line above the answer block ends the scan.
            break

    collected.reverse()
    return "\n".join(collected).strip()


# --- RAG Pattern 1: Retrieve-then-Read (Standard RAG) ---
# The most basic pattern: retrieve top-k documents and pass them directly
# as context to the language model.

print("\n--- Demonstrating Retrieve-then-Read (Standard RAG) ---")

# Create a retriever from the vector store
retriever = db.as_retriever(search_kwargs={"k": 2}) # Retrieve top 2 chunks

# Create the RAG chain.
# The retrieved documents are joined into plain text so the prompt contains
# their content rather than the repr of a list of Document objects, and the
# final step runs the helper defined above so only the text after
# "Helpful Answer:" is reported when the pipeline echoes the prompt.
rag_chain = (
    {
        "context": retriever
        | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(), # Use RunnablePassthrough for question
    }
    | prompt
    | llm
    | StrOutputParser() # Parse the output to string
    | extract_generated_text
)

# Ask a question
question_std = "What is Retrieval Augmented Generation?"
print(f"\nQuestion: {question_std}")

# Invoke the RAG chain
response_std = rag_chain.invoke(question_std)
print(f"Generated Answer (Standard RAG): {response_std}")


# --- RAG Pattern 2: Read-Retrieve-Read (Query Expansion/Rewriting) ---
# First "read" the query to potentially rewrite or expand it using an LLM,
# then use the expanded query for retrieval. The final "read" is the generation step.

print("\n--- Demonstrating Read-Retrieve-Read (Query Rewriting) ---")

# LLM Chain to rewrite or expand the query
query_rewrite_prompt_template = """You are a helpful assistant that rewrites user questions to improve information retrieval.
Rewrite the following question to make it more effective for searching a document database:

Question: {question}

Rewritten Question:"""
query_rewrite_prompt = ChatPromptTemplate.from_template(query_rewrite_prompt_template)
query_rewrite_chain = query_rewrite_prompt | llm | StrOutputParser()

# Custom Retriever that uses the rewritten query
class RewritingRetriever:
    """Retrieve documents using an LLM-rewritten version of the user's question.

    The collaborators are duck-typed: ``rewrite_chain`` must expose
    ``invoke({"question": ...})`` returning a string, and ``vectorstore`` must
    expose ``similarity_search(text, k=...)``.
    """

    def __init__(self, rewrite_chain, vectorstore, k=4):
        """Store the rewriting chain, the vector store and the result count ``k``."""
        self.rewrite_chain = rewrite_chain
        self.vectorstore = vectorstore
        self.k = k

    def get_relevant_documents(self, query):
        """Rewrite ``query`` with the LLM and return the ``k`` most similar documents.

        Args:
            query: The original user question as a string.

        Returns:
            Whatever ``vectorstore.similarity_search`` returns for the rewritten
            query, or for the original query when the rewrite comes back empty.
        """
        # Rewrite the query
        raw_output = self.rewrite_chain.invoke({"question": query})
        # Keep only what follows the last marker, in case the prompt is echoed back.
        rewritten_query = raw_output.split("Rewritten Question:")[-1].strip()
        if not rewritten_query:
            # An empty rewrite would search for an empty string; use the
            # original question instead.
            rewritten_query = query
        print(f"\nOriginal Question: {query}")
        print(f"Rewritten Question for Retrieval: {rewritten_query}")
        # Retrieve documents using the rewritten query
        return self.vectorstore.similarity_search(rewritten_query, k=self.k)

# Initialize the Rewriting Retriever
rewriting_retriever = RewritingRetriever(query_rewrite_chain, db, k=2)

# Create the RAG chain using the Rewriting Retriever.
# The retrieved documents are joined into plain text for the prompt, and the
# last step runs the helper defined above so only the text after
# "Helpful Answer:" is reported when the pipeline echoes the prompt.
read_retrieve_read_chain = (
    {
        "context": lambda q: "\n\n".join(
            doc.page_content
            for doc in rewriting_retriever.get_relevant_documents(q)
        ),
        "question": RunnablePassthrough(),
    }
    | prompt # Use the standard answer generation prompt
    | llm
    | StrOutputParser()
    | extract_generated_text
)

# Ask a question (maybe one that could benefit from rewriting)
question_rewrite = "Tell me about RAG's good points."
print(f"\nQuestion: {question_rewrite}")

# Invoke the Read-Retrieve-Read RAG chain
response_rewrite = read_retrieve_read_chain.invoke(question_rewrite)
print(f"Generated Answer (Read-Retrieve-Read): {response_rewrite}")


# --- RAG Pattern 3: Retrieve-Rank-Read ---
# Retrieve a larger set of documents, then re-rank them to select the most relevant
# before passing to the language model. This requires a separate re-ranking step.
# For simplicity, we'll stand in for ranking by simply retrieving a wider set of chunks.
# A proper implementation would use a cross-encoder re-ranker.

print("\n--- Demonstrating Retrieve-Rank-Read (Conceptual) ---")

# This requires a re-ranker model, which we won't set up fully here.
# A conceptual idea often involves retrieving more documents initially
# and potentially using a different method or a dedicated re-ranking model
# to select the final set.

# Conceptual simulation: Retrieve more docs initially
retriever_wide = db.as_retriever(search_kwargs={"k": 4}) # Retrieve top 4 chunks

# In a real Retrieve-Rank-Read, you'd insert a re-ranking step here
# For example (the cross-encoder wrapper lives in langchain_community, not in transformers):
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import CrossEncoderReranker
# from langchain_community.cross_encoders import HuggingFaceCrossEncoder
#
# reranker_model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-2-v2") # Example re-ranker
# compressor = CrossEncoderReranker(model=reranker_model, top_n=2) # Rank and select top 2
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever_wide)

# For this demo, we'll just use the wider retrieval as a proxy.
retrieve_rank_read_retriever = retriever_wide # Using the wider retriever

# Create the RAG chain using the (conceptually ranked) wider retrieval.
# The retrieved documents are joined into plain text for the prompt, and the
# last step runs the helper defined above so only the text after
# "Helpful Answer:" is reported when the pipeline echoes the prompt.
retrieve_rank_read_chain = (
    {
        "context": retrieve_rank_read_retriever
        | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(),
    }
    | prompt # Use the standard answer generation prompt
    | llm
    | StrOutputParser()
    | extract_generated_text
)

# Ask a question
question_rank = "What are the various RAG patterns mentioned?"
print(f"\nQuestion: {question_rank}")

# Invoke the Retrieve-Rank-Read RAG chain (conceptual)
response_rank = retrieve_rank_read_chain.invoke(question_rank)
print(f"Generated Answer (Retrieve-Rank-Read Conceptual): {response_rank}")


# --- RAG Pattern 4: Multi-step RAG (Iterative Retrieval) ---
# An Advanced RAG pattern where the model can perform multiple retrieval steps
# to refine its understanding or gather more information needed to answer a query.
# This often involves an agent or a carefully designed chain that uses the output
# of a previous step to inform the next retrieval.

print("\n--- Demonstrating Multi-step RAG (Iterative Retrieval - Conceptual) ---")

# Implementing true multi-step RAG is complex and typically involves agents or
# custom looping logic where the LLM decides if it needs more information
# and formulates a new query based on the current context and the previous output.

# Conceptual idea: A simple chain that first gets initial docs, then
# potentially asks a clarifying question or performs a second retrieval.
# This is difficult to implement generatively with a basic pipeline setup.

# For demonstration, we can simulate a simple two-step process:
# 1. Initial Retrieval and Generation attempt.
# 2. (Conceptual): If the first answer is insufficient (e.g., short, says it doesn't know),
#    reformulate the question or retrieve differently.

# A simplified iterative approach might look like this (requires more LLM guidance):
# Initial response chain (same as standard RAG)
# first_step_chain = (
#     {"context": db.as_retriever(search_kwargs={"k": 1}), "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )
#
# print("\nInitial attempt at Multi-step RAG:")
# initial_response = first_step_chain.invoke("Describe the limitations of RAG.")
# print(f"First step response: {initial_response}")
#
# # Now, conceptually, you'd analyze `initial_response`. If it's not good,
# # you might use an LLM to decide to retrieve more or ask a follow-up.
# # Example: If initial_response is "I don't know", trigger a different retrieval or query.
#
# # This loop/conditional logic is the core of iterative RAG.
# # It requires a more sophisticated control flow than simple chain sequencing.
# # LangGraph or similar frameworks are well-suited for this.

print("Multi-step RAG typically requires agents or complex state management.")
print("Demonstration here is conceptual and does not fully implement iterative retrieval.")

# --- RAG Pattern 5: Fusion-in-Decoder (FiD) ---
# This is an architecture pattern rather than a pipeline pattern. It uses models
# specifically designed to handle multiple input documents by encoding them
# independently and then fusing this information in the decoder.
# This requires a FiD-architecture model (like T5-based FiD).

print("\n--- Demonstrating Fusion-in-Decoder (FiD) ---")

# Implementing FiD requires using a model with the FiD architecture,
# and preparing the input data by concatenating the prompt and multiple
# document contexts in a specific format the model expects.
# We are using a simple causal LLM (distilgpt2), which is not a FiD model.

# To implement FiD, you would need to:
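# 1. Load a FiD model (e.g., using `transformers.AutoModelForSeq2SeqLM.from_pretrained(...)` with a FiD checkpoint; "google/t5-small-fid" is an illustrative name, so verify that a real checkpoint exists before relying on it).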
# 2. Prepare the input by formatting the query and each retrieved document.
# 3. Pass this specially formatted input to the FiD model.

print("Demonstration requires a FiD model, which is not used in this setup (using distilgpt2).")
print("FiD integrates multiple documents at the model's decoder level.")


# --- RAG Pattern 6: Hybrid RAG (Dense + Sparse Retrieval) ---
# Combines different retrieval methods, typically dense (vector search) and
# sparse (keyword search like BM25). The results are often merged or re-ranked.

print("\n--- Demonstrating Hybrid RAG (Dense + Sparse Retrieval) ---")

# Initialize a dense retriever (from our vector store)
dense_retriever = db.as_retriever(search_kwargs={"k": 2})

# Initialize a sparse retriever (BM25)
bm25_retriever = BM25Retriever.from_documents(texts)
bm25_retriever.k = 2

# Initialize an Ensemble Retriever to combine the results
# Weights determine the contribution of each retriever. Reciprocal Rank Fusion
# is a common method for merging results from different retrievers.
ensemble_retriever = EnsembleRetriever(retrievers=[dense_retriever, bm25_retriever], weights=[0.5, 0.5])

# Create the RAG chain using the Ensemble Retriever.
# The merged documents are joined into plain text for the prompt, and the last
# step runs the helper defined above so only the text after "Helpful Answer:"
# is reported when the pipeline echoes the prompt.
hybrid_rag_chain = (
    {
        "context": ensemble_retriever
        | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(),
    }
    | prompt # Use the standard answer generation prompt
    | llm
    | StrOutputParser()
    | extract_generated_text
)

# Ask a question
question_hybrid = "What is RAG and why is it beneficial?"
print(f"\nQuestion: {question_hybrid}")

# Invoke the Hybrid RAG chain
response_hybrid = hybrid_rag_chain.invoke(question_hybrid)
print(f"Generated Answer (Hybrid RAG): {response_hybrid}")


# --- RAG Pattern 4 (Revisited): Parent Document Retriever ---
# (Moved down to group with Advanced RAG patterns)
# An Advanced RAG pattern. Instead of retrieving small chunks directly,
# this retriever retrieves larger "parent" documents based on the query's
# similarity to smaller "child" chunks. The language model then receives
# the full parent document as context, which can provide better context
# and coherence.

print("\n--- Demonstrating Parent Document Retriever RAG ---")

# Splitter that produces the small "child" chunks used for embedding and
# similarity search. The retriever applies it itself inside `add_documents`,
# so the documents are not pre-split here.
child_text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)

# Fresh document store that will hold the parent documents.
store = InMemoryStore()

# Dedicated vector store for the child chunks.
# The shared `db` already contains embeddings of `texts`; those entries carry
# no parent-id metadata, so if the retriever searched `db` they could occupy
# the top-k similarity hits and then be discarded, leaving few or no parents.
# A separate collection also keeps this demo from mutating the store used by
# the earlier patterns. The embedding model is reused from `db`.
child_vectorstore = Chroma(
    collection_name="parent_document_children",
    embedding_function=db.embeddings,
)

# Initialize the Parent Document Retriever
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=child_vectorstore,       # Searched for child chunks
    docstore=store,                      # Looked up for the matching parent documents
    child_splitter=child_text_splitter,  # Creates the child chunks
    parent_splitter=text_splitter,       # Creates the parent documents (same as initial splitter)
)

# Add the *original* documents to the retriever's stores. This splits them
# into parents and children, embeds the children into `child_vectorstore`,
# and stores the parents in the docstore.
parent_document_retriever.add_documents(documents)

# Create the RAG chain using the Parent Document Retriever
parent_rag_chain = (
    {"context": parent_document_retriever, "question": RunnablePassthrough()}
    | prompt  # Use the same answer generation prompt
    | llm
    | StrOutputParser()
)

# Ask a question
question_parent = "What is the core idea behind RAG?"
print(f"\nQuestion: {question_parent}")

# Invoke the Parent Document Retriever RAG chain
response_parent = parent_rag_chain.invoke(question_parent)
print(f"Generated Answer (Parent Document Retriever): {response_parent}")

# --- RAG Pattern 7 & 8: Agentic RAG / Retriever-Augmented Planning ---
# Both patterns need more orchestration than a basic sequential chain offers:
# typically an AI agent decides when and how to use retrieval, possibly over
# several steps or as part of a planning process. They are therefore described
# here rather than implemented.

# Conceptual notes on the agentic / planning patterns (one output line each).
print(
    "\n--- Conceptual Demonstrating Agentic RAG / Retriever-Augmented Planning ---",
    "These patterns involve complex AI agents and planning frameworks (e.g., LangGraph).",
    "Implementing them requires setting up tools for the agent and defining its decision-making process.",
    "They are used for complex tasks, multi-turn interactions, and dynamic use of tools including retrieval.",
    sep="\n",
)

# Closing summary: which patterns were implemented and which were only outlined.
print(
    "\n--- Tutorial Complete ---",
    "This tutorial demonstrated implementations of Retrieve-then-Read, Read-Retrieve-Read (Query Rewriting),",
    "Hybrid RAG, and Parent Document Retriever RAG.",
    "Conceptual outlines were provided for Retrieve-Rank-Read, Multi-step/Iterative RAG, Fusion-in-Decoder,",
    "and Agentic/Planning RAG due to their increased complexity or specific model requirements.",
    sep="\n",
)


# --- Clean up ---
# Remove the dummy file and chroma db directory created by this tutorial.
# Both removals are guarded by an existence check so that re-running this
# step, or running it after an earlier step failed before creating the
# artifacts, does not raise FileNotFoundError.
if os.path.exists("rag_info.txt"):
    os.remove("rag_info.txt")
if os.path.exists("./chroma_db"):
    shutil.rmtree("./chroma_db")
print("\nCleaned up generated files and directories.")