# Advanced RAG pipeline with Multiple Data Source

In [1]:
# !pip install arxiv

This code sets up a powerful question-answering system that can handle queries about various topics by leveraging different tools:

1. Wikipedia for general knowledge queries.
2. A custom retriever for questions about LangSmith (a LangChain tool).
3. arXiv for queries about scientific papers.

The system uses OpenAI's GPT-3.5 model/HuggingFace Mistral Open Source model with function calling to create an agent that can understand the capabilities of these tools and decide which one to use for a given query. The AgentExecutor provides a high-level interface to interact with this agent, handling the complexities of tool selection and execution.

This setup demonstrates the power of combining large language models with structured tools and agents, enabling the creation of intelligent, multi-capable systems that can handle a wide range of information-retrieval and question-answering tasks.

In [2]:
# Import the WikipediaQueryRun tool from langchain_community.tools module.
# This tool allows querying Wikipedia for information.
from langchain_community.tools import WikipediaQueryRun

# Import the WikipediaAPIWrapper utility from langchain_community.utilities module.
# This wrapper provides methods to interact with the Wikipedia API.
from langchain_community.utilities import WikipediaAPIWrapper

In [3]:
# Create an instance of WikipediaAPIWrapper with custom settings.
# top_k_results=1 means it will return only the most relevant result.
# doc_content_chars_max=200 limits the content to the first 200 characters.
api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=200)

# Create a WikipediaQueryRun tool using the configured API wrapper.
# This tool will use the wrapper to query Wikipedia when invoked.
wiki = WikipediaQueryRun(api_wrapper=api_wrapper)

In [4]:
# Display the name of the Wikipedia tool. This is useful for identification in agent-based systems.
wiki.name

'wikipedia'

In [5]:
# Import WebBaseLoader for loading web pages, FAISS for vector storage,
# OllamaEmbeddings for text embeddings, and RecursiveCharacterTextSplitter for text splitting.
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Create a WebBaseLoader instance to load content from the LangSmith documentation.
loader = WebBaseLoader("https://docs.smith.langchain.com/")

# Load the web content into a list of documents.
docs = loader.load()

# Split the loaded documents into smaller chunks for better processing.
# chunk_size=1000 sets each chunk to about 1000 characters.
# chunk_overlap=200 allows 200 characters of overlap between chunks to maintain context.
documents = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(docs)

# Create a FAISS vector database from the document chunks using OpenAI embeddings.
# FAISS allows for fast similarity searches.
vectordb = FAISS.from_documents(documents, OllamaEmbeddings(model="gemma:2b"))

# Convert the FAISS vector database into a retriever.
# Retrievers provide a simple interface to fetch relevant documents.
retriever = vectordb.as_retriever()

# Display the retriever object.
retriever

VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000025428E43D90>)

In [6]:
# Import the create_retriever_tool function from langchain.tools.retriever.
# This function creates a tool that uses a retriever to find information.
from langchain.tools.retriever import create_retriever_tool

# Create a retriever tool using the FAISS retriever.
# The tool is named "langsmith_search" and has a description that guides its use.
retriever_tool = create_retriever_tool(
   retriever, "langsmith_search",
   "Search for information about LangSmith. For any questions about LangSmith, you must use this tool!"
)

In [7]:
retriever_tool.name

'langsmith_search'

In [8]:
## Arxiv Tool

# Import ArxivAPIWrapper and ArxivQueryRun for interacting with the arXiv API.
# arXiv is a repository of electronic preprints for scientific papers.
from langchain_community.utilities import ArxivAPIWrapper
from langchain_community.tools import ArxivQueryRun

# Create an ArxivAPIWrapper instance with custom settings.
# Similar to the Wikipedia wrapper, it limits results and content size.
arxiv_wrapper = ArxivAPIWrapper(top_k_results=1, doc_content_chars_max=200)

# Create an ArxivQueryRun tool using the configured arXiv wrapper.
arxiv = ArxivQueryRun(api_wrapper=arxiv_wrapper)

# Display the name of the arXiv tool.
arxiv.name

'arxiv'

In [9]:
# Create a list of tools that will be used by the agent.
tools = [wiki, arxiv, retriever_tool]

In [10]:
# Display the list of tools.
tools

[WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper(wiki_client=<module 'wikipedia' from 'c:\\Users\\rahul gupta\\.conda\\envs\\myenv\\Lib\\site-packages\\wikipedia\\__init__.py'>, top_k_results=1, lang='en', load_all_available_meta=False, doc_content_chars_max=200)),
 ArxivQueryRun(api_wrapper=ArxivAPIWrapper(arxiv_search=<class 'arxiv.Search'>, arxiv_exceptions=(<class 'arxiv.ArxivError'>, <class 'arxiv.UnexpectedEmptyPageError'>, <class 'arxiv.HTTPError'>), top_k_results=1, ARXIV_MAX_QUERY_LENGTH=300, continue_on_failure=False, load_max_docs=100, load_all_available_meta=False, doc_content_chars_max=200, arxiv_result=<class 'arxiv.Result'>)),
 Tool(name='langsmith_search', description='Search for information about LangSmith. For any questions about LangSmith, you must use this tool!', args_schema=<class 'langchain_core.tools.RetrieverInput'>, func=functools.partial(<function _get_relevant_documents at 0x00000254127CF9C0>, retriever=VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings

In [11]:
# Import load_dotenv to load environment variables from a .env file.
from dotenv import load_dotenv

# Load environment variables from a .env file in the current directory.
load_dotenv()

# Import the os module for interacting with the operating system.
import os

# Set the OPENAI_API_KEY environment variable using the value from the .env file.
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

# Import the ChatOpenAI model from langchain_openai.
from langchain_openai import ChatOpenAI
from langchain_community.llms import Ollama
# Initialize the ChatOpenAI model with specific parameters.
# model="gpt-3.5-turbo-0125" specifies the model version.
# temperature=0 makes the output more deterministic (less random).

# llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)


# Using opensource llm that can interact with agents
from langchain_community.llms import HuggingFaceHub

llm=HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature":0.1,"max_length":500}

)
llm = Ollama(model='gemma:2b')

  warn_deprecated(
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Import the hub module from langchain for accessing community-contributed resources.
from langchain import hub

# Pull a pre-defined prompt template from the LangChain hub.
# This prompt is designed for creating an OpenAI functions-based agent.
prompt = hub.pull("hwchase17/openai-functions-agent")

# Display the messages (structure) of the pulled prompt.
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [13]:
### Agents

# Import the create_openai_tools_agent function from langchain.agents.
# This function creates an agent that uses OpenAI's function calling API.
from langchain.agents import create_openai_tools_agent

# Create an OpenAI tools agent using the LLM, tools, and prompt.
# This agent can understand tool descriptions and decide which tool to use.
agent = create_openai_tools_agent(llm, tools, prompt)

In [17]:
## Agent Executer

# Import the AgentExecutor from langchain.agents.
# AgentExecutor is a higher-level interface for running agents.
from langchain.agents import AgentExecutor

# Create an AgentExecutor with the agent and tools.
# verbose=True means it will print out the agent's thought process.
agent_executor = AgentExecutor(agent=agent, tools=tools)

# Display the agent_executor object.
agent_executor

AgentExecutor(agent=RunnableMultiActionAgent(runnable=RunnableAssign(mapper={
  agent_scratchpad: RunnableLambda(lambda x: format_to_openai_tool_messages(x['intermediate_steps']))
})
| ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'openai-functions-agent', 'lc_hub_commit_hash': 'a1655024b06afbd95d

In [18]:
# Invoke the agent_executor with a query about a scientific paper.
# The agent will decide to use the arxiv tool to find and summarize this paper.
agent_executor.invoke({"input": "What's the paper 1605.08386 about?"})

ValueError: This output parser only works on ChatGeneration output

In [19]:
# Invoke the agent_executor with a query about LangSmith.
# The agent will decide to use the retriever_tool to answer this.
agent_executor.invoke({"input": "Tell me about Langsmith"})

ValueError: This output parser only works on ChatGeneration output