# Multi-document Agentic RAG using Llama-Index and Mistral


In [9]:
%%writefile requirements.txt
llama-index
llama-index-llms-huggingface
llama-index-embeddings-fastembed
fastembed
Unstructured[md]
chromadb
llama-index-vector-stores-chroma
llama-index-llms-groq
einops
accelerate
sentence-transformers
llama-index-llms-mistralai
llama-index-llms-openai

Overwriting requirements.txt


In [10]:
!pip install -r requirements.txt



In [11]:
!pip install wget
import wget

!wget --version

zsh:1: command not found: wget


## Download files to process

In [4]:
import wget

!mkdir -p data
# Download the files
!wget "https://arxiv.org/pdf/1810.04805.pdf" -O data/BERT_arxiv.pdf
!wget "https://arxiv.org/pdf/2005.11401.pdf" -O data/RAG_arxiv.pdf
!wget "https://arxiv.org/pdf/2310.11511.pdf" -O data/self_rag_arxiv.pdf
!wget "https://arxiv.org/pdf/2401.15884.pdf" -O data/crag_arxiv.pdf

zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget


In [5]:
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,SummaryIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool,QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters,FilterCondition
from typing import List,Optional

In [6]:
import  nest_asyncio
nest_asyncio.apply()

In [12]:
documents = SimpleDirectoryReader(input_files = ['data/situationalawareness.pdf']).load_data()
print(len(documents))
print(f"Document Metadata: {documents[0].metadata}")

165
Document Metadata: {'page_label': '1', 'file_name': 'situationalawareness.pdf', 'file_path': 'data/situationalawareness.pdf', 'file_type': 'application/pdf', 'file_size': 21371840, 'creation_date': '2024-08-18', 'last_modified_date': '2024-08-18'}


## Split the documents into chunks/nodes


In [13]:
splitter = SentenceSplitter(chunk_size=1024,chunk_overlap=100)
nodes = splitter.get_nodes_from_documents(documents)
print(f"Length of nodes : {len(nodes)}")
print(f"get the content for node 0 :{nodes[0].get_content(metadata_mode='all')}")

Length of nodes : 165
get the content for node 0 :page_label: 1
file_name: situationalawareness.pdf
file_path: data/situationalawareness.pdf
file_type: application/pdf
file_size: 21371840
creation_date: 2024-08-18
last_modified_date: 2024-08-18

Leopold Aschenbrenner
S I T U AT I O N A L AWA R E N E S S
The Decade Ahead
JUNE 2024


## Instantitate the vectorstore

In [14]:
import chromadb
db = chromadb.PersistentClient(path="./chroma_db_mistral")
chroma_collection = db.get_or_create_collection("multidocument-agent")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [15]:
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings
#
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
#
Settings.embed_model = embed_model
#
Settings.chunk_size = 1024
#

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

In [16]:
import os
from llama_index.llms.openai import OpenAI

# Retrieve OpenAI API key from environment variable
api_key = os.getenv("OPENAI_API_KEY")

if api_key is None:
    raise ValueError("OpenAI API key not found in environment variables.")

# Set the OpenAI API key
os.environ["OPENAI_API_KEY"] = api_key

# Instantiate the OpenAI model
llm = OpenAI(model="gpt-4")  # Use the appropriate model here

In [17]:
#instantiate Vectorstore
name = "nuclear-series-final"
vector_index = VectorStoreIndex(nodes,storage_context=storage_context)
vector_index.storage_context.vector_store.persist(persist_path="/content/chroma_db")
#
# Define Vectorstore Autoretrieval tool
def vector_query(query:str,page_numbers:Optional[List[str]]=None)->str:
  '''
  perform vector search over index on
  query(str): query string needs to be embedded
  page_numbers(List[str]): list of page numbers to be retrieved,
                          leave blank if we want to perform a vector search over all pages
  '''
  page_numbers = page_numbers or []
  metadata_dict = [{"key":'page_label',"value":p} for p in page_numbers]
  #
  query_engine = vector_index.as_query_engine(similarity_top_k =2,
                                              filters = MetadataFilters.from_dicts(metadata_dict,
                                                                                    condition=FilterCondition.OR)
                                              )
  #
  response = query_engine.query(query)
  return response
#
#llamiondex FunctionTool wraps any python function we feed it
vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}",
                                              fn=vector_query)
# Prepare Summary Tool
summary_index = SummaryIndex(nodes)
summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize",
                                                      se_async=True,)
summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}",
                                                    query_engine=summary_query_engine,
                                                  description=("Use ONLY IF you want to get a holistic summary of the documents."
                                              "DO NOT USE if you have specified questions over the documents."))

In [18]:
response = llm.predict_and_call([vector_query_tool],
                                "Summarize the contentin this document",
                                verbose=True)

ValueError: Expected at least one tool call, but got 0 tool calls.

In [19]:
def get_doc_tools(file_path:str,name:str)->str:
  '''
  get vector query and sumnmary query tools from a document
  '''
  #load documents
  documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
  print(f"length of nodes")
  splitter = SentenceSplitter(chunk_size=1024,chunk_overlap=100)
  nodes = splitter.get_nodes_from_documents(documents)
  print(f"Length of nodes : {len(nodes)}")
  #instantiate Vectorstore
  vector_index = VectorStoreIndex(nodes,storage_context=storage_context)
  vector_index.storage_context.vector_store.persist(persist_path="/content/chroma_db")
  #
  # Define Vectorstore Autoretrieval tool
  def vector_query(query:str,page_numbers:Optional[List[str]]=None)->str:
    '''
    perform vector search over index on
    query(str): query string needs to be embedded
    page_numbers(List[str]): list of page numbers to be retrieved,
                            leave blank if we want to perform a vector search over all pages
    '''
    page_numbers = page_numbers or []
    metadata_dict = [{"key":'page_label',"value":p} for p in page_numbers]
    #
    query_engine = vector_index.as_query_engine(similarity_top_k =2,
                                                filters = MetadataFilters.from_dicts(metadata_dict,
                                                                                     condition=FilterCondition.OR)
                                                )
    #
    response = query_engine.query(query)
    return response
  #
  #llamiondex FunctionTool wraps any python function we feed it
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}",
                                                fn=vector_query)
  # Prepare Summary Tool
  summary_index = SummaryIndex(nodes)
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize",
                                                       se_async=True,)
  summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}",
                                                     query_engine=summary_query_engine,
                                                    description=("Use ONLY IF you want to get a holistic summary of the documents."
                                                "DO NOT USE if you have specified questions over the documents."))
  return vector_query_tool,summary_query_tool


In [21]:
import os
root_path = "data"
file_name = []
file_path = []
for file in os.listdir(root_path):
  if file.endswith(".pdf"):
    file_name.append(file.split(".")[0])
    file_path.append(os.path.join(root_path,file))
#
print(file_name)
print(file_path)

['situationalawareness']
['data/situationalawareness.pdf']


In [22]:
papers_to_tools_dict = {}
for name,filename in zip(file_name,file_path):
  vector_query_tool,summary_query_tool = get_doc_tools(filename,name)
  papers_to_tools_dict[name] = [vector_query_tool,summary_query_tool]


length of nodes
Length of nodes : 165


In [25]:
initial_tools = [t for f in file_name for t in papers_to_tools_dict[f]]
initial_tools

[<llama_index.core.tools.function_tool.FunctionTool at 0x148d6a3f0>,
 <llama_index.core.tools.query_engine.QueryEngineTool at 0x1514d81d0>]

In [24]:
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex
#
obj_index = ObjectIndex.from_objects(initial_tools,index_cls=VectorStoreIndex)
#

In [26]:
obj_retriever = obj_index.as_retriever(similarity_top_k=2)
tools = obj_retriever.retrieve("summary of the nuclear industry")
#
print(tools[0].metadata)
print(tools[1].metadata)

ToolMetadata(description='Use ONLY IF you want to get a holistic summary of the documents.DO NOT USE if you have specified questions over the documents.', name='summary_tool_situationalawareness', fn_schema=<class 'llama_index.core.tools.types.DefaultToolFnSchema'>, return_direct=False)
ToolMetadata(description='vector_tool_situationalawareness(query: str, page_numbers: Optional[List[str]] = None) -> str\n\n    perform vector search over index on\n    query(str): query string needs to be embedded\n    page_numbers(List[str]): list of page numbers to be retrieved,\n                            leave blank if we want to perform a vector search over all pages\n    ', name='vector_tool_situationalawareness', fn_schema=<class 'pydantic.v1.main.vector_tool_situationalawareness'>, return_direct=False)


In [27]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner
#
agent_worker = FunctionCallingAgentWorker.from_tools(tool_retriever=obj_retriever,
                                                     llm=llm,
                                                     system_prompt="""You are an agent designed to answer queries over a set of given documents.
                                                     Please always use the tools provided to answer a question.Do not rely on prior knowledge.""",
                                                     verbose=True)
agent = AgentRunner(agent_worker)

In [28]:
#
response = agent.query("Please summarize this document")
print(str(response))

Added user message to memory: Please summarize this document
=== Calling Function ===
Calling function: summary_tool_situationalawareness with args: {"input": "document"}
=== Function Output ===
The document extensively discusses the advancements in AI technology, particularly focusing on language models like GPT-2, GPT-3, and GPT-4. It highlights the significant progress made in algorithmic efficiencies, compute scaling, and "unhobbling" techniques to enhance the capabilities of these models. The text also delves into the potential future developments, such as enabling models to use computers, improving test-time compute capabilities, and transitioning from chatbots to more advanced agents or drop-in remote workers. The document predicts another substantial leap in AI capabilities by the end of 2027, building upon the advancements seen from GPT-2 to GPT-4.
=== LLM Response ===
The document discusses the advancements in AI technology, particularly focusing on language models like GPT-2

In [32]:
response = agent.query("What about UBI?")
print(str(response))

Added user message to memory: What about UBI?
=== Calling Function ===
Calling function: vector_tool_situationalawareness with args: {"query": "UBI"}
=== Function Output ===
Universal Basic Income (UBI) is a concept that involves providing all citizens with a regular, unconditional sum of money, without any means test or work requirement.
=== LLM Response ===
Universal Basic Income (UBI) is a concept that involves providing all citizens with a regular, unconditional sum of money, without any means test or work requirement.
Universal Basic Income (UBI) is a concept that involves providing all citizens with a regular, unconditional sum of money, without any means test or work requirement.
