In [1]:
import os

from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment before any
# client is constructed (presumably the API keys used by later cells -- confirm).
# The trailing ";" keeps the cell silent, as it was originally.
load_dotenv();

In [2]:
# !pip install llama-parse llama-index llama-index-postprocessor-sbert-rerank
# NOTE(review): later cells also import `dotenv`, `requests`, `nest_asyncio` and
# `llama_index.postprocessor.flag_embedding_reranker`, which this line does not install.
# Presumably python-dotenv, requests, nest_asyncio and
# llama-index-postprocessor-flag-embedding-reranker are needed too -- TODO confirm names and pin versions.


# Set the LLM and embedding model

In [3]:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Register the notebook-wide defaults on the global Settings object:
# a chat model for generation and an OpenAI embedding model for vectors.
Settings.llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)

# Download the test sample file

For parsing, let's use a recent [paper](https://huggingface.co/papers/2403.09611) on Multi-Modal pretraining.

In [5]:
import requests

url = "https://arxiv.org/pdf/2403.09611.pdf"

# Bound the wait and fail loudly on HTTP errors, so that an error page or a
# hung connection never ends up silently saved as "paper.pdf".
response = requests.get(url, timeout=60)
response.raise_for_status()

# Write the raw PDF bytes next to the notebook for the parser to pick up.
with open("paper.pdf", "wb") as file:
    file.write(response.content)

# Parse the document
LlamaParse can be told to skip content we don't want; for this paper, the references section would just add noise to a RAG system. The parser below keeps the default settings and only requests Markdown output.

### Note: this would give you one document per page



In [4]:
from llama_parse import LlamaParse

# Build the parsing client; the only option set is Markdown output, which the
# Markdown-based node parser further down consumes.
parser = LlamaParse(result_type="markdown")

In [5]:
# Parse "paper.pdf" through the LlamaParse client asynchronously.
# Top-level `await` relies on the notebook kernel's running event loop.
documents = await parser.aload_data("paper.pdf")

Started parsing the file under job_id b990ae15-4bac-4752-af38-4fe18ffa3c2e


In [None]:
# How many documents did the parser return?
len(documents)

In [None]:
# Preview only the first parsed document: displaying the whole list would dump
# the full text of every page into the cell output. Slicing (rather than
# indexing) also stays safe if the parser returned nothing.
documents[:1]

In [6]:
import nest_asyncio
from llama_index.core.node_parser import MarkdownElementNodeParser, SentenceSplitter

# Patch asyncio before parsing: the node parser below is configured with several
# workers and presumably drives its own async jobs inside the notebook's loop.
nest_asyncio.apply()

# Explicitly extract tables (alongside the text) with the MarkdownElementNodeParser.
node_parser = MarkdownElementNodeParser(num_workers=8)
nodes = node_parser.get_nodes_from_documents(documents)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, 1994.91it/s]
0it [00:00, ?it/s]
2it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
3it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
2it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [None]:
# Number of nodes produced by the element node parser.
len(nodes)

In [None]:
# Split the parser output into plain nodes and separate `objects`.
# NOTE(review): `nodes` is both the input and the output here, so re-running this
# cell feeds the already-separated list back in -- presumably `objects` would then
# come back empty. Run it exactly once after the parsing cell, or confirm.
nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [None]:
# Number of nodes left after the objects were separated out.
len(nodes)

In [None]:
# Inspect the first node.
nodes[0]

In [None]:
# Number of objects returned by get_nodes_and_objects.
len(objects)

# Chain splitters to ensure chunk size requirements are met

In [22]:

# Second pass: run the parsed nodes through a sentence-aware splitter configured
# with chunk_size=512 and chunk_overlap=20.
sentence_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
re_split_nodes = sentence_splitter.get_nodes_from_documents(nodes)

In [None]:
# Number of chunks after the re-split.
len(re_split_nodes)

In [None]:
# Inspect the first re-split chunk.
re_split_nodes[0]

In [None]:
# Inspect the second re-split chunk.
re_split_nodes[1]

# Chat over the paper: let's find out what it is about!


In [None]:
from llama_index.core import VectorStoreIndex, SummaryIndex, KnowledgeGraphIndex

# Index the size-bounded chunks from the SentenceSplitter pass instead of the raw
# parser nodes; otherwise the re-split step above has no effect on retrieval.
# The objects separated out by get_nodes_and_objects are passed in as well, so the
# explicitly extracted elements are searchable and included in summaries.
vector_index = VectorStoreIndex(nodes=re_split_nodes, objects=objects)
summary_index = SummaryIndex(nodes=re_split_nodes, objects=objects)

# The knowledge-graph index is built over the same chunks (text nodes only).
knowledgegraph_index = KnowledgeGraphIndex(nodes=re_split_nodes)

In [26]:
from llama_index.core import PropertyGraphIndex

# Build the property graph from the size-bounded chunks produced by the
# SentenceSplitter pass rather than from the raw, unsplit parser nodes.
propertygraph_index = PropertyGraphIndex(nodes=re_split_nodes)

In [None]:
from llama_index.agent.openai import OpenAIAgent
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Retrieve a wider candidate pool than the reranker keeps. With
# similarity_top_k == top_n the reranker could only reorder the hits and never
# drop a weak one, so it added cost without filtering anything.
RETRIEVAL_TOP_K = 15
RERANK_TOP_N = 5

# Targeted lookups: vector retrieval followed by reranking.
search_engine = vector_index.as_query_engine(
    similarity_top_k=RETRIEVAL_TOP_K,
    node_postprocessors=[FlagEmbeddingReranker(top_n=RERANK_TOP_N)],
)

# Whole-document questions: query engine over the summary index.
summary_engine = summary_index.as_query_engine()

# The tool names and descriptions are what the agent sees when choosing a tool.
tools = [
    QueryEngineTool(
        query_engine=search_engine,
        metadata=ToolMetadata(
            name="search",
            description="Search the document, pass the entire user message in the query",
        ),
    ),
    QueryEngineTool(
        query_engine=summary_engine,
        metadata=ToolMetadata(
            name="summarize",
            description="Summarize the document using the user message",
        ),
    ),
]

# verbose=True makes the agent print its tool calls while answering.
agent = OpenAIAgent.from_tools(tools=tools, verbose=True)

In [None]:
# Note: if the agent routes this to the `summarize` tool it can take a while
# (especially with local LLMs) -- presumably the summary index sends every node
# in the document to the LLM.
resp = agent.chat("Tell me over all summary of the document")

In [None]:
# NOTE(review): the original note on this cell was copied from the summary cell.
# A targeted question like this one presumably goes through the `search` tool
# instead of sending every node to the LLM -- confirm in the verbose agent output.
resp = agent.chat("Tell me about 4-shot result numbers  across all models of MM1 ablation across different image encoders")

In [None]:
# Display the agent's response object for the previous question.
resp

In [None]:
# Follow-up question in the same agent conversation; `resp` is overwritten.
resp = agent.chat("How do the authors evaluate their work?")


In [None]:
# Print the response as plain text (print() applies str() itself).
print(resp)
