In [2]:
# Load variables from a .env file into the process environment; this brings in
# our LLAMA_CLOUD_API_KEY.
# NOTE(review): the NEO4J_* settings read via os.environ further down are
# presumably supplied the same way — confirm against your .env.
from dotenv import load_dotenv
load_dotenv()
import os

In [3]:
#!pip install llama-index-graph-stores-neo4j

In [4]:
import nest_asyncio
# Patch asyncio so event loops can be nested. Presumably needed because the
# notebook kernel already runs a loop and later cells trigger async work
# (parsing, multi-worker extraction) — confirm.
nest_asyncio.apply()

# Setup Model
Here we use `gpt-4o-mini` as the LLM and OpenAI's `text-embedding-3-small` model for embeddings.

In [12]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Chat model; also passed explicitly to the agent further down.
llm = OpenAI(model="gpt-4o-mini")
# Embedding model; also passed explicitly to the baseline vector index.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Register both on the global Settings object so components that are not given
# an explicit llm / embed_model can fall back to these.
Settings.llm = llm
Settings.embed_model = embed_model

In [13]:
from llama_parse import LlamaParse
# Parse the PDF into plain-text documents with LlamaParse
# (uses the LLAMA_CLOUD_API_KEY loaded at the top of the notebook).
docs = LlamaParse(result_type="text").load_data("./paper.pdf")

Started parsing the file under job_id 640116c7-3331-4712-b8c0-2e6afd504c04
.

In [14]:
len(docs)

41

In [15]:
from copy import deepcopy
from llama_index.core.schema import TextNode, Document
from llama_index.core import VectorStoreIndex

def get_sub_docs(docs, separator="\n---\n"):
    """Split each document into per-page sub-documents.

    Args:
        docs: Iterable of documents exposing ``text`` and ``metadata``.
        separator: Marker string that delimits pages inside ``doc.text``.
            Defaults to ``"\\n---\\n"``, the value this notebook previously
            hard-coded, so existing calls behave exactly as before.

    Returns:
        A flat list of ``Document`` objects, one per chunk, in input order.
        Each chunk gets its own deep copy of the parent's metadata, so
        mutating one sub-document's metadata cannot affect its siblings.
    """
    return [
        Document(text=chunk, metadata=deepcopy(doc.metadata))
        for doc in docs
        for chunk in doc.text.split(separator)
    ]

In [16]:
# this will split into pages
sub_docs = get_sub_docs(docs)

In [17]:
len(sub_docs)

41

# Initialize Graph Store
Here we use Neo4j but you can also use our other integrations like Nebula (see an example notebook).

To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command

```bash
docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4JLABS_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
```
From here, you can open the db at http://localhost:7474/. On this page, you will be asked to sign in. Use the default username/password of neo4j and neo4j.

Once you login for the first time, you will be asked to change the password.

After this, you are ready to create your first property graph!

In [5]:
# Neo4j connection settings are read from the environment; each one is None
# when the corresponding variable is not set.
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(name) for name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

In [6]:
from llama_index.graph_stores.neo4j import Neo4jPGStore
# Connect to the Neo4j property graph store with the credentials read from the
# environment in the previous cell.
graph_store = Neo4jPGStore(
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    url=NEO4J_URI,
)

In [25]:
# Sanity check: show which Neo4j instance this session points at.
# Only the URI is echoed. The username and password are deliberately not
# evaluated here so they can never end up in saved notebook output.
NEO4J_URI

'neo4j+s://d7ec37c2.databases.neo4j.io'

In [26]:
# NOTE(review): vec_store is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
vec_store = None

In [7]:
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
)
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [28]:
# Build the property graph from the page-level documents and store it in Neo4j.
# This runs LLM path extraction over every chunk, so it is the slow step of the
# notebook; if the graph has already been written to Neo4j, use the
# `from_existing` cell below instead of re-running this one.
index = PropertyGraphIndex.from_documents(
    sub_docs,
    # `model=` is the same keyword the setup cell uses for OpenAIEmbedding.
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    kg_extractors=[
        # Extracts relations already implied by node relationships (no LLM call).
        ImplicitPathExtractor(),
        # Asks the LLM for up to 10 (subject, relation, object) paths per chunk.
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-4o-mini", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    property_graph_store=graph_store,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 41/41 [00:00<00:00, 189.82it/s]
Extracting implicit paths: 100%|██████████| 56/56 [00:00<00:00, 56151.33it/s]
Extracting paths from text: 100%|██████████| 56/56 [00:41<00:00,  1.34it/s]
Generating embeddings: 100%|██████████| 1/1 [00:04<00:00,  4.38s/it]
Generating embeddings: 100%|██████████| 11/11 [00:04<00:00,  2.29it/s]


# Load an existing index
Run the next cell instead of the build cell above if the graph has already been built and stored in Neo4j.

In [8]:
# Re-attach to a graph that is already stored in Neo4j instead of rebuilding it.
index = PropertyGraphIndex.from_existing(
    graph_store,
    # `model=` is the same keyword the setup cell uses for OpenAIEmbedding.
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    # Same extractor configuration as the build cell.
    kg_extractors=[
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-4o-mini", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    show_progress=True,
)

# Define Vector Retriever
Here we define our vector context retriever - it returns initial nodes via vector search, and traverses the relations to pull in more nodes/context.

In [9]:
from llama_index.core.indices.property_graph import VectorContextRetriever

# Vector-seeded graph retriever: find the closest nodes by embedding similarity,
# then follow their relations to pull in additional context.
kg_retriever = VectorContextRetriever(
    index.property_graph_store,
    # `model=` is the same keyword the setup cell uses for OpenAIEmbedding.
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    similarity_top_k=2,  # number of seed nodes taken from vector search
    path_depth=1,  # how many relation hops to traverse from each seed
    include_text=True,  # return the source chunk text along with the triples
)

In [10]:
# Run a sample question through the graph retriever and print every hit.
nodes = kg_retriever.retrieve("explain me Encoder and Decoder Stacks")
print(len(nodes))
for idx, node in enumerate(nodes):
    content = node.get_content()
    print(f">> IDX: {idx}, {content}")

2
>> IDX: 0, Here are some facts extracted from the provided text:

Llama -> Is -> Open and efficient foundation language models

: Generative multimodal models are in-context learners. arXiv
       preprint arXiv:2312.13286 (2023)
106. Team, G., Anil, R., Borgeaud, S., Wu, Y., Alayrac, J.B., Yu, J., Soricut, R.,
       Schalkwyk, J., Dai, A.M., Hauth, A., et al.: Gemini: a family of highly capable
       multimodal models. arXiv preprint arXiv:2312.11805 (2023)
107. Thoppilan, R., De Freitas, D., Hall, J., Shazeer, N., Kulshreshtha, A., Cheng,
       H.T., Jin, A., Bos, T., Baker, L., Du, Y., et al.: Lamda: Language models for
       dialog applications. arXiv preprint arXiv:2201.08239 (2022)
108. Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? explor-
       ing the visual shortcomings of multimodal llms. arXiv preprint arXiv:2401.06209
       (2024)
109. Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T.,
       Rozière, B., Goyal

# Build Baseline Vector Index
We also build a "baseline" vector index. This follows the "naive" RAG pipeline approach of chunking and vector embedding. We use this as a comparison point.

In [19]:
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine

# Baseline: embed the same page-level documents into a plain vector index.
base_index = VectorStoreIndex.from_documents(sub_docs, embed_model=embed_model)
# Return the two most similar chunks per query (same top-k as the KG retriever).
base_retriever = base_index.as_retriever(similarity_top_k=2)
# Query engine that synthesizes an answer from the retrieved chunks.
base_query_engine = RetrieverQueryEngine(base_retriever)

In [20]:
# Baseline check: ask the plain vector pipeline for a summary of the document.
response = base_query_engine.query("Tell me over all summary of the document")
print(response)

The document discusses the AFM-Multimodal v3 model, detailing its capabilities in processing and extracting information from multimodal data, specifically images and text. It highlights the distinction between evaporation and evapotranspiration, using a flowchart to illustrate the processes involved in project suggestion and decision-making within a team context. The flowchart is color-coded to differentiate between actions and decisions, providing a clear visual representation of the sequence of steps. Additionally, the document outlines the dataset construction process, which includes interleaved image-text documents and text-only data, emphasizing the filtering and de-duplication methods used to ensure high-quality inputs. Overall, it showcases the model's ability to handle complex multimodal tasks and the structured approach taken in data preparation.


In [21]:
# Baseline check: a table-lookup style question (the agent is asked the same
# question later in the notebook).
response = base_query_engine.query("Tell me about 4-shot result numbers  across all models of MM1 ablation across different image encoders")
print(response)

The 4-shot result numbers across all models of the MM1 ablation with different image encoders are as follows:

- AIM600M: 56.6
- AIM3B: 60.9
- CLIPDFN+VeCap (ViT-L): 62.6
- CLIPDFN (ViT-H): 62.5
- CLIPDFN+VeCap (ViT-H): 60.0
- CLIPOpenAI (ViT-L): 62.2
- CLIPDFN (ViT-H): 62.5

These values indicate the performance of each model under the 4-shot scenario.


In [36]:
# NOTE(review): "test only" is probably a typo for "text only" — the agent's
# answer to the same question further down refers to the "text-only" data type.
# The query string is left unchanged so the recorded output still matches; if
# you fix it, fix the agent's copy of the question too so the comparison stays fair.
response = base_query_engine.query(
    "what is the token size for test only data type  in pre training data ablation?"
)
print(str(response))

The token size for the test-only data type in pre-training data ablation is not explicitly mentioned in the provided information.


In [None]:
# NOTE(review): this cell has no execution count or recorded output, and the
# question looks left over from a different document — confirm it is relevant
# to the parsed paper before relying on it.
response = base_query_engine.query(
    "what are the Variations on the Transformer architecture? tell me in a tabulart fashion"
)
print(str(response))

# Build Custom Retriever
Build a joint retriever that combines vector search and knowledge-graph (KG) search.

In [22]:
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore
from typing import List


class CustomRetriever(BaseRetriever):
    """Custom retriever that performs both KG vector search and direct vector search.

    Both wrapped retrievers run on every query and their results are
    de-duplicated by ``node_id``.
    """

    def __init__(self, kg_retriever, vector_retriever):
        """Store the two retrievers to combine.

        Args:
            kg_retriever: Retriever backed by the property graph.
            vector_retriever: Retriever backed by the baseline vector index.
        """
        self._kg_retriever = kg_retriever
        self._vector_retriever = vector_retriever
        # Run BaseRetriever's own initializer so the inherited state that its
        # public methods expect (e.g. the callback manager) is actually set up
        # instead of being left unassigned on this instance.
        super().__init__()

    def _retrieve(self, query_bundle) -> List[NodeWithScore]:
        """Retrieve nodes given query.

        KG results come first, followed by vector results. When both
        retrievers return the same ``node_id``, the vector retriever's entry
        replaces the KG one but keeps the KG entry's position in the list.
        """
        kg_nodes = self._kg_retriever.retrieve(query_bundle)
        vector_nodes = self._vector_retriever.retrieve(query_bundle)

        # dicts preserve insertion order, so this de-duplicates while keeping
        # first-seen ordering.
        unique_nodes = {n.node_id: n for n in kg_nodes}
        unique_nodes.update({n.node_id: n for n in vector_nodes})
        return list(unique_nodes.values())

In [23]:
custom_retriever = CustomRetriever(kg_retriever, base_retriever)


In [24]:
# NOTE(review): this question looks left over from a different example — the
# query tool defined below describes the document as being about multimodal
# LLM pre-training, so swap in a relevant question before drawing conclusions
# from the retrieved nodes.
nodes = custom_retriever.retrieve(
    "Give me all the programs that the mayor's budget includes"
)
# len(nodes)

# Build Agent
Now that we have the retriever, we can treat it as a RAG pipeline tool, and wrap it with an agent that can perform basic CoT reasoning and maintain conversation memory over time.

In [26]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RetrieverQueryEngine

# Query engine over the joint (KG + vector) retriever.
kg_query_engine = RetrieverQueryEngine(custom_retriever)
# Expose the engine as a tool; the name and description are the metadata the
# agent is given about this tool.
kg_query_tool = QueryEngineTool(
    query_engine=kg_query_engine,
    metadata=ToolMetadata(
        name="query_tool",
        description="Provides information about Methods, Analysis & Insights from Multimodal LLM Pre-training.",
    ),
)

In [27]:
from llama_index.core.agent import FunctionCallingAgentWorker

# Wrap the query tool in a function-calling agent driven by the same chat model
# configured in the setup cell.
agent_worker = FunctionCallingAgentWorker.from_tools(
    [kg_query_tool],
    llm=llm,
    # Print each tool call and its output while the agent runs.
    verbose=True,
    allow_parallel_tool_calls=False,
)
agent = agent_worker.as_agent()

In [29]:
response = agent.chat("Tell me about 4-shot result numbers  across all models of MM1 ablation across different image encoders")


Added user message to memory: Tell me about 4-shot result numbers  across all models of MM1 ablation across different image encoders
=== Calling Function ===
Calling function: query_tool with args: {"input": "4-shot result numbers across all models of MM1 ablation across different image encoders"}
=== Function Output ===
The 4-shot result numbers across all models of MM1 ablation across different image encoders are as follows:

- AIM600M: 56.6
- AIM1B: 59.5
- AIM3B: 60.9
- CLIPDFN+VeCap (ViT-L): 62.6
- CLIPDFN (ViT-H): 62.5
- CLIPOpenAI (ViT-L): 62.2
- CLIPDFN (ViT-H): 62.5
- CLIPDFN+VeCap (ViT-H): 63.6

These values represent the performance of each model in the 4-shot evaluation setting.
=== LLM Response ===
The 4-shot result numbers across all models of MM1 ablation across different image encoders are as follows:

- **AIM600M**: 56.6
- **AIM1B**: 59.5
- **AIM3B**: 60.9
- **CLIPDFN+VeCap (ViT-L)**: 62.6
- **CLIPDFN (ViT-H)**: 62.5
- **CLIPOpenAI (ViT-L)**: 62.2
- **CLIPDFN (ViT-H)**:

In [30]:
response = agent.chat("what is the token size for test only data type  in pre training data ablation?")


Added user message to memory: what is the token size for test only data type  in pre training data ablation?
=== Calling Function ===
Calling function: query_tool with args: {"input": "token size for test only data type in pre training data ablation"}
=== Function Output ===
The token size for the text-only data type in the pre-training data ablation is 2 trillion tokens.
=== LLM Response ===
The token size for the text-only data type in the pre-training data ablation is 2 trillion tokens.
