In [2]:
# Load variables from a local .env file into the process environment.
# This brings in LLAMA_CLOUD_API_KEY; presumably the NEO4J_* settings that
# are read from os.environ further down are supplied the same way.
import os

from dotenv import load_dotenv

load_dotenv()


In [3]:
# One-time setup (uncomment to run): install the Neo4j graph-store integration
# into the environment this kernel is using.
# %pip install llama-index-graph-stores-neo4j

In [4]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered. Presumably needed because
# the notebook kernel already runs a loop and later cells drive async code
# (parsing, path extraction with workers) -- confirm before removing.
nest_asyncio.apply()

# Setup Model
Here we use `gpt-4o-mini` as the LLM and OpenAI's `text-embedding-3-small` model for embeddings.

In [5]:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Create the LLM and the embedding model once, then register both on the
# global LlamaIndex `Settings` object.
llm = OpenAI(model="gpt-4o-mini")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm, Settings.embed_model = llm, embed_model

In [6]:
from llama_parse import LlamaParse

# Parse the paper with LlamaParse, requesting plain-text results.
pdf_parser = LlamaParse(result_type="text")
docs = pdf_parser.load_data("./attention.pdf")

Started parsing the file under job_id 06436396-ce34-425a-a749-0b8c4dd790a8


In [7]:
# Number of documents LlamaParse returned for the PDF.
len(docs)

15

In [8]:
from copy import deepcopy
from llama_index.core.schema import TextNode, Document
from llama_index.core import VectorStoreIndex

def get_sub_docs(docs, separator="\n---\n"):
    """Split each document into per-page documents on a separator line.

    Args:
        docs: Iterable of objects exposing ``text`` (str) and ``metadata``,
            such as the documents produced by the parsing cell.
        separator: Non-empty substring that marks a page boundary inside
            ``text``. Defaults to ``"\\n---\\n"``.

    Returns:
        A flat list of ``Document`` objects, one per non-blank chunk, in the
        original order. Every chunk gets its own deep copy of the parent's
        metadata, so changing one chunk's metadata does not affect another.
    """
    sub_docs = []
    for doc in docs:
        for chunk in doc.text.split(separator):
            # A leading, trailing, or doubled separator yields an empty
            # chunk; skip it instead of emitting a Document with no content.
            if not chunk.strip():
                continue
            sub_docs.append(
                Document(text=chunk, metadata=deepcopy(doc.metadata))
            )
    return sub_docs

In [9]:
# Split every parsed document on the page separator. In the recorded run the
# count is 15 both before and after this step, so no document actually
# contained the separator and the split left the documents unchanged.
sub_docs = get_sub_docs(docs)

In [10]:
# Number of documents after the page split.
len(sub_docs)

15

# Initialize Graph Store
Here we use Neo4j but you can also use our other integrations like Nebula (see an example notebook).

To launch Neo4j locally, first make sure Docker is installed. Then launch the database with the following Docker command:

docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4JLABS_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
From here, you can open the database at http://localhost:7474/. You will be asked to sign in; use the default username and password, which are both `neo4j`.

Once you log in for the first time, you will be asked to change the password.

After this, you are ready to create your first property graph!

In [13]:
# Neo4j connection settings, read from the environment. Each value is None
# when the corresponding variable is not set.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

In [19]:
# Sanity check: display the URI that was read from the environment.
NEO4J_URI

'neo4j+s://cd0238b0.databases.neo4j.io'

In [20]:
# Sanity check: display the username that was read from the environment.
NEO4J_USERNAME

'neo4j'

In [21]:
# Only confirm that a password was loaded. Displaying the value itself stores
# the secret in the notebook's saved output; the password that was shown here
# previously should be treated as exposed and rotated.
bool(NEO4J_PASSWORD)

True

In [22]:
from llama_index.graph_stores.neo4j import Neo4jPGStore

# Open the Neo4j property-graph store with the connection settings that were
# read from the environment.
graph_store = Neo4jPGStore(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

In [23]:
# No separate vector store is configured. This name is not referenced by any
# other cell visible in this part of the notebook.
vec_store = None

In [24]:
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
)
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [25]:
# Build the property graph from the page documents and store it in Neo4j.
# Paths are produced by two extractors: ImplicitPathExtractor and an
# LLM-backed SimpleLLMPathExtractor (4 workers, at most 10 paths per chunk).
# The shared `embed_model` from the model-setup cell is reused, as the
# baseline vector index does, so every component embeds with one
# configuration instead of separately constructed instances.
index = PropertyGraphIndex.from_documents(
    sub_docs,
    embed_model=embed_model,
    kg_extractors=[
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-4o-mini", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    property_graph_store=graph_store,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 15/15 [00:00<00:00, 326.07it/s]
Extracting implicit paths: 100%|██████████| 16/16 [00:00<00:00, 16031.74it/s]
Extracting paths from text: 100%|██████████| 16/16 [00:12<00:00,  1.30it/s]
Generating embeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
Generating embeddings: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]


# Run this cell instead of the previous one if the graph has already been built in Neo4j

In [33]:
# Attach to the graph that is already stored in `graph_store` rather than
# building it from the documents. The extractor configuration matches the one
# used when building the index. The shared `embed_model` from the model-setup
# cell is reused so queries are embedded with the same configuration as the
# rest of the notebook.
index = PropertyGraphIndex.from_existing(
    graph_store,
    embed_model=embed_model,
    kg_extractors=[
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-4o-mini", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    show_progress=True,
)

# Define Vector Retriever
Here we define our vector context retriever - it returns initial nodes via vector search, and traverses the relations to pull in more nodes/context.

In [26]:
from llama_index.core.indices.property_graph import VectorContextRetriever

# Retriever over the property graph: a vector search picks the starting nodes
# (top 2), then relations are followed to depth 1 to pull in more context.
# include_text=True returns the source text together with the extracted
# facts, as seen in the recorded output of the retrieval cell.
# The shared `embed_model` is reused so the query is embedded with the same
# configuration as the other components in this notebook.
kg_retriever = VectorContextRetriever(
    index.property_graph_store,
    embed_model=embed_model,
    similarity_top_k=2,
    path_depth=1,
    include_text=True,
)

In [27]:
# Retrieve graph context for a sample question, then print how many results
# came back followed by the content of each one.
nodes = kg_retriever.retrieve("explain me Encoder and Decoder Stacks")
print(len(nodes))
for idx, node in enumerate(nodes):
    print(f">> IDX: {idx}, {node.get_content()}")

2
>> IDX: 0, Here are some facts extracted from the provided text:

Decoder -> Composed of -> Stack of n = 6 identical layers
Encoder -> Composed of -> Stack of n = 6 identical layers

output values. These are concatenated and once again projected, resulting in the final values, as
 depicted in Figure 2.
 Multi-head attention allows the model to jointly attend to information from different representation
 subspaces at different positions. With a single attention head, averaging inhibits this.

                                 MultiHead(Q, K, V ) = Concat(head1, ..., headh)W O
                                                where headi = Attention(QW i Q, KW i K, V W i V)

Where the projections are parameter matrices W i ∈ Rdmodel×dk , W i ∈ Rdmodel×dk , W i ∈ Rdmodel×dvQ        K  V
 and W O ∈ Rhdv ×dmodel .
 In this work we employ h = 8 parallel attention layers, or heads. For each of these we use
 dk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computati

# Build Baseline Vector Index
We also build a "baseline" vector index. This follows the "naive" RAG pipeline approach of chunking and vector embedding. We use this as a comparison point.

In [28]:
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine

# Baseline pipeline for comparison: embed the same page documents into a
# plain vector index, retrieve the top 2 matches, and answer from those.
base_index = VectorStoreIndex.from_documents(
    sub_docs,
    embed_model=embed_model,
)
base_retriever = base_index.as_retriever(similarity_top_k=2)
base_query_engine = RetrieverQueryEngine(base_retriever)

In [30]:
# Ask the baseline engine for the multi-head attention formula.
response = base_query_engine.query("give me formulae for MultiHead")
print(response)

The formula for MultiHead attention is given by:

\[ 
\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \ldots, \text{head}_h) W_O 
\]

where 

\[ 
\text{head}_i = \text{Attention}(Q W_i^Q, K W_i^K, V W_i^V) 
\]

In this, \( W_i^Q \), \( W_i^K \), and \( W_i^V \) are parameter matrices, and \( W_O \) is another parameter matrix used for the final projection.


In [31]:
# Ask the baseline engine for a single value from the paper's comparison of
# layer types.
response = base_query_engine.query("what is Maximum Path Length for Self-Attention")
print(response)

The maximum path length for self-attention is O(1).


In [33]:
# Ask the baseline engine to reproduce the translation-quality table.
# The metric is spelled "BLEU" (as in the paper's own table headers); the
# earlier "BLUE" spelling was a typo in the text sent to the retriever.
response = base_query_engine.query(
    "Tell me BLEU scores for all model types in tabular format"
)
print(response)

| Model                                          | EN-DE | EN-FR |
|------------------------------------------------|-------|-------|
| ByteNet                                       | 23.75 |       |
| Deep-Att + PosUnk                             |       | 39.2  |
| GNMT + RL                                     | 24.6  | 39.92 |
| ConvS2S                                       | 25.16 | 40.46 |
| MoE                                           | 26.03 | 40.56 |
| Deep-Att + PosUnk Ensemble                     |       | 40.4  |
| GNMT + RL Ensemble                             | 26.30 | 41.16 |
| ConvS2S Ensemble                               | 26.36 | 41.29 |
| Transformer (base model)                      | 27.3  | 38.1  |
| Transformer (big)                             | 28.4  | 41.8  |


In [34]:
# Ask the baseline engine to reproduce the architecture-variations table.
# The query text previously contained the typo "tabulart".
response = base_query_engine.query(
    "what are the Variations on the Transformer architecture? tell me in a tabular fashion"
)
print(response)

| Variation | N | dmodel | dff  | h  | dk | dv | Pdrop | ϵls | train steps | PPL (dev) | BLEU (dev) | params × 10^6 |
|-----------|---|--------|------|----|----|----|-------|-----|-------------|-----------|------------|----------------|
| base      | 6 | 512    | 2048 | 8  | 64 | 64 | 0.1   | 0.1 | 100K        | 4.92      | 25.8       | 65             |
| (A)       | 1 | 512    | 512  |    |    |    |       |     |             | 5.29      | 24.9       |                |
|           | 4 | 128    | 128  |    |    |    |       |     |             | 5.00      | 25.5       |                |
|           | 16| 32     | 32   |    |    |    |       |     |             | 4.91      | 25.8       |                |
|           | 32| 16     | 16   |    |    |    |       |     |             | 5.01      | 25.4       |                |
| (B)       |   |        |      |    |    |    |       |     |             | 5.16      | 25.1       | 58             |
|           |   |        |      |    |    |    | 