<a href="https://colab.research.google.com/github/slaterlucas/flavor_base/blob/main/llama_index_neo4j_custom_retriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install --quiet llama-index llama-index-graph-stores-neo4j llama-index-program-openai llama-index-llms-openai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.3/390.3 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.8/195.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [5]:
from google.colab import files
from dotenv import load_dotenv
import os

# Upload the credentials file through the Colab file picker. It must be named
# "env", because that is the path loaded below.
uploaded = files.upload()

# Load the uploaded file into the process environment.
load_dotenv("env")

# Read the Neo4j connection details and the OpenAI key from the environment.
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")
url = os.getenv("NEO4J_URL")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail early with a readable message: assigning None into os.environ would
# otherwise raise an opaque TypeError, and a missing Neo4j setting would only
# surface later when connecting.
_required_settings = {
    "NEO4J_USERNAME": username,
    "NEO4J_PASSWORD": password,
    "NEO4J_URL": url,
    "OPENAI_API_KEY": openai_api_key,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required settings in the uploaded env file: "
        + ", ".join(_missing_settings)
    )

# Make the OpenAI key visible to libraries that read it from the environment.
os.environ["OPENAI_API_KEY"] = openai_api_key

Saving env to env


In [6]:
import nest_asyncio

# NOTE(review): presumably applied so that async LlamaIndex calls can run inside
# the notebook's already-running event loop — confirm this is still required.
nest_asyncio.apply()

In [9]:
from llama_index.graph_stores.neo4j import Neo4jPGStore

# Connect to Neo4j with the credentials read from the environment in the
# earlier cell (`username`, `password`, `url`). The previous self-assignments
# (`username=username`, ...) were no-ops and have been dropped.
graph_store = Neo4jPGStore(
    username=username,
    password=password,
    url=url,
)

In [10]:
import os

# Export the OpenAI key to the process environment (repeats the assignment
# already made in the env-loading cell).
os.environ.update(OPENAI_API_KEY=openai_api_key)

In [11]:
import pandas as pd
from llama_index.core import Document

# Ingredient-trio dataset: three ingredient columns plus a compatibility label.
food = pd.read_csv(
    "https://raw.githubusercontent.com/slaterlucas/public-datasets/refs/heads/main/TasteTrios%20-%20Sheet1.csv"
)

# Render every CSV row as one sentence pair, then wrap each text in a Document.
trio_texts = [
    f"Ingredients: {row['Ingredient 1']}, {row['Ingredient 2']}, {row['Ingredient 3']}. "
    f"Compatibility: {row['Classification Output']}."
    for _, row in food.iterrows()
]
documents = [Document(text=trio_text) for trio_text in trio_texts]

# Preview the first rows to confirm the load worked.
food.head()

Unnamed: 0,Ingredient 1,Ingredient 2,Ingredient 3,Classification Output
0,Pumpkin,Allspice,Bay Leaf,Highly Compatible
1,Pumpkin,Cinnamon,Ginger,Highly Compatible
2,Pumpkin,Pasta,Butter,Moderately Compatible
3,Pumpkin,Apples,Curry,Moderately Compatible
4,Pumpkin,Brown Sugar,Pine Nuts,Highly Compatible


#### Define Default LLMs

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Chat model used for graph extraction and for answering queries.
llm = OpenAI(
    model="gpt-4o",
    temperature=0.3,
)

# Embedding model used when building and searching the index.
embed_model = OpenAIEmbedding(
    model_name="text-embedding-3-small",
)

In [None]:
from typing import Literal
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor

# Entity labels offered to the extractor (upper-case by convention).
entities = Literal[
    "INGREDIENT",
    "COMPATIBILITY",
]

# Relationship types offered to the extractor.
relations = Literal["PAIRS_WELL_WITH", "HAS_COMPATIBILITY"]

In [None]:
# Define which relations each entity type may take part in.
# The keys must be spelled exactly like the entity labels ("INGREDIENT",
# "COMPATIBILITY"); the previous title-case keys never matched an extracted type,
# so the schema was never applied.
# NOTE(review): the dict form is assumed to list the relations an entity may
# participate in as subject OR object, so COMPATIBILITY must list
# HAS_COMPATIBILITY or Ingredient -> Compatibility triplets would be pruned —
# confirm against the installed SchemaLLMPathExtractor version.
validation_schema = {
    "INGREDIENT": ["PAIRS_WELL_WITH", "HAS_COMPATIBILITY"],
    "COMPATIBILITY": ["HAS_COMPATIBILITY"],  # label-like node; only the target of HAS_COMPATIBILITY
}

In [None]:
from llama_index.core import PropertyGraphIndex

# How many of the generated documents are sent through extraction.
NUMBER_OF_ARTICLES = 100

# Schema-guided triplet extractor. strict=False lets the LLM return values
# outside the schema, i.e. the schema acts as a suggestion.
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    strict=False,
)

# Extract paths from the first NUMBER_OF_ARTICLES documents, embed them, and
# store the result in the Neo4j property graph store.
documents_subset = documents[:NUMBER_OF_ARTICLES]
index = PropertyGraphIndex.from_documents(
    documents_subset,
    kg_extractors=[kg_extractor],
    llm=llm,
    embed_model=embed_model,
    property_graph_store=graph_store,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/100 [00:00<?, ?it/s]

Extracting paths from text with schema: 100%|██████████| 100/100 [01:14<00:00,  1.34it/s]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
Generating embeddings: 100%|██████████| 9/9 [00:01<00:00,  5.84it/s]


In [None]:
# Cypher that creates (if absent) a cosine vector index named `entity` over the
# `embedding` property of `__Entity__` nodes.
# NOTE(review): 1536 must equal the embedding size stored on the nodes;
# presumably the output size of the embedding model configured above — confirm.
create_entity_index_cypher = """
CREATE VECTOR INDEX entity IF NOT EXISTS
FOR (m:`__Entity__`)
ON m.embedding
OPTIONS {indexConfig: {
 `vector.dimensions`: 1536,
 `vector.similarity_function`: 'cosine'
}}
"""

graph_store.structured_query(create_entity_index_cypher)

[]

The entity-deduplication Cypher query below comes from Tomaz, Michael Hunger, and Eric Monk, who took a couple of hours to perfect it.

In [None]:
# Just for inspection: list groups of entity names that look like duplicates.
# Nothing is modified here; the query only returns name lists.
#
# What the Cypher does, step by step:
#   1. For every `__Entity__` node, fetch its 10 nearest neighbours from the
#      `entity` vector index.
#   2. Keep neighbours whose score exceeds $cutoff, whose labels equal the
#      node's labels, and whose lower-cased name either contains / is contained
#      in the node's name or is within $distance per apoc.text.distance.
#   3. Keep only groups with more than one node, then union groups that share
#      at least one name.
#   4. Drop any combined group that is fully contained in another one.
similarity_threshold = 0.9  # passed to the query as $cutoff
word_edit_distance = 5  # passed to the query as $distance
data = graph_store.structured_query("""
MATCH (e:__Entity__)
CALL {
  WITH e
  CALL db.index.vector.queryNodes('entity', 10, e.embedding)
  YIELD node, score
  WITH node, score
  WHERE score > toFLoat($cutoff)
      AND (toLower(node.name) CONTAINS toLower(e.name) OR toLower(e.name) CONTAINS toLower(node.name)
           OR apoc.text.distance(toLower(node.name), toLower(e.name)) < $distance)
      AND labels(e) = labels(node)
  WITH node, score
  ORDER BY node.name
  RETURN collect(node) AS nodes
}
WITH distinct nodes
WHERE size(nodes) > 1
WITH collect([n in nodes | n.name]) AS results
UNWIND range(0, size(results)-1, 1) as index
WITH results, index, results[index] as result
WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) |
        CASE WHEN index <> index2 AND
            size(apoc.coll.intersection(acc, results[index2])) > 0
            THEN apoc.coll.union(acc, results[index2])
            ELSE acc
        END
)) as combinedResult
WITH distinct(combinedResult) as combinedResult
// extra filtering
WITH collect(combinedResult) as allCombinedResults
UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex
WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults
WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1)
    WHERE x <> combinedResultIndex
    AND apoc.coll.containsAll(allCombinedResults[x], combinedResult)
)
RETURN combinedResult
""", param_map={'cutoff': similarity_threshold, 'distance': word_edit_distance})
# Each row is a dict with a single `combinedResult` list of candidate-duplicate names.
for row in data:
    print(row)

{'combinedResult': ['Pumpkin', 'Pumpkin Seeds']}
{'combinedResult': ['Compatible', 'Highly Compatible', 'Moderately Compatible']}
{'combinedResult': ['Garlic', 'Garlic Butter', 'Roasted Garlic']}
{'combinedResult': ['Cream', 'Cream Cheese']}
{'combinedResult': ['Kalamata Olives', 'Olive Oil', 'Olives']}
{'combinedResult': ['Onion', 'Red Onion']}
{'combinedResult': ['Lemon', 'Lemon Pepper', 'Lemon Zest']}
{'combinedResult': ['Blue Cheese', 'Brie Cheese']}
{'combinedResult': ['Dijon Mustard', 'Mustard']}
{'combinedResult': ['Mozzarella', 'Mozzarella Balls', 'Mozzarella Cheese', 'Mozzarella Sticks']}
{'combinedResult': ['Red Pepper', 'Red Pepper Flakes']}


In [None]:
# Merge the duplicate groups found by the inspection query above.
# WARNING: this modifies the graph — each group of matching `__Entity__` nodes
# is merged into one node via apoc.refactor.mergeNodes.
#
# The first part of the Cypher is the same grouping logic as the inspection
# cell. The trailing CALL block re-matches the nodes of each group by name,
# ordered by name length descending (the in-query comment says this is so the
# longer name remains after merging), and hands them to mergeNodes.
# NOTE(review): `.*`: 'discard' presumably keeps the first node's property
# values on conflict — confirm against the APOC documentation.
#
# Depends on `similarity_threshold` and `word_edit_distance` defined in the
# inspection cell; run that cell first.
graph_store.structured_query("""
MATCH (e:__Entity__)
CALL {
  WITH e
  CALL db.index.vector.queryNodes('entity', 10, e.embedding)
  YIELD node, score
  WITH node, score
  WHERE score > toFLoat($cutoff)
      AND (toLower(node.name) CONTAINS toLower(e.name) OR toLower(e.name) CONTAINS toLower(node.name)
           OR apoc.text.distance(toLower(node.name), toLower(e.name)) < $distance)
      AND labels(e) = labels(node)
  WITH node, score
  ORDER BY node.name
  RETURN collect(node) AS nodes
}
WITH distinct nodes
WHERE size(nodes) > 1
WITH collect([n in nodes | n.name]) AS results
UNWIND range(0, size(results)-1, 1) as index
WITH results, index, results[index] as result
WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) |
        CASE WHEN index <> index2 AND
            size(apoc.coll.intersection(acc, results[index2])) > 0
            THEN apoc.coll.union(acc, results[index2])
            ELSE acc
        END
)) as combinedResult
WITH distinct(combinedResult) as combinedResult
// extra filtering
WITH collect(combinedResult) as allCombinedResults
UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex
WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults
WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1)
    WHERE x <> combinedResultIndex
    AND apoc.coll.containsAll(allCombinedResults[x], combinedResult)
)
CALL {
  WITH combinedResult
	UNWIND combinedResult AS name
	MATCH (e:__Entity__ {name:name})
	WITH e
	ORDER BY size(e.name) DESC // prefer longer names to remain after merging
	RETURN collect(e) AS nodes
}
CALL apoc.refactor.mergeNodes(nodes, {properties: {
    `.*`: 'discard'
}})
YIELD node
RETURN count(*)
""", param_map={'cutoff': similarity_threshold, 'distance': word_edit_distance})

[{'count(*)': 11}]

In [None]:
from pydantic import BaseModel
from typing import Optional, List


# Structured-output schema for entity extraction from a user query.
# NOTE(review): the class docstring is presumably forwarded to the LLM as the
# schema description by the Pydantic program, so treat it as prompt text, not
# as ordinary documentation — confirm before rewording it.
class Entities(BaseModel):
    """List of named entities in the text such as names of ingredients and compatibility"""
    # Extracted entity names; the type allows None.
    names: Optional[List[str]]


# Prompt template for the extraction program; `{text}` is filled with the query
# string when the program is called with `text=...`.
prompt_template_entities = """
Extract all named entities such as names of ingredients
from the following text:
{text}
"""

Now we can progress to the custom retriever implementation.

In [None]:
from typing import Any, Optional

from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.retrievers import CustomPGRetriever, VectorContextRetriever
from llama_index.core.vector_stores.types import VectorStore
from llama_index.program.openai import OpenAIPydanticProgram


class MyCustomRetriever(CustomPGRetriever):
    """Property-graph retriever that searches once per entity named in the query.

    An OpenAI Pydantic program extracts entity names from the query string and
    each name is sent to a ``VectorContextRetriever`` over the graph store. When
    no entity is detected, the raw query string is searched instead. No
    reranking is performed.
    """

    def init(
        self,
        ## vector context retriever params
        embed_model: Optional[BaseEmbedding] = None,
        vector_store: Optional[VectorStore] = None,
        similarity_top_k: int = 4,
        path_depth: int = 1,
        include_text: bool = True,
        **kwargs: Any,
    ) -> None:
        """Build the helpers used by ``custom_retrieve``.

        NOTE(review): this hook is named ``init`` (not ``__init__``) on purpose;
        presumably the ``CustomPGRetriever`` constructor calls it with the extra
        keyword arguments it received — confirm against the base class.

        Args:
            embed_model: Embedding model handed to the vector retriever.
            vector_store: Optional external vector store, forwarded to the
                vector retriever (it was previously accepted but ignored).
            similarity_top_k: Number of nearest matches fetched per lookup.
            path_depth: Graph traversal depth around each matched node.
            include_text: Accepted for signature compatibility; the value
                actually used is ``self.include_text``.
            **kwargs: Ignored.
        """
        # Program that turns a free-text query into an `Entities` object.
        self.entity_extraction = OpenAIPydanticProgram.from_defaults(
            output_cls=Entities, prompt_template_str=prompt_template_entities
        )
        self.vector_retriever = VectorContextRetriever(
            self.graph_store,
            include_text=self.include_text,
            embed_model=embed_model,
            vector_store=vector_store,
            similarity_top_k=similarity_top_k,
            path_depth=path_depth,
        )

    def custom_retrieve(self, query_str: str) -> str:
        """Retrieve graph context for every entity mentioned in ``query_str``.

        Args:
            query_str: The user's natural-language query.

        Returns:
            The LLM-facing content of all retrieved nodes, joined by blank
            lines, with exact duplicate passages removed (first occurrence
            kept). The base class could also accept ``TextNode``,
            ``NodeWithScore``, or a list of those.
        """
        entities = self.entity_extraction(text=query_str).names
        if entities:
            print(f"Detected entities: {entities}")
            search_terms = entities
        else:
            # Nothing extracted (None or empty list): search with the raw query.
            search_terms = [query_str]

        result_nodes = []
        for term in search_terms:
            result_nodes.extend(self.vector_retriever.retrieve(term))

        # TODO: temporary plain-text join; replace with a richer result format.
        contents = [n.get_content(metadata_mode="llm") for n in result_nodes]
        # Several entities often hit the same graph context; drop exact repeats
        # while preserving retrieval order.
        unique_contents = list(dict.fromkeys(contents))
        return "\n\n".join(unique_contents)


In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Custom sub-retriever bound to the index's graph store and vector store.
custom_sub_retriever = MyCustomRetriever(
    index.property_graph_store,
    embed_model=embed_model,
    vector_store=index.vector_store,
    include_text=True,
)

# Wrap the index retriever (using only the custom sub-retriever) in a query
# engine that answers with the configured LLM.
graph_retriever = index.as_retriever(sub_retrievers=[custom_sub_retriever])
query_engine = RetrieverQueryEngine.from_args(graph_retriever, llm=llm)

### Try out some Queries

In [None]:
# Ask the engine about two ingredients and print its textual answer.
question = "I have tomato and pear in my fridge."
response = query_engine.query(question)
print(response)

Detected entities: ['tomato', 'pear']
Tomato and pear pair well together, and they can be combined with goat cheese for a highly compatible dish.


### Summary
This notebook borrows heavily from a blog post, `llama_index_neo4j_custom_retriever`; the accompanying YouTube video is here: https://www.youtube.com/watch?v=LDh5MdR-CPQ.