# The Start
In this notebook, I walk through building and retrieving from a neo4j knowledge graph.

I start with the same text_nodes used to build the vector index. This way the results can be combined and reranked to improve the quality of the retrieval results.


In [3]:
# This notebook is in the eval folder.  Change to the root folder.
%cd ..
%pwd  # To verify the current working directory

c:\Users\happy\Documents\Projects\askgrowbuddy


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


'c:\\Users\\happy\\Documents\\Projects\\askgrowbuddy'

In [2]:
# --->: Read in the markdown files in the Obsidian vault directory
from src.ingest_service import IngestService
from src.doc_stats import DocStats
# The Directory containing the knowledge documents used by the AI to do the analysis on the soil tests.
soil_knowledge_directory = r"G:\My Drive\Audios_To_Knowledge\knowledge\AskGrowBuddy\AskGrowBuddy\Knowledge\soil_test_knowlege\test"
# Load the documents
ingest_service = IngestService()
loaded_documents = ingest_service.load_obsidian_notes(soil_knowledge_directory)
# Show some summary stats about the documents

DocStats.print_llama_index_docs_summary_stats(loaded_documents)

resource module not available on Windows


{'Document Type': 'Document',
 'Total Documents': 3,
 'Avg Content Length': '19989.7',
 'All Docs Have Content': True,
 'All Docs Have Metadata': True,
 'Shortest Doc Length': 57,
 'Longest Doc Length': 52741}

In [3]:
text_nodes = ingest_service.chunk_text(loaded_documents)
DocStats.print_llama_index_docs_summary_stats(text_nodes)

{'Document Type': 'TextNode',
 'Total Documents': 35,
 'Avg Content Length': '1710.5',
 'All Docs Have Content': True,
 'All Docs Have Metadata': True,
 'Shortest Doc Length': 58,
 'Longest Doc Length': 7159}

In [16]:
def check_disconnected_entities(session):
    query = """
    MATCH (e:Entity)
    WHERE NOT (e)<-[:CONTAINS]-(:TextNode)
    RETURN e.name as entity
    """
    result = session.run(query)
    disconnected = [record["entity"] for record in result]
    print(f"Found {len(disconnected)} disconnected entities:")
    for entity in disconnected:
        print(f"- {entity}")

In [2]:
def retrieve_from_knowledge_graph(search_term: str, k: int = 5):
    """
    Retrieve top k most relevant nodes based on graph relationships.
    Uses relationship patterns and connection density to rank relevance.

    Args:
        search_term: Search term/concept
        k: Number of results to return
    """
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "asd@123qwe"))

    try:
        with driver.session(database="soiltestknowledge") as session:
            # Find nodes and their relationship counts, ordered by connectivity
            result = session.run("""
                MATCH (start:Entity)
                WHERE start.name CONTAINS $search_term
                WITH start
                MATCH (start)-[r]-(connected)
                WITH start, connected, count(r) as rel_count
                RETURN DISTINCT start.name as entity,
                       collect(connected.name) as connected_entities,
                       count(connected) as connection_count
                ORDER BY connection_count DESC
                LIMIT $k
            """,
            search_term=search_term.lower(),
            k=k
            )

            return [(record["entity"],
                    record["connected_entities"],
                    record["connection_count"])
                    for record in result]
    finally:
        driver.close()

In [33]:
from src.ask_question import ask_question
def get_search_terms_prompt(text: str, max_terms: int = 5) -> str:
    """Returns prompt template for extracting search terms from a query."""
    return f"""Extract ONLY the most specific technical terms from the query. Maximum {max_terms} terms.

STRICT RULES - YOU MUST FOLLOW THESE:
1. ONLY INCLUDE:
   - Words that are in the query
   - Technical measurements (pH, EC, PPM, meq)
   - Chemical names (N, P, K, calcium, magnesium)
   - Specific soil properties (CEC, bulk density)
   - Technical conditions (anaerobic, aerobic)

2. NEVER INCLUDE:
   - Words that are not in the query
   - The word 'cannabis' or 'marijuana'
   - The word 'plant' or 'plants'
   - The word 'soil' by itself
   - The word 'ideal' or 'optimal'
   - The word 'good' or 'best'
   - The word 'grow' or 'growing'
   - Any other generic descriptors

If no technical terms are found, return EMPTY string.
Return only comma-separated terms, no explanations.

Examples:
Query: What's the ideal pH range for cannabis in soil?
Terms: pH

Query: What are options for Calcium nutrients?
Terms: calcium, nutrients

Query: How to fix nitrogen deficiency in plants?
Terms: nitrogen, deficiency

Query: {text}
Terms:"""

def extract_search_terms_llm(query: str, max_terms: int = 5, model_name: str = 'mistral_soil') -> list[str]:
    """Uses LLM to extract key search terms from query."""
    prompt = get_search_terms_prompt(query, max_terms)
    response = ask_question(prompt.format(text=query), model_name=model_name)
    # Convert response to lowercase for comparison
    query_lower = query.lower()

    # Only accept terms that actually appear in the query
    terms = [term.strip() for term in response['answer'].split(',') if term.strip()]
    valid_terms = [term for term in terms if term.lower() in query_lower]

    return valid_terms


In [36]:
query = "What magnesium nutrients do you recommend?"
valid_terms = extract_search_terms_llm(query)
print("LLM:", valid_terms)

LLM: ['magnesium', 'nutrients']


In [30]:
from neo4j import GraphDatabase
from llama_index.core.schema import NodeWithScore, TextNode, MetadataMode

def retrieve_from_knowledge_graph(search_terms: list[str], k: int = 5, max_score: float = 0.95):
    """
    Retrieve top k most relevant nodes based on graph relationships.
    Returns list of NodeWithScore objects with text and metadata from Neo4j.
    """
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "asd@123qwe"))

    try:
        with driver.session(database="soiltestknowledge") as session:  # Check your database name!
            # Debug: First check if we can find the entities
            print("Searching for entities...")
            scoring_result = session.run("""
                MATCH (start:Entity)
                WHERE any(term IN $terms WHERE toLower(start.name) CONTAINS toLower(term))
                RETURN start.name as entity
            """,
            terms=search_terms
            )

            entities = [record["entity"] for record in scoring_result]
            print(f"Found entities: {entities}")

            if not entities:
                print("No matching entities found!")
                return []

            # Now get scores for found entities
            scoring_result = session.run("""
                MATCH (start:Entity)
                WHERE start.name IN $entities

                OPTIONAL MATCH (start)-[r]-(connected:Entity)
                WHERE type(r) <> 'CONTAINS' AND type(r) <> 'MENTIONED_IN'

                WITH start.name as entity,
                     count(DISTINCT connected) as conn_count
                WITH collect({entity: entity, count: conn_count}) as results,
                     max(conn_count) as max_count
                WHERE max_count > 0

                UNWIND results as result
                RETURN
                    result.entity as entity,
                    toFloat(result.count) / toFloat(max_count) * $max_score as score
                ORDER BY score DESC
                LIMIT $k
            """,
            entities=entities,
            max_score=max_score,
            k=k
            )

            scored_entities = [(record["entity"], record["score"]) for record in scoring_result]
            print(f"Scored entities: {scored_entities}")

            # Get content for scored entities
            nodes_with_scores = []
            for entity, score in scored_entities:
                print(f"\nFetching content for {entity}")
                content_result = session.run("""
                    MATCH (e:Entity {name: $entity})<-[:CONTAINS]-(text:TextNode)
                    RETURN
                        text.text as text,
                        text.metadata as metadata,
                        text.source as source
                    LIMIT 1
                """,
                entity=entity
                )

                data = content_result.single()
                if data is None:
                    print(f"No TextNode found for {entity}")
                    continue

                # Get connections in separate query
                connections_result = session.run("""
                    MATCH (e:Entity {name: $entity})-[r]-(connected:Entity)
                    WHERE type(r) <> 'CONTAINS' AND type(r) <> 'MENTIONED_IN'
                    RETURN collect(DISTINCT {
                        entity: connected.name,
                        relationship: type(r)
                    }) as connections
                """,
                entity=entity
                )

                connections = connections_result.single()["connections"]

                # Create metadata dictionary
                metadata = {}
                if data.get("metadata") and isinstance(data["metadata"], dict):
                    metadata.update(data["metadata"])

                metadata.update({
                    "source": data.get("source"),
                    "entity": entity,
                    "graph_connections": [
                        f"{conn['entity']} ({conn['relationship']})"
                        for conn in connections
                        if conn.get('entity')
                    ]
                })

                # Create NodeWithScore
                node = TextNode(
                    text=data.get("text", ""),
                    metadata=metadata,
                    id_=f"graph_entity_{entity}"
                )
                nodes_with_scores.append(NodeWithScore(node=node, score=score))

            return nodes_with_scores
    finally:
        driver.close()

In [32]:
# Test the function
results = retrieve_from_knowledge_graph(["calcium", "magnesium"], k=5)
print(f"\nTotal results returned: {len(results)}")
for node_with_score in results:
    print(f"\nScore: {node_with_score.score:.3f}")
    print(f"Entity: {node_with_score.node.metadata['entity']}")
    print("Text content:")
    print(node_with_score.node.get_content(metadata_mode=MetadataMode.NONE))
    print("Graph Connections:")
    print("\n".join(node_with_score.node.metadata["graph_connections"]))


Searching for entities...




Found entities: ['calcium', 'free_calcium_detection', 'high_levels_can_cause_calcium_and_other_elements_to_precipitate_out', 'Calcium_atoms', 'magnesium_atoms', '7_milliequivalents_of_calcium', 'cations_like_calcium_magnesium_and_potassium', 'calcium_to_magnesium_ratio', 'ideal_calcium_to_magnesium_ratio', 'magnesium_uptake', 'soil_deficiency_in_calcium', 'lack_of_calcium', 'magnesium', 'magnesium_deficiency', 'calcium_deficiencies_in_tissues', 'calcium_nutrients', '1.0_mEq_of_calcium', '1.0_mEq_of_magnesium', 'approximately_68_percent_calcium', 'calcium_carbonate_equivalents or milliequivalents or bicarbonates', 'calcium_in_plant', 'calcium_absorption', 'calcium_channels', 'calcium_movement_into_cell', 'calcium_transport_energy', 'calcium_transport_inside_root_cells', 'calcium_movement_within_xylem', 'calcium_mobility_within_plant', 'calcium_transported_to_new_leaves_and_shoots', 'calcium_transported_to_shoots', 'calcium_transported_to_developing_fruits', 'calcium_and_organic_fertiliz

Our graph has two main types of nodes:

- Entity nodes: Represent concepts with name properties
- TextNode nodes: Represent content chunks with text properties

And two main types of relationships:

- MENTIONED_IN: Links Entities to the TextNodes where they appear
- Semantic relationships (like IMPACTS, CRITICAL_FOR, etc.): Link Entities to other Entities

In [52]:
def inspect_problematic_nodes():
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "asd@123qwe"))
    try:
        with driver.session(database="test") as session:
            # Only look for Entity nodes with missing names
            result = session.run("""
                MATCH (n:Entity)
                WHERE n.name IS NULL OR n.name = ''
                WITH n, labels(n) as node_labels
                OPTIONAL MATCH (n)-[r]-()
                RETURN
                    node_labels,
                    n,
                    count(r) as relationship_count,
                    collect(DISTINCT type(r)) as relationship_types
                LIMIT 10
            """)

            print("Entity nodes with missing/null names:")
            print("-" * 50)
            for record in result:
                print(f"Labels: {record['node_labels']}")
                print(f"Properties: {dict(record['n'])}")
                print(f"Relationship count: {record['relationship_count']}")
                print(f"Relationship types: {record['relationship_types']}")
                print("-" * 30)

    finally:
        driver.close()

inspect_problematic_nodes()

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)

In [19]:
# Query Neo4j for all unique source files
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "asd@123qwe"))
database = "test"
try:
    with driver.session(database=database) as session:
        # Query for distinct source files from TextNode nodes
        result = session.run("""
            MATCH (n:TextNode)
            WHERE n.source IS NOT NULL
            RETURN DISTINCT n.source as source, count(*) as count
            ORDER BY count DESC
        """)

        print("Source files in knowledge graph:")
        print("-" * 50)
        total_nodes = 0
        for record in result:
            source = record["source"]
            count = record["count"]
            total_nodes += count
            print(f"{source}: {count} nodes")
        print("-" * 50)
        print(f"Total TextNodes: {total_nodes}")

finally:
    driver.close()

Source files in knowledge graph:
--------------------------------------------------
soil science notes.md: 33 nodes
ph.md: 1 nodes
Focusing on Calcium Nutrition.md: 1 nodes
--------------------------------------------------
Total TextNodes: 35


In [20]:
# Query Neo4j for source statistics with detailed file listings
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "asd@123qwe"))
database = "test"
try:
    with driver.session(database=database) as session:
        result = session.run("""
            MATCH (n:TextNode)
            WHERE n.source IS NOT NULL
            RETURN DISTINCT n.source as source, count(*) as count
            ORDER BY count DESC
        """)

        # Collect results
        sources = [(record["source"], record["count"]) for record in result]
        total_nodes = sum(count for _, count in sources)

        print("Knowledge Graph Source Statistics")
        print("=" * 50)

        # Distribution by size with file listings
        print("\nFile Size Distribution:")
        print("\nLarge files (>10 nodes):")
        print("-" * 30)
        for source, count in sources:
            if count > 10:
                print(f"{source:<50} {count:>3} nodes")

        print("\nMedium files (5-10 nodes):")
        print("-" * 30)
        for source, count in sources:
            if 5 <= count <= 10:
                print(f"{source:<50} {count:>3} nodes")

        print("\nSmall files (2-4 nodes):")
        print("-" * 30)
        for source, count in sources:
            if 2 <= count <= 4:
                print(f"{source:<50} {count:>3} nodes")

        print("\nTiny files (1 node):")
        print("-" * 30)
        for source, count in sources:
            if count == 1:
                print(f"{source:<50} {count:>3} nodes")

        # Overall stats
        print("\nOverall Statistics:")
        print("-" * 30)
        print(f"Total files:          {len(sources)}")
        print(f"Total nodes:          {total_nodes}")
        print(f"Average nodes/file:   {total_nodes/len(sources):.1f}")
        print(f"Median nodes/file:    {sorted([c for _, c in sources])[len(sources)//2]}")

finally:
    driver.close()

Knowledge Graph Source Statistics

File Size Distribution:

Large files (>10 nodes):
------------------------------
soil science notes.md                               33 nodes

Medium files (5-10 nodes):
------------------------------

Small files (2-4 nodes):
------------------------------

Tiny files (1 node):
------------------------------
ph.md                                                1 nodes
Focusing on Calcium Nutrition.md                     1 nodes

Overall Statistics:
------------------------------
Total files:          3
Total nodes:          35
Average nodes/file:   11.7
Median nodes/file:    1


In [16]:
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "asd@123qwe"))
database = "test"
try:
    with driver.session(database=database) as session:
        # Count knowledge relationships (excluding CONTAINS)
        result = session.run("""
            MATCH ()-[r]->()
            WHERE type(r) <> 'CONTAINS'
            WITH type(r) as rel_type, count(r) as count
            RETURN rel_type, count
            ORDER BY count DESC

        """)

        print("Knowledge Relationships")
        print("=" * 50)
        print(f"{'Relationship Type':<30} {'Count':>10}")
        print("-" * 50)

        relationships = [(record["rel_type"], record["count"]) for record in result]
        knowledge_rels = sum(count for _, count in relationships)

        for rel_type, count in relationships:
            percentage = (count / knowledge_rels) * 100
            print(f"{rel_type:<30} {count:>10} ({percentage:>5.1f}%)")

        print("-" * 50)
        print(f"Total Knowledge Relationships: {knowledge_rels}")

        # Sample some actual triplets
        print("\nSample Knowledge Triplets:")
        result = session.run("""
            MATCH (a)-[r]->(b)
            WHERE type(r) <> 'CONTAINS'
            RETURN a.name as subject, type(r) as relation, b.name as object
            LIMIT 5
        """)

        for record in result:
            print(f"({record['subject']}) --[{record['relation']}]--> ({record['object']})")

finally:
    driver.close()

Knowledge Relationships
Relationship Type                   Count
--------------------------------------------------
MENTIONED_IN                          462 ( 63.4%)
CAUSES                                 13 (  1.8%)
EQUIVALENT_TO                           8 (  1.1%)
AFFECTS                                 7 (  1.0%)
IS                                      6 (  0.8%)
MOVES_PREDOMINANTLY_VIA                 4 (  0.5%)
REQUIRES                                4 (  0.5%)
CRITICAL_FOR                            3 (  0.4%)
USES                                    3 (  0.4%)
IS_A                                    3 (  0.4%)
ASSOCIATED_WITH                         3 (  0.4%)
CAN_BE_INDUCED_BY                       3 (  0.4%)
IS_INVOLVED_IN                          3 (  0.4%)
INVOLVED_IN                             3 (  0.4%)
REDUCE                                  3 (  0.4%)
PLAYS_MANY_ROLES_IN_PLANTS              2 (  0.3%)
COMPRISES_APPROXIMATELY                 2 (  0.3%)
APPLIED_IN_EMERG

In [None]:
import pandas as pd

data = [
    {
        'Subject': 'soil_testing',
        'Relationship': 'IS_IMPORTANT_FOR',
        'Object': 'cannabis_growing',
        'Source': 'Soil Testing 101- How To Properly Take A Soil Sample.md'
    },
    {
        'Subject': 'Small_pots',
        'Relationship': 'MAY_CAUSE',
        'Object': 'running_out_of_nutrients',
        'Source': None
    },
    {
        'Subject': 'website_at_kisorganics_com',
        'Relationship': 'OFFERS',
        'Object': 'soil_testing_and_consultations',
        'Source': 'Soil Testing 101- How To Properly Take A Soil Sample.md'
    },
    {
        'Subject': 'Organic_options',
        'Relationship': 'APPLIES_TO',
        'Object': 'nutrient_deficiency_in_plants',
        'Source': None
    },
    {
        'Subject': 'soil_sample',
        'Relationship': 'NEEDS',
        'Object': 'a_plastic_ziploc_bag',
        'Source': 'Soil Testing 101- How To Properly Take A Soil Sample.md'
    }
]

df = pd.DataFrame(data)

print("Knowledge Graph Triplets:")
print(df)

print("\nBasic Statistics:")
print(f"Number of triplets with source: {df['Source'].notna().sum()}")
print(f"Number of triplets without source: {df['Source'].isna().sum()}")