In [1]:
import json
from neo4j import GraphDatabase
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain.llms import Ollama
import os
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI
from neo4j import GraphDatabase 
import re
import ast
import openai
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai import OpenAI
from typing import List


In [2]:
from neo4j import GraphDatabase
from langchain_openai import ChatOpenAI

In [3]:
uri = os.environ["NEO4J_URI"]
user=os.environ["NEO4J_USERNAME"]
password = os.environ["NEO4J_PASSWORD"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] 

driver = GraphDatabase.driver(uri, auth=(user, password))
llm = ChatOpenAI(temperature=0, model_name="gpt-4o", api_key=OPENAI_API_KEY)

In [4]:
from langchain.embeddings import OpenAIEmbeddings
embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)  # or your desired model


  embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)  # or your desired model


In [5]:
# Utility: Convert string to list
def str_to_list(answer):
    match = re.search(r"\[.*\]", answer)
    try:
        if match:
            match2 = match.group(0)
            pattern = r"(?<=\[)(.*?)(?=\])"
            ans = re.sub(pattern, "", match2)
            answer_list = ast.literal_eval(ans)
        else:
            answer_list = []
    except:
        answer_list = []
    return answer_list

In [6]:
client = openai.OpenAI(api_key=OPENAI_API_KEY)

def ask_openai(question, model="gpt-4o"):
    """Sends a question to OpenAI's API and returns the response."""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": question}]
    )
    return response.choices[0].message.content

In [7]:
# Function to get all nodes with embeddings
def fetch_all_embedded_nodes():
    with driver.session() as session:
        results = session.run("MATCH (n) WHERE n.embedding IS NOT NULL RETURN n")
        return [record["n"] for record in results]


In [8]:
# Function to search similar nodes using cosine similarity
def search_similar_nodes(query: str, top_k: int = 2):
    query_embedding = np.array(embedder.embed_query(query))
    nodes = fetch_all_embedded_nodes()

    scored = []
    for node in nodes:
        node_embedding = np.array(node["embedding"])
        score = cosine_similarity([query_embedding], [node_embedding])[0][0]
        scored.append((node, score))

    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:top_k]

def build_context_from_nodes(similar_nodes):
    stripped_context = []
    for node, _ in similar_nodes:  # Unpack tuple (node, score)
        # Copy properties and remove 'embedding'
        node_props = dict(node._properties)  # or node['properties'] depending on how you access it
        node_props.pop('embedding', None)  # Remove embedding if it exists
        stripped_context.append({
            "labels": list(node.labels),
            "properties": node_props
        })
    return f"Use the following context:\n{stripped_context}"


In [9]:
# Ask LLM
def ask_llm(context: str, query: str):
    system_prompt = "Answer the question using only the provided context. If not found, say 'Not in knowledge base.'"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{query}"}
    ]
    client = OpenAI(api_key=OPENAI_API_KEY)
    completion = client.chat.completions.create(model="gpt-4", messages=messages)
    return completion.choices[0].message.content

In [10]:
def fetch_related_nodes_and_relationships(driver, similar_nodes):
    related_data = []

    with driver.session() as session:
        for node, _ in similar_nodes:
            node_id = node.get("id") or node["id"]  # Get the 'id' property

            # Cypher query to fetch all connected nodes + relationships
            cypher = """
            MATCH (n {id: $node_id})-[r]-(m)
            RETURN n, r, m
            """
            results = session.run(cypher, node_id=node_id)

            for record in results:
                n = record["n"]
                r = record["r"]
                m = record["m"]

                related_data.append({
                    "node": n,
                    "relationship": r,
                    "related_node": m
                })

    return related_data


In [11]:
def format_context(similar_nodes, related_info):
    context_lines = []

    for node, _ in similar_nodes:
        node_props = dict(node)
        node_props.pop("embedding", None)

        # 🔹 Include main node header
        context_lines.append(f"Main Node: {node_props.get('name')} ({node_props.get('id')})")

        # 🔹 Include all properties (pretty format)
        props_str = "\n".join([f"  - {k}: {v}" for k, v in node_props.items()])
        context_lines.append(f"Properties:\n{props_str}")

    for entry in related_info:
        node_a = dict(entry["node"])
        node_b = dict(entry["related_node"])
        rel = entry["relationship"]

        node_a.pop("embedding", None)
        node_b.pop("embedding", None)

        context_lines.append(f"{node_a.get('name')} --[{rel.type}]--> {node_b.get('name')}")

    return "\n".join(context_lines)



In [12]:
def run_pipeline(user_query: str):
    print("🔎 Running similarity search...")
    similar_nodes = search_similar_nodes(user_query)
    print("✅ Top nodes retrieved.")
    print("!!!!!!!")
    print(similar_nodes)
    
    
    print("🧾 Fetching related nodes + relationships...")
    related_info = fetch_related_nodes_and_relationships(driver, similar_nodes)
    print("✅ Related nodes retrieved.")
    print("@@@@@")
    print(related_info)

    print("🧾 Formatting context...")
    context = format_context(similar_nodes, related_info)
    print("#####")
    print(context)

    print("🤖 Asking LLM...")
    final_answer = ask_llm(context, user_query)
    return final_answer


In [13]:
query = "recorded can't help falling in love with you"
answer = run_pipeline(query)
print("\n💡 Final Answer:\n", answer)


🔎 Running similarity search...
✅ Top nodes retrieved.
!!!!!!!
[(<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:5343' labels=frozenset({'song'}) properties={'documentId': 'merged_doc11', 'id': 'cant_help_falling_in_love', 'embedding': [-0.02463804767587276, 0.0024472305796159684, 0.008036793609690618, -0.02283856494260078, -0.009024479761234263, 0.020443762232534655, -0.01155458128070677, -0.028872923879619795, -0.02642400201832012, 0.0020278020352821574, -0.009112424546141919, -0.004914756073775145, -0.021715578119005728, -0.007820314210788892, -0.008828295742537002, 0.01593828654732983, 0.03374370709527986, 0.0034974934549648284, 0.016709494231294263, -0.02742521842333341, -0.010614250084984364, -0.010221881116267328, 0.023555651612686638, -0.020064922585964748, -0.013976443566389671, -0.010485715470990292, 0.020037863941670488, -0.04332291793554437, -0.006288046532962262, 0.007096461482645699, 0.030929477703524947, 0.01254903349030775, -0.03125419633621628, -0.0525774064178

In [14]:
query = "how many episodes are in chicago fire season 4"
answer = run_pipeline(query)
print("\n💡 Final Answer:\n", answer)

🔎 Running similarity search...
✅ Top nodes retrieved.
!!!!!!!
[(<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:5197' labels=frozenset({'television_series'}) properties={'premiereDate': '2015-10-13', 'endDate': '2016-05-17', 'episodeCount': 23, 'name': 'chicago fire season 4', 'documentId': 'merged_doc1', 'id': 'chicago_fire_season_4', 'embedding': [0.0037711296162565096, -0.02115946553405582, -0.011402524564770098, -0.043991940249707887, -0.018389399380373265, 0.016442125024432902, -0.02986734616959609, -0.01553705367438748, -0.030251315551032772, -0.003808840961313508, 0.023847251204112598, 0.010106627436760024, -0.0006470915726466993, 0.0041002461956550044, 0.008632458411087545, -0.004888755079434813, 0.026973861153119956, 0.0012796128370101967, 0.009249552725055456, -0.028413747264830057, 0.0028592018696831194, 0.025698534146016326, 0.0061983654661620915, -0.006256646699294898, 0.0054509963585339, 0.006565193856278854, 0.01501595263710128, -0.025931659078547557, 0.00662004

In [15]:
query = "What was XYZ Comapny involved in?"
answer = run_pipeline(query)
print("\n💡 Final Answer:\n", answer)

🔎 Running similarity search...
✅ Top nodes retrieved.
!!!!!!!
[(<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:5298' labels=frozenset({'organization'}) properties={'name': 'XYZ Company', 'documentId': 'merged_doc7', 'id': 'xyz_company', 'embedding': [0.001953305289429373, -0.01965059605248306, 0.0022834656746590545, -0.022914172139884684, -0.03979902874459173, 0.0332165597319232, -0.017534804303913402, -0.022181251046525647, 0.021296213305650187, -3.2259112163640005e-06, 0.0031840604612768356, 0.0024978107379754143, 0.028238225000635924, -0.017507145884980767, -0.00947958082810018, -0.0005514198204485465, 0.007042270436818044, -0.01895916165620036, 0.01501797945065974, 0.010502905075703135, -0.004615331719804994, 0.014907349500219654, -0.0019671340332343837, -0.01324790396890882, 0.014630775555442052, 0.0058772016816290564, 0.011457086069942342, -0.0026602981284383105, 0.01576472882216248, 0.01724440114966948, 0.012259151347987739, 0.013282475595590693, -0.013489906287004548,

In [16]:
query="What was XYZ Comapny involved in?"
# Example Usage
try:

    # Query the LLM with the extracted context
    matched_results, top_doc_id = create_keywords(query)
    print(f"*********")
    print(matched_results)
    print(top_doc_id)
    answer = query_llm_with_context(llm, query, matched_results)
    print(f"*********\n Answer: {answer}")
finally:
    print("Done!!")

Done!!


NameError: name 'create_keywords' is not defined

In [None]:
query="recorded can't help falling in love with you"
# Example Usage
try:

    # Query the LLM with the extracted context
    matched_results, top_doc_id = create_keywords(query)
    print(f"*********")
    answer = query_llm_with_context(llm, query, matched_results)
    print(f"*********\n Answer: {answer}")
finally:
    print("Done!!")

Extracted Keywords: ['record', 'help', 'fall', 'love']
Top 10 Matches for Keywords: ['record', 'help', 'fall', 'love']
Rank 1:
Match Percentage: 75.00%
Matched Keywords: ['help', 'fall', 'love']
Node A (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2559)
 - Properties: ['id', 'name', 'documentId']
 - Values: ['royal_philharmonic_orchestra', 'royal philharmonic orchestra', 'merged_doc11']
Relationship (PERFORMED_WITH)
 - Properties: ['year', 'documentId', 'context']
 - Values: ['2015', 'merged_doc11', 'If I Can Dream album']
Node B (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:3745)
 - Properties: ['id', 'name', 'documentId']
 - Values: ['cant_help_falling_in_love', "can't help falling in love", 'merged_doc11']
----------------------------------------
Rank 2:
Match Percentage: 75.00%
Matched Keywords: ['help', 'fall', 'love']
Node A (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:3745)
 - Properties: ['id', 'name', 'documentId']
 - Values: ['cant_help_falling_in_love', "can't help falling in lov

In [None]:
query="what was the name of atom bomb dropped by usa on hiroshima"
# Example Usage
try:

    # Query the LLM with the extracted context
    matched_results, top_doc_id = create_keywords(query)
    print(f"*********")
    answer = query_llm_with_context(llm, query, matched_results)
    print(f"*********\n Answer: {answer}")
finally:
    print("Done!!")

Extracted Keywords: ['atom', 'bomb', 'usa', 'drop', 'hiroshima']
Top 10 Matches for Keywords: ['atom', 'bomb', 'usa', 'drop', 'hiroshima']
Rank 1:
Match Percentage: 60.00%
Matched Keywords: ['atom', 'bomb', 'hiroshima']
Node A (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2705)
 - Properties: ['id', 'name']
 - Values: ['atomic_bombs', 'atomic bombs']
Relationship (USED_IN)
 - Properties: []
 - Values: []
Node B (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2703)
 - Properties: ['id', 'name', 'date']
 - Values: ['hiroshima_bombing', 'hiroshima bombing', '1945-08-06']
----------------------------------------
Rank 2:
Match Percentage: 60.00%
Matched Keywords: ['atom', 'bomb', 'hiroshima']
Node A (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2703)
 - Properties: ['id', 'name', 'date']
 - Values: ['hiroshima_bombing', 'hiroshima bombing', '1945-08-06']
Relationship (USED_IN)
 - Properties: []
 - Values: []
Node B (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2705)
 - Properties: ['id', 'name']
 - V

In [None]:
query="how many episodes are in chicago fire season 4"
# Example Usage
try:

    # Query the LLM with the extracted context
    matched_results, top_doc_id = create_keywords(query)
    print(f"*********")
    print(matched_results)
    print(f"!!!!!")
    answer = query_llm_with_context(llm, query, matched_results)
    print(f"*********\n Answer: {answer}")
finally:
    print("Done!!")

Extracted Keywords: ['episode', 'chicago fire', 'season 4']
Top 10 Matches for Keywords: ['episode', 'chicago fire', 'season 4']
Rank 1:
Match Percentage: 66.67%
Matched Keywords: ['chicago fire', 'season 4']
Node A (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2524)
 - Properties: ['id', 'name', 'premiereDate', 'endDate', 'episodeCount']
 - Values: ['chicago_fire_season_4', 'chicago fire season 4', '2015-10-13', '2016-05-17', 23]
Relationship (FEATURES)
 - Properties: ['role']
 - Values: ['lieutenant']
Node B (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2531)
 - Properties: ['id', 'name', 'role']
 - Values: ['kelly_severide', 'kelly severide', 'lieutenant']
----------------------------------------
Rank 2:
Match Percentage: 66.67%
Matched Keywords: ['chicago fire', 'season 4']
Node A (ID: 4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2531)
 - Properties: ['id', 'name', 'role']
 - Values: ['kelly_severide', 'kelly severide', 'lieutenant']
Relationship (FEATURES)
 - Properties: ['role']
 - Values:

In [None]:
def test_neo4j_connection(driver, database="neo4j"):
    with driver.session(database=database) as session:
        result = session.run("MATCH (n) RETURN n LIMIT 5")
        for record in result:
            print(record)

test_neo4j_connection(driver)

<Record n=<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2515' labels=frozenset({'concept'}) properties={'name': 'minority interest', 'id': 'minority_interest'}>>
<Record n=<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2516' labels=frozenset({'organization'}) properties={'name': 'parent corporation', 'id': 'parent_corporation'}>>
<Record n=<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2517' labels=frozenset({'organization'}) properties={'name': 'subsidiary corporation', 'id': 'subsidiary_corporation'}>>
<Record n=<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2518' labels=frozenset({'person'}) properties={'role': 'investor', 'id': 'investor'}>>
<Record n=<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2519' labels=frozenset({'concept'}) properties={'name': 'associate company', 'id': 'associate_company'}>>


In [None]:
def get_graph_overview(driver, database="neo4j"):
    with driver.session(database=database) as session:
        node_labels = session.run("CALL db.labels()").value()
        rel_types = session.run("CALL db.relationshipTypes()").value()
        print(f"Node Labels: {node_labels}")
        print(f"Relationship Types: {rel_types}")

get_graph_overview(driver)


Node Labels: ['song', 'person', 'television_series', 'event', 'location', 'organization', 'agreement', 'law', 'country', 'legislation', 'document', 'character', 'work', 'album', 'concept', 'brand', 'organism', 'campaign', 'infrastructure', 'publication', 'war', 'process', 'product', 'case', 'title', 'guideline', 'other', 'appliance', 'amendment']
Relationship Types: ['RECORDED_BY', 'WRITTEN_BY', 'COMPOSED_BY', 'FEATURED_IN', 'PRODUCED_BY', 'PRODUCED', 'PART_OF', 'DIRECTED_BY', 'RELATED_TO', 'HAD_AFFAIR_WITH', 'MOTHER_OF', 'VISITED', 'RULED_BY', 'CAPITAL_OF', 'DIRECTED', 'RELEASED_BY', 'PERFORMED_BY', 'ACTED_IN', 'LEADS', 'MENTIONED_IN', 'ILLUSTRATED_BY', 'COMPOSED', 'OWNS', 'BROADCASTED_BY', 'INCLUDED_IN', 'MEMBER_OF', 'BORN_IN', 'PARENT_OF', 'WORKED_AT', 'CRITICIZED', 'TARGETED', 'INTRODUCED', 'WROTE', 'LOCATED_IN', 'INCLUDES', 'GIVEN_TO', 'ADMINISTERS', 'CONTRASTS_WITH', 'USES', 'CREATED_BY', 'BASED_ON', 'UTILIZES', 'CONTRIBUTED_TO', 'PROPOSED', 'CROSSES', 'PARTNERED_WITH', 'NAMED_AF

In [None]:
def debug_match_keywords(driver, keywords, database="neo4j"):
    query = """
    OPTIONAL MATCH (n)-[r]-(m)
    WHERE ANY(keyword IN $keywords WHERE 
        ANY(prop IN keys(n) WHERE 
            n[prop] IS NOT NULL AND toLower(toString(n[prop])) CONTAINS toLower(keyword)
        )
    )
    RETURN n, r, m
    LIMIT 10
    """
    with driver.session(database=database) as session:
        result = session.run(query, keywords=keywords)
        for record in result:
            print("Node A:", record["n"])
            print("Relationship:", record["r"])
            print("Node B:", record["m"])
            print("-" * 30)

debug_match_keywords(driver, ["Chicago Fire", "season", "episode"])


Node A: <Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2524' labels=frozenset({'television_series'}) properties={'premiereDate': '2015-10-13', 'endDate': '2016-05-17', 'episodeCount': 23, 'name': 'chicago fire season 4', 'id': 'chicago_fire_season_4'}>
Relationship: <Relationship element_id='5:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:1153305234164943324' nodes=(<Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2524' labels=frozenset({'television_series'}) properties={'premiereDate': '2015-10-13', 'endDate': '2016-05-17', 'episodeCount': 23, 'name': 'chicago fire season 4', 'id': 'chicago_fire_season_4'}>, <Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2531' labels=frozenset({'person'}) properties={'role': 'lieutenant', 'name': 'kelly severide', 'id': 'kelly_severide'}>) type='FEATURES' properties={'role': 'lieutenant'}>
Node B: <Node element_id='4:d0b1c407-34c5-4596-88bc-641d5b2c8ec2:2531' labels=frozenset({'person'}) properties={'role': 'lieutenant', 'name': 'ke