In [1]:
import json
import os

import pandas as pd
from tqdm import tqdm

from generate_graph import get_propositions, generateEdges, createGraph, get_propositions_nosplit
from refine_graph import refine
from query_graph import QueryGraph
tqdm.pandas()

df = pd.read_json('datasets/HotPotQA/hotpot_dev_distractor_v1.json')

## Experiment


### Graph creation and refinement section

To run the evaluation script:

```
python hotpot_evaluate_v1.py <path_to_prediction> <path_to_gold>
```

Example:

```
python hotpot_evaluate_v1.py sample_dev_pred.json hotpot_dev_fullwiki_v1.json
```

In [2]:
context_list = df['context']

In [None]:
df.head(31)

In [None]:
df.iloc[7]['context']

In [None]:
context_list[21:31]

In [None]:
propositions = []

# Walk the selected slice of contexts; every entry with at least two elements
# has the items of its second element joined with spaces and handed to
# get_propositions_nosplit together with the shared `propositions` list.
for paragraphs in tqdm(context_list[21:31]):
    for paragraph in paragraphs:
        if len(paragraph) > 1:
            get_propositions_nosplit(" ".join(paragraph[1]), propositions)

In [None]:
# Load the propositions from disk, one per line, with surrounding whitespace removed
propositions_from_file = []
with open('propositions.txt', 'r') as file:
    for raw_line in file:
        propositions_from_file.append(raw_line.strip())

print(propositions_from_file)

In [None]:
list_of_edges = generateEdges(propositions_from_file)

In [None]:
# Build the graph from the generated edges and report whether createGraph
# returned a truthy value.
graph_created = createGraph(list_of_edges)
print("Success" if graph_created else "Failed")

In [None]:
qg = QueryGraph()

# Run the refinement step for the "person" argument and report whether refine
# returned a truthy value.
refined = refine(qg, "person")
print("yes" if refined else "no")

### Q and A section

In [None]:
# def find_supporting_facts(question, answer, context):
#     supporting_facts = []
#     for passage in context:
#         title, sentences = passage
#         for i, sentence in enumerate(sentences):
#             if answer in sentence or any(word in sentence for word in question.split()):
#                 supporting_facts.append([title, i])
#     return supporting_facts

In [2]:
qg = QueryGraph()
my_dict = {}

# Define a function to apply to each row
def print_qa(row):
    """Answer one dataset row with the graph QA chain and record the outcome.

    Reads ``row['question']``, ``row['_id']`` and ``row['answer']``, asks the
    module-level ``qg`` for requirements and then for an answer, stores the
    prediction in the module-level ``my_dict`` keyed by question id, and
    appends a human-readable record to ``preds.txt``.

    An empty prediction is stored when the chain returns ``None``, when it
    reports fewer than two intermediate steps (no query and/or no context), or
    when the retrieved context is empty.

    Args:
        row: A mapping-like row exposing ``question``, ``_id`` and ``answer``.

    Returns:
        None. Results are communicated through ``my_dict`` and ``preds.txt``.
    """
    question = row['question']
    questionid = row['_id']
    req = qg.get_requirements(question)
    result = qg.answer_question(question, req.content)

    if result is None:
        my_dict.update({questionid: ""})
        return

    model_answer = result['result']
    if "don't know the answer" in model_answer:
        model_answer = ""

    # An empty gold answer is logged as a pair of quotes so the line stays visible.
    real_answer = row['answer'] if len(row['answer']) > 0 else "\"\""

    # Step 0 carries the generated Cypher, step 1 the retrieved context. Either
    # may be missing, so fall back to empty values instead of leaving
    # `cypher_query` unbound or indexing past the end of the list.
    steps = result['intermediate_steps']
    cypher_query = steps[0]['query'] if len(steps) > 0 else ""
    context = steps[1]['context'] if len(steps) > 1 else []

    if len(context) < 1:
        # Nothing was retrieved from the graph, so the answer is unsupported:
        # always record an empty prediction for this question id.
        model_answer = "\"\""
        my_dict.update({questionid: ""})
    else:
        my_dict.update({questionid: model_answer})

    # Explicit UTF-8: questions and answers may contain non-ASCII characters.
    with open("preds.txt", "a", encoding="utf-8") as preds:
        preds.write("question: " + question + "\n")
        preds.write("real_answer: " + real_answer + "\n")
        preds.write("model_answer: " + model_answer + "\n")
        preds.write("cypher_query: " + cypher_query + "\n")
        preds.write("======================" + "\n")
        

In [3]:
# Start from an empty prediction map so a re-run does not carry over stale answers
my_dict = {}
# Apply the function to each row
df[21:31].progress_apply(print_qa, axis=1)

# Overwrite instead of appending: appending a second JSON document on a re-run
# would leave a file that json.load can no longer parse.
# NOTE(review): the stock hotpot_evaluate_v1.py appears to read predictions from
# top-level "answer" and "sp" keys - confirm this flat id -> answer mapping
# matches the evaluator actually used.
with open("dev_fullwiki_pred.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

  0%|          | 0/10 [00:00<?, ?it/s]



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (w)-[r1]-(x)-[r2]-(y)
WHERE (
    toLower(r1.metadata) =~ '.*\\b(a rather blustery day)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(singer)\\b.*'
)
AND (
    toLower(r2.metadata) =~ '.*\\b(hedgehog)\\b.*' OR 
    toLower(r2.description) =~ '.*\\b(voiced)\\b.*'
)
RETURN DISTINCT r1.metadata, r1.description, r2.metadata, r2.description
[0m
Full Context:
[32;1m[1;3m[{'r1.metadata': '{"summary": "Catherine Davis \'Kate\' Higgins is a singer.", "generated_at": "2024-12-23 14:35:08.719204"}', 'r1.description': "Catherine Davis 'Kate' Higgins is identified as a singer.", 'r2.metadata': '{"summary": "Catherine Davis \'Kate\' Higgins has voiced C.C. in \'Code Geass\'.", "generated_at": "2024-12-23 14:35:08.719204"}', 'r2.description': "Catherine Davis 'Kate' Higgins has voiced the character C.C. in the series 'Code Geass'."}, {'r1.metadata': '{"summary": "Catherine Davis \'Kate\' Higgins is a 

 20%|██        | 2/10 [00:05<00:23,  2.97s/it]


[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (w)-[r1]-(x)-[r2]-(y)
WHERE (
    toLower(r1.metadata) =~ '.*\\b(apple remote)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(apple remote)\\b.*'
)
AND (
    toLower(r2.metadata) =~ '.*\\b(control|device|program)\\b.*' OR 
    toLower(r2.description) =~ '.*\\b(control|device|program)\\b.*'
)
RETURN DISTINCT r1.metadata, r1.description, r2.metadata, r2.description
[0m
Full Context:
[32;1m[1;3m[{'r1.metadata': '{"summary": "The Apple Remote was released in or after October 2005.", "generated_at": "2024-12-23 14:35:08.719204"}', 'r1.description': 'The Apple Remote is associated with the event of its release, which occurred in or after October 2005.', 'r2.metadata': '{"summary": "The Apple Remote is a remote control device.", "generated_at": "2024-12-23 14:35:08.719204"}', 'r2.description': 'Apple Remote is a type of remote control device.'}, {'r1.metadata': '{"sum

 30%|███       | 3/10 [00:15<00:41,  5.87s/it]


[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m


 40%|████      | 4/10 [00:20<00:33,  5.59s/it]

Generated Cypher:
[32;1m[1;3mcypher
MATCH ()-[r1]-()
WHERE (
    toLower(r1.metadata) =~ '.*\\b(badly drawn boy)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(badly drawn boy)\\b.*'
)
RETURN DISTINCT r1.metadata AS info1, 
       COUNT(DISTINCT r1.instrument) AS instruments_bdb, 
       COUNT(DISTINCT r1.performer) AS performers_bdb
UNION ALL
MATCH ()-[r2]-()
WHERE (
    toLower(r2.metadata) =~ '.*\\b(wolf alice)\\b.*' OR 
    toLower(r2.description) =~ '.*\\b(wolf alice)\\b.*'
)
RETURN DISTINCT r2.metadata AS info1, 
       COUNT(DISTINCT r2.instrument) AS instruments_wa, 
       COUNT(DISTINCT r2.performer) AS performers_wa
[0m
Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: All sub queries in an UNION must have the same return column names (line 10, column 1 (offset: 307))
"UNION ALL"
 ^}


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (w)-[r1]-(x)-[r2]-(y)
WHERE (
    toLower(r1.met

 50%|█████     | 5/10 [00:27<00:29,  5.96s/it]


[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m


 60%|██████    | 6/10 [00:34<00:24,  6.12s/it]

Generated Cypher:
[32;1m[1;3mcypher
MATCH (w)-[r1]-(x)
WHERE (
    toLower(r1.metadata) =~ '.*\\b(these boots are made for walkin\')\\b.*' OR
    toLower(r1.description) =~ '.*\\b(writer)\\b.*'
)
RETURN DISTINCT r1.metadata AS writer
UNION ALL
MATCH (y)-[r2]-(z)
WHERE (
    toLower(r2.metadata) =~ '.*\\b(died)\\b.*' AND
    toLower(r2.metadata) =~ '.*\\b(2007)\\b.*' OR
    toLower(r2.description) =~ '.*\\b(died)\\b.*' AND
    toLower(r2.description) =~ '.*\\b(2007)\\b.*'
)
RETURN DISTINCT r2.metadata AS deceased
[0m
Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: All sub queries in an UNION must have the same return column names (line 8, column 1 (offset: 205))
"UNION ALL"
 ^}


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (w)-[r1]-(x)-[r2]-(y)
WHERE (
    toLower(r1.metadata) =~ '.*\\b(2011–12 vcu rams men\'s basketball team)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(shaka smart)\

 70%|███████   | 7/10 [00:39<00:17,  5.83s/it]


[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH ()-[r1]-()
WHERE (
    toLower(r1.metadata) =~ '.*\\b(dictyosperma)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(dictyosperma)\\b.*'
)
RETURN DISTINCT r1.metadata AS info1
UNION ALL
MATCH ()-[r2]-()
WHERE (
    toLower(r2.metadata) =~ '.*\\b(huernia)\\b.*' OR 
    toLower(r2.description) =~ '.*\\b(huernia)\\b.*'
)
RETURN DISTINCT r2.metadata AS info1
[0m
Full Context:
[32;1m[1;3m[{'info1': '{"summary": "Dictyosperma is a monotypic genus of flowering plant.", "generated_at": "2024-12-23 14:35:08.719204"}'}, {'info1': '{"summary": "Dictyosperma belongs to the palm family.", "generated_at": "2024-12-23 14:35:08.719204"}'}, {'info1': '{"summary": "Dictyosperma is found in the Mascarene Islands in the Indian Ocean.", "generated_at": "2024-12-23 14:35:08.719204"}'}, {'info1': '{"summary": "The sole species of Dictyosperma is Dictyosperma album.", "generated_at": "2

 80%|████████  | 8/10 [00:45<00:11,  5.97s/it]


[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (w)-[r1]-(x)-[r2]-(y)
WHERE (
    toLower(r1.metadata) =~ '.*\\b(kaiser ventures corporation)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(kaiser ventures corporation)\\b.*'
)
AND (
    toLower(r2.metadata) =~ '.*\\b(american industrialist|father of modern american shipbuilding)\\b.*' OR 
    toLower(r2.description) =~ '.*\\b(american industrialist|father of modern american shipbuilding)\\b.*'
)
RETURN DISTINCT r1.metadata, r1.description, r2.metadata, r2.description
[0m
Full Context:
[32;1m[1;3m[][0m


 90%|█████████ | 9/10 [00:51<00:05,  5.95s/it]


[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (w)-[r1]-(x)-[r2]-(y)
WHERE (
    toLower(r1.metadata) =~ '.*\\b(tunnels and trolls)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(tunnels and trolls)\\b.*'
)
AND (
    toLower(r2.metadata) =~ '.*\\b(ken st. andre)\\b.*' OR 
    toLower(r2.description) =~ '.*\\b(ken st. andre)\\b.*'
)
RETURN DISTINCT r1.metadata, r1.description, r2.metadata, r2.description
[0m
Full Context:
[32;1m[1;3m[][0m


100%|██████████| 10/10 [00:56<00:00,  5.69s/it]


[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (w)-[r1]-(x)-[r2]-(y)
WHERE (
    toLower(r1.metadata) =~ '.*\\b(poison)\\b.*' OR
    toLower(r1.description) =~ '.*\\b(poison)\\b.*'
)
AND (
    toLower(r2.metadata) =~ '.*\\b(shut up, make love)\\b.*' OR 
    toLower(r2.description) =~ '.*\\b(shut up, make love)\\b.*'
)
RETURN DISTINCT r1.metadata, r1.description, r2.metadata, r2.description[0m
Full Context:
[32;1m[1;3m[{'r1.metadata': '{"summary": "\\"Shut Up, Make Love\\" is a song by the American rock band Poison.", "generated_at": "2024-12-23 14:35:08.719204"}', 'r1.description': "The song 'Shut Up, Make Love' is created by the American rock band Poison.", 'r2.metadata': '{"summary": "The first b-side included in the single/sampler is the first single from the album Shut Up, Make Love.", "generated_at": "2024-12-23 14:35:08.719204"}', 'r2.description': 'The album is titled Shut Up, Make Love.'}, {'r1.metadata': 

100%|██████████| 10/10 [01:04<00:00,  6.41s/it]


[1m> Finished chain.[0m





In [None]:
context_list[18]

In [None]:
# Spot-check a single row: show the question and gold answer, then run the
# requirements + answer pipeline on it and display the raw result.
x = df.iloc[31]
question, answer = x['question'], x['answer']
print("question:", question)
print("answer:", answer)

qg = QueryGraph()
req = qg.get_requirements(question)
res = qg.answer_question(question, req.content)
res

## Graph refinement 

In [None]:
qg = QueryGraph()

QUERY = """
MATCH (N) RETURN N
"""
res = qg._graph.query(QUERY)
res

In [None]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings

In [None]:
# Neo4j connection settings. Environment variables take precedence so the
# notebook can target another instance without editing (and committing) secrets.
neo4j_url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
# FIXME: the fallback keeps the previous local-development password so existing
# setups keep working; remove it once NEO4J_PASSWORD is set everywhere.
neo4j_password = os.environ.get("NEO4J_PASSWORD", "12345678")

Create embeddings

In [None]:
# # Now we initialize from existing graph
# existing_graph = Neo4jVector.from_existing_graph(
#     embedding=OpenAIEmbeddings(model='text-embedding-3-small'),
#     url=neo4j_url,
#     username=neo4j_user,
#     password=neo4j_password,
#     index_name="person_index",
#     node_label="Person",
#     text_node_properties=["name"],
#     embedding_node_property="embedding",
# )

Refactor node labels

In [None]:
# # cq = """
# # MATCH (en:Entity)
# # WHERE en.label = 'Person'
# # WITH collect(en) AS persons
# # CALL apoc.refactor.rename.label("Entity", "Person", persons)
# # YIELD batches, total, timeTaken, committedOperations
# # RETURN batches, total, timeTaken, committedOperations;
# # """
# # res = qg._graph.query(cq)

Create projection

In [None]:
# Project every :Person node into a GDS graph named 'personproj'. The target
# argument is null, so no relationships are projected; each node carries its
# `embedding` property as a source node property.
# NOTE(review): presumably this fails if a projection named 'personproj'
# already exists from an earlier run - confirm and drop it first if needed.
cq = """
MATCH (p:Person)
RETURN gds.graph.project(
  'personproj',
  p,
  null,
  {
    sourceNodeProperties: p { .embedding },
    targetNodeProperties: {}
  }
)
"""

res = qg._graph.query(cq)

Use KNN to measure node similarities

In [None]:
# Stream each node's single nearest neighbour by `embedding` from the
# 'personproj' projection (the only projection this notebook creates; the
# previous name 'projection2' is not defined anywhere in the notebook) and keep
# the pairs whose similarity exceeds 0.90.
cq = """
CALL gds.knn.stream('personproj', {
topK: 1,
nodeProperties: ['embedding'],
// The following parameters are set to produce a deterministic result
randomSeed: 1337,
concurrency: 1,
sampleRate: 1.0,
deltaThreshold: 0.0
})
YIELD node1, node2, similarity
WHERE similarity > .90
RETURN gds.util.asNode(node1).name AS Person1, gds.util.asNode(node2).name AS Person2, similarity
ORDER BY similarity DESCENDING, Person1, Person2
"""

res = qg._graph.query(cq)
res

In [None]:
# Print every field of every returned record as "name: value"
for record in res:
    for field, field_value in record.items():
        print(f"{field}: {field_value}")

In [None]:
def are_dictionaries_equivalent(d1, d2):
    """Tell whether two similarity records describe the same pair, mirrored.

    Two records are equivalent when they have the same keys, the same
    ``similarity`` value, and ``Person1``/``Person2`` of one equal
    ``Person2``/``Person1`` of the other.

    Args:
        d1: First record (dict with ``Person1``, ``Person2``, ``similarity``).
        d2: Second record with the same shape.

    Returns:
        True if the records are mirrored copies of each other, else False.
    """
    # Differing key sets can never be equivalent
    if d1.keys() != d2.keys():
        return False

    persons_swapped = d1['Person1'] == d2['Person2'] and d1['Person2'] == d2['Person1']
    return bool(persons_swapped and d1['similarity'] == d2['similarity'])


In [None]:
newlist = res

In [None]:
# Drop mirrored pairs: (A, B, s) and (B, A, s) describe the same match, so only
# the first one encountered is kept. A fresh list is built instead of popping
# from a list while iterating over it, which silently skips elements (and
# previously also mutated `res`, since `newlist` was only an alias of it).
# Exact duplicates are deliberately left in place here.
newlist = []
for candidate in res:
    is_mirror_of_kept = any(
        candidate != kept and are_dictionaries_equivalent(candidate, kept)
        for kept in newlist
    )
    if not is_mirror_of_kept:
        newlist.append(candidate)


In [None]:
# Keep the first occurrence of each record, preserving order. Records are
# dicts (unhashable), so membership is checked against the earlier entries
# rather than with a set.
unique = [
    entry
    for position, entry in enumerate(newlist)
    if entry not in newlist[:position]
]

unique

In [None]:
node_name = "Shirley Temple"

# The name is passed as a query parameter rather than interpolated into the
# Cypher text, so names containing quotes cannot break or alter the query.
# NOTE(review): assumes qg._graph exposes LangChain's Neo4jGraph.query(query, params) - confirm.
query = """
MATCH (n:Person {name: $name})
RETURN n
"""
print(query)

res = qg._graph.query(query, params={"name": node_name})
res


Merge similar nodes (Entity resolution)

In [None]:
# Merge each pair of similar Person nodes into one node with APOC.
# The names are passed as query parameters: several names in this graph contain
# apostrophes (e.g. "Catherine Davis 'Kate' Higgins"), which would terminate
# the string literal early if they were interpolated into the Cypher text.
# NOTE(review): assumes qg._graph exposes LangChain's Neo4jGraph.query(query, params) - confirm.
MERGE_QUERY = """
    MATCH (a1:Person {name: $p1}), (a2:Person {name: $p2})
    WITH head(collect([a1,a2])) as nodes
    CALL apoc.refactor.mergeNodes(nodes,{
    properties:"discard",
    mergeRels:true
    })
    YIELD node
    RETURN node;
    """

for val in unique:
    print(val)
    res = qg._graph.query(
        MERGE_QUERY,
        params={"p1": val['Person1'], "p2": val['Person2']},
    )