In [1]:
import pandas as pd
import json
from generate_graph import get_propositions, generateEdges, createGraph, get_propositions_nosplit
from refine_graph import refine
from query_graph import QueryGraph
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` (used further down) is available.
tqdm.pandas()

# Load the HotPotQA dev (distractor) file into a DataFrame.
df = pd.read_json('datasets/HotPotQA/hotpot_dev_distractor_v1.json')

## Experiment


### Graph creation and refinement section

To run the evaluation script, pass it the prediction file and the gold file:

```
python hotpot_evaluate_v1.py <path_to_prediction> <path_to_gold>
```

Example:

```
python evalscripts/HotPotQA/hotpot_evaluate_v1.py evalscripts/HotPotQA/sample_dev_pred.json evalscripts/HotPotQA/hotpot_dev_distractor_v1.json
```

In [2]:
# Keep the 'context' column on its own; the cells below index and slice it.
context_list = df['context']

In [None]:
# Preview the first 31 rows of the dataset.
df.head(31)

In [None]:
# Inspect the context entry of the row at position 7.
df.iloc[7]['context']

In [3]:
# Show the slice of contexts that the proposition-extraction loop below iterates over.
context_list[21:31]

21    [[Hip Hip Pooh-Ray!, ["Hip Hip Pooh-Ray!",  is...
22    [[Universal remote, [A universal remote is a r...
23    [[Wolf Alice, [Wolf Alice are a four-piece alt...
24    [[Gus Williams (musician), [Kasper Gus Ntjalka...
25    [[Nancy Sinatra, [Nancy Sandra Sinatra (born J...
26    [[2012–13 VCU Rams men's basketball team, [The...
27    [[Dictyosperma, [Dictyosperma is a monotypic g...
28    [[Edgar Kaiser Jr., [Edgar Fosburgh Kaiser Jr....
29    [[St. Andre (Goa Assembly constituency), [St. ...
30    [[All I Wanna Do Is Make Love to You, ["All I ...
Name: context, dtype: object

In [None]:
propositions = []

# Every dataset row holds a list of passages; the second item of a passage is
# its list of sentences, which is joined into one text before extraction.
# `propositions` is handed to the helper on every call (presumably as the
# accumulator for its results - confirm in generate_graph).
for passages in tqdm(context_list[21:31]):
    for passage in passages:
        # Passages without a sentence list are skipped.
        if len(passage) > 1:
            get_propositions_nosplit(" ".join(passage[1]), propositions)

In [None]:
# Load the saved propositions: one per line, surrounding whitespace trimmed.
with open('propositions.txt', 'r') as prop_file:
    propositions_from_file = list(map(str.strip, prop_file))

print(propositions_from_file)

In [None]:
# Turn the propositions read from disk into the edge list consumed by createGraph below.
list_of_edges = generateEdges(propositions_from_file)

In [None]:
# Build the graph from the edges and report whether the helper signalled success.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
qg = QueryGraph()

# Run the refinement step for "person" and report its truthy/falsy outcome.
print("yes" if refine(qg, "person") else "no")

### Q and A section

In [None]:
# def find_supporting_facts(question, answer, context):
#     supporting_facts = []
#     for passage in context:
#         title, sentences = passage
#         for i, sentence in enumerate(sentences):
#             if answer in sentence or any(word in sentence for word in question.split()):
#                 supporting_facts.append([title, i])
#     return supporting_facts

In [2]:
# Graph query helper used by print_qa below.
qg = QueryGraph()
# Maps question id -> predicted answer; filled in by print_qa.
my_dict = {}

# Define a function to apply to each row
def print_qa(row):
    """Answer one dataset row with the query graph and record the outcome.

    The prediction is stored in the module-level ``my_dict`` under the row's
    ``_id``. Whenever the graph call returned a result, a human-readable
    record (question, gold answer, model answer, Cypher query) is also
    appended to ``preds.txt``.

    Args:
        row: Mapping-like row providing ``question``, ``_id`` and ``answer``.

    Returns:
        None. The function is used for its side effects only.
    """
    question = row['question']
    questionid = row['_id']
    req = qg.get_requirements(question)
    result = qg.answer_question(question, req.content)

    if result is None:
        # No result at all: record an empty prediction and skip the log entry.
        my_dict.update({questionid: ""})
        return

    model_answer = result['result']
    if "don't know the answer" in model_answer:
        model_answer = ""

    real_answer = row['answer'] if len(row['answer']) > 0 else "\"\""

    # Missing intermediate steps are treated like an empty context. Reading
    # them unconditionally raised (unbound `cypher_query`, or an IndexError
    # on the second step) and left the question without a prediction.
    steps = result['intermediate_steps']
    cypher_query = steps[0]['query'] if len(steps) > 0 else ""
    context = steps[1]['context'] if len(steps) > 1 else []

    if len(context) < 1:
        #     result = refine_query(cypher_query[6:], question)
        #     model_answer = result['result']
        # Nothing was retrieved from the graph, so the answer is not trusted.
        model_answer = "\"\""
        my_dict.update({questionid: ""})
    else:
        my_dict.update({questionid: model_answer})

    # The context manager closes the file; UTF-8 keeps non-ASCII questions
    # writable regardless of the platform default encoding.
    with open("preds.txt", "a", encoding="utf-8") as preds:
        preds.write("question: " + question + "\n")
        preds.write("real_answer: " + real_answer + "\n")
        preds.write("model_answer: " + model_answer + "\n")
        preds.write("cypher_query: " + cypher_query + "\n")
        preds.write("======================" + "\n")
        

In [None]:
# Start from an empty prediction map so a re-run does not carry over stale entries.
my_dict = {}
# Apply the function to each row
df[0:31].progress_apply(print_qa, axis=1)

# Overwrite instead of appending: a second JSON document appended to the same
# file would make it impossible to parse.
with open("dev_fullwiki_pred.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Inspect the context entry labelled 18.
context_list[18]

In [None]:
# Try the pipeline on a single row (position 31) and show the raw result.
sample_row = df.iloc[31]
question, answer = sample_row['question'], sample_row['answer']
print("question:", question)
print("answer:", answer)

qg = QueryGraph()
req = qg.get_requirements(question)
res = qg.answer_question(question, req.content)
res

## Graph refinement 

In [4]:
qg = QueryGraph()

# Fetching every node of the graph is slow and floods the cell output, so the
# sanity check only pulls a bounded sample.
QUERY = """
MATCH (N) RETURN N LIMIT 25
"""
res = qg._graph.query(QUERY)
res

KeyboardInterrupt: 

In [None]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings

In [None]:
import os

# Connection settings come from the environment when it provides them; the
# fallbacks are the local development values this notebook has always used.
# Do not put real credentials in the notebook - export the variables instead.
neo4j_url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "12345678")

Create embeddings

In [None]:
# # Now we initialize from existing graph
# existing_graph = Neo4jVector.from_existing_graph(
#     embedding=OpenAIEmbeddings(model='text-embedding-3-small'),
#     url=neo4j_url,
#     username=neo4j_user,
#     password=neo4j_password,
#     index_name="person_index",
#     node_label="Person",
#     text_node_properties=["name"],
#     embedding_node_property="embedding",
# )

Refactor node labels

In [None]:
# # cq = """
# # MATCH (en:Entity)
# # WHERE en.label = 'Person'
# # WITH collect(en) AS persons
# # CALL apoc.refactor.rename.label("Entity", "Person", persons)
# # YIELD batches, total, timeTaken, committedOperations
# # RETURN batches, total, timeTaken, committedOperations;
# # """
# # res = qg._graph.query(cq)

Create projection

In [None]:
# Cypher that projects the :Person nodes into a GDS in-memory graph named
# 'personproj', carrying each node's `embedding` property along.
# NOTE(review): the target node is null, so presumably no relationships are
# projected - confirm against the GDS documentation.
cq = """
MATCH (p:Person)
RETURN gds.graph.project(
  'personproj',
  p,
  null,
  {
    sourceNodeProperties: p { .embedding },
    targetNodeProperties: {}
  }
)
"""

res = qg._graph.query(cq)

Use KNN to measure node similarities

In [None]:
# Stream, for every person, its single nearest neighbour by `embedding` and
# keep only pairs whose similarity exceeds 0.90.
# The graph name must match the projection created by this notebook
# ('personproj'); the previous name 'projection2' is not created anywhere here.
cq = """
CALL gds.knn.stream('personproj', {
topK: 1,
nodeProperties: ['embedding'],
// The following parameters are set to produce a deterministic result
randomSeed: 1337,
concurrency: 1,
sampleRate: 1.0,
deltaThreshold: 0.0
})
YIELD node1, node2, similarity
WHERE similarity > .90
RETURN gds.util.asNode(node1).name AS Person1, gds.util.asNode(node2).name AS Person2, similarity
ORDER BY similarity DESCENDING, Person1, Person2
"""

res = qg._graph.query(cq)
res

In [None]:
# Print every field of every returned row on its own line.
for row in res:
    for field, field_value in row.items():
        print(f"{field}: {field_value}")

In [None]:
def are_dictionaries_equivalent(d1, d2):
    """Tell whether two similarity rows are mirror images of each other.

    Two rows are mirrored when they have the same keys, the same
    ``similarity`` and their ``Person1`` / ``Person2`` values are swapped.
    Rows that list the two persons in the same order are not considered
    equivalent by this check.

    Args:
        d1: First row, a dict with ``Person1``, ``Person2`` and ``similarity``.
        d2: Second row with the same layout.

    Returns:
        True if the rows are mirrored, False otherwise.
    """
    # Differing key sets can never describe a mirrored pair.
    if d1.keys() != d2.keys():
        return False

    return bool(
        d1['Person1'] == d2['Person2']
        and d1['Person2'] == d2['Person1']
        and d1['similarity'] == d2['similarity']
    )


In [None]:
# Note: no copy is made here - `newlist` is a second name for the same list object as `res`.
newlist = res

In [None]:
# Keep the first row of every mirrored pair (A, B) / (B, A).
# The result is built in a fresh list: removing items from a list while it is
# being iterated shifts the remaining items and makes the loop skip rows.
deduplicated = []
for val in res:
    if not any(are_dictionaries_equivalent(val, kept) for kept in deduplicated):
        deduplicated.append(val)
newlist = deduplicated


In [None]:
# Collect the rows of `newlist` in order, keeping only the first occurrence of each.
unique = []

for candidate in newlist:
    if candidate in unique:
        continue
    unique.append(candidate)

unique

In [None]:
# Look up a single :Person node by name to inspect it.
node_name = "Shirley Temple"

# The name is pasted straight into the Cypher text, so it must not contain a
# single quote.
query = f"""
MATCH (n:Person {{name: '{node_name}'}})
RETURN n
"""
print(query)

res = qg._graph.query(query)
res


Merge similar nodes (Entity resolution)

In [None]:
# Merge each pair of near-duplicate persons into one node.
# The names are passed as query parameters instead of being spliced into the
# Cypher text, so names containing quotes (e.g. "O'Brien") no longer break
# the statement.
# NOTE(review): assumes qg._graph.query accepts a `params` dict, as
# LangChain's Neo4jGraph.query does - confirm in query_graph.
cq = """
MATCH (a1:Person {name: $p1}), (a2:Person {name: $p2})
WITH head(collect([a1,a2])) as nodes
CALL apoc.refactor.mergeNodes(nodes,{
properties:"discard",
mergeRels:true
})
YIELD node
RETURN node;
"""

for val in unique:
    print(val)
    p1 = val['Person1']
    p2 = val['Person2']
    res = qg._graph.query(cq, params={"p1": p1, "p2": p2})