In [None]:
import pandas as pd
import json
from generate_graph import get_propositions, generateEdges, createGraph, get_propositions_nosplit
from refine_graph import refine
from query_graph import QueryGraph
from tqdm import tqdm
tqdm.pandas()

# Parquet shard paths for SQuAD v2, relative to the Hugging Face dataset root.
splits = {
    'train': 'squad_v2/train-00000-of-00001.parquet',
    'validation': 'squad_v2/validation-00000-of-00001.parquet',
}
# Only the validation split is loaded here.
df = pd.read_parquet(
    "hf://datasets/rajpurkar/squad_v2/" + splits["validation"]
)

## Datasets

### SQuAD

In [45]:
# Parquet shard paths for SQuAD (v1), relative to the Hugging Face dataset root.
splits = {
    'train': 'plain_text/train-00000-of-00001.parquet',
    'validation': 'plain_text/validation-00000-of-00001.parquet',
}
# Note: this cell loads the *train* split.
df_squad = pd.read_parquet(
    "hf://datasets/rajpurkar/squad/" + splits["train"]
)

### SQuAD V2

In [None]:
# Parquet shard paths for SQuAD v2, relative to the Hugging Face dataset root.
splits = {
    'train': 'squad_v2/train-00000-of-00001.parquet',
    'validation': 'squad_v2/validation-00000-of-00001.parquet',
}
# Validation split; this is the frame the prediction cells iterate over.
df_squadv2 = pd.read_parquet(
    "hf://datasets/rajpurkar/squad_v2/" + splits["validation"]
)

## Experiments

To run the SQuAD v2.0 evaluation script, pass it the dev set and the predictions file:

```
python evaluate-v2.0.py <path_to_dev-v2.0> <path_to_predictions>
```

Example:

```
python evaluate-v2.0.py <dev-v2.0.json> <predictions.json>

python evalscripts/SQuADv2/evaluate-v2.0.py evalscripts/SQuADv2/dev-v2.0.json evalscripts/SQuADv2/predictions.json

```

### Initial Experiment

In [None]:
# Summarise the columns, dtypes and non-null counts of the SQuAD v2 validation frame.
df.info()

In [3]:
# Distinct values of the 'context' column, so each context appears once in the list.
context_list = df['context'].unique()

In [None]:
# Number of distinct contexts.
len(context_list)

In [None]:
# Peek at the first context (a one-element slice).
context_list[0:1]

In [None]:
# Accumulator handed to get_propositions on every call
# (presumably filled in place — confirm against generate_graph).
propositions = []

# Trial run on the first context only.
# NOTE(review): enumerate() makes `context` an (index, context) tuple, so
# get_propositions receives a tuple rather than the bare context value —
# confirm that this is what get_propositions expects.
for context in enumerate(tqdm(context_list[0:1])):
    get_propositions(context, propositions)

In [None]:
# Accumulator handed to get_propositions on every call
# (presumably filled in place — confirm against generate_graph).
propositions = []

# Full run over every distinct context, with a tqdm progress bar.
# NOTE(review): enumerate() makes `context` an (index, context) tuple, so
# get_propositions receives a tuple rather than the bare context value —
# confirm that this is what get_propositions expects.
for context in enumerate(tqdm(context_list)):
    get_propositions(context, propositions)

In [None]:
# Load the saved propositions: one per line, surrounding whitespace removed.
with open('propositionscount.txt', 'r') as file:
    propositions_from_file = list(map(str.strip, file))

print(propositions_from_file)

In [None]:
# Number of propositions read from propositionscount.txt.
len(propositions_from_file)

Timing note: processing a batch of 2,500 propositions takes about 97 minutes.

In [None]:
# Batch: propositions 0-2500 (slice end is exclusive).
# Status: done.
list_of_edges = generateEdges(propositions_from_file[0:2500])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Batch: propositions 2500-5000 (slice end is exclusive).
# Status: done.
list_of_edges = generateEdges(propositions_from_file[2500:5000])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Batch: propositions 5000-7500 (slice end is exclusive).
# Status: done.
list_of_edges = generateEdges(propositions_from_file[5000:7500])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Batch: propositions 7500-10000 (slice end is exclusive).
# Status: done.
list_of_edges = generateEdges(propositions_from_file[7500:10000])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Batch: propositions 10000-12500 (slice end is exclusive).
# Status: to follow.
list_of_edges = generateEdges(propositions_from_file[10000:12500])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Batch: propositions 12500-15000 (slice end is exclusive).
# Status: to follow.
list_of_edges = generateEdges(propositions_from_file[12500:15000])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Batch: propositions 15000-17500 (slice end is exclusive).
# Status: to follow.
list_of_edges = generateEdges(propositions_from_file[15000:17500])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Batch: propositions 17500-20000 (slice end is exclusive).
# Status: to follow.
list_of_edges = generateEdges(propositions_from_file[17500:20000])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Final batch: propositions 20000-22420 (slice end is exclusive).
# Status: to follow.
list_of_edges = generateEdges(propositions_from_file[20000:22420])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [15]:
qg = QueryGraph()
my_dict = {}


def print_qa(row):
    """Answer one dataset row through the graph and record the prediction.

    Reads ``row['question']``, ``row['id']`` and ``row['answers']['text']``,
    queries the module-level ``qg`` and stores the prediction in the
    module-level ``my_dict`` (question id -> answer string, where ``""``
    means "no answer"). When ``qg`` returns a result, a human-readable
    record is also appended to ``preds2.txt``.

    Every row gets exactly one ``my_dict`` entry. The model answer is kept
    only when the graph query returned a non-empty context; a missing
    result, missing intermediate steps or an empty context all yield ``""``.

    Returns None; the function is meant to be used with
    ``DataFrame.progress_apply(print_qa, axis=1)``.
    """
    question = row['question']
    questionid = row['id']
    req = qg.get_requirements(question)
    result = qg.answer_question(question, req.content)

    if result is None:
        my_dict[questionid] = ""
        return

    model_answer = result['result']
    if "don't know the answer" in model_answer:
        model_answer = ""

    # First gold answer if there is one; a literal "" marks "no gold answer".
    gold_answers = row['answers']['text']
    real_answer = gold_answers[0] if len(gold_answers) > 0 else "\"\""

    # Step 0 carries the generated query and step 1 the retrieved context.
    # Either may be absent, so fall back to empty values instead of raising
    # on an unbound name or an out-of-range index.
    steps = result['intermediate_steps']
    cypher_query = steps[0]['query'] if len(steps) > 0 else ""
    context = steps[1]['context'] if len(steps) > 1 else []

    if len(context) < 1:
        # Nothing was retrieved from the graph: treat as unanswerable.
        # (A refine_query retry was sketched here originally but is disabled.)
        model_answer = "\"\""
        my_dict[questionid] = ""
    else:
        my_dict[questionid] = model_answer

    # The with-block closes the file; no explicit close() is needed.
    with open("preds2.txt", "a") as preds:
        preds.write("question: " + question + "\n")
        preds.write("real_answer: " + real_answer + "\n")
        preds.write("model_answer: " + model_answer + "\n")
        preds.write("cypher_query: " + cypher_query + "\n")
        preds.write("======================" + "\n")
        

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}
# Apply the function to each row
df_squadv2[0:2500].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object on a re-run would
# leave a file that json.load can no longer parse.
with open("predictions1.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}

# Apply the function to each row
df_squadv2[2500:5000].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object on a re-run would
# leave a file that json.load can no longer parse.
with open("predictions2.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}

# Apply the function to each row
df_squadv2[5000:7500].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object on a re-run would
# leave a file that json.load can no longer parse.
with open("predictions3.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}

# Apply the function to each row
df_squadv2[7500:10000].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object on a re-run would
# leave a file that json.load can no longer parse.
with open("predictions4.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}

# Apply the function to each row
df_squadv2[10000:11873].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object on a re-run would
# leave a file that json.load can no longer parse.
with open("predictions5.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# One dict of {question id: answer} per prediction chunk.
data1 = _load_json('predictions1.json')
data2 = _load_json('predictions2.json')
data3 = _load_json('predictions3.json')
data4 = _load_json('predictions4.json')
data5 = _load_json('predictions5.json')

# Merge the five dictionaries; on a duplicate key the later chunk wins.
merged_data = {}
for chunk in (data1, data2, data3, data4, data5):
    merged_data.update(chunk)

# Save the merged predictions to a single file.
with open('merged.json', 'w') as f:
    json.dump(merged_data, f, indent=4)

### Second Experiment

In [None]:
# Second experiment: build edges from the first 500 propositions only (0-500).
# NOTE(review): the previous comment here read "17500-20000 / TO FOLLOW",
# which does not match the slice below and looks like a leftover from a copied cell.
list_of_edges = generateEdges(propositions_from_file[0:500])

In [None]:
# Load the current batch of edges into the graph and report the outcome.
print("Success" if createGraph(list_of_edges) else "Failed")

In [None]:
# Fresh prediction store for this run (print_qa writes into the global my_dict).
my_dict = {}

# Apply the function to each row
df_squadv2[11:20].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object on a re-run would
# leave a file that is no longer valid JSON.
with open("squadv2preds2.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

### Third Experiment

In [8]:
qg = QueryGraph()
my_dict = {}


def print_qa(row):
    """Answer one dataset row through the graph and record the prediction.

    Reads ``row['question']`` and ``row['id']``, queries the module-level
    ``qg`` and stores the prediction in the module-level ``my_dict``
    (question id -> answer string, where ``""`` means "no answer").
    Unlike the earlier definition of this function in the notebook, this
    variant does not write a per-question text log.

    Every row gets exactly one ``my_dict`` entry. The model answer is kept
    only when the graph query returned a non-empty context; a missing
    result, missing intermediate steps or an empty context all yield ``""``.

    Returns None; the function is meant to be used with
    ``DataFrame.progress_apply(print_qa, axis=1)``.
    """
    question = row['question']
    questionid = row['id']
    req = qg.get_requirements(question)
    result = qg.answer_question(question, req.content)

    if result is None:
        my_dict[questionid] = ""
        return

    model_answer = result['result']
    if "don't know the answer" in model_answer:
        model_answer = ""

    # Step 1 of the intermediate steps carries the retrieved context. It may
    # be absent, so fall back to an empty context instead of raising on an
    # out-of-range index.
    steps = result['intermediate_steps']
    context = steps[1]['context'] if len(steps) > 1 else []

    if len(context) < 1:
        # Nothing was retrieved from the graph: treat as unanswerable.
        # (A refine_query retry was sketched here originally but is disabled.)
        my_dict[questionid] = ""
    else:
        my_dict[questionid] = model_answer
        

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}
# Apply the function to each row
df[0:2500].progress_apply(print_qa, axis=1)

# Write with "w", not "a": the file name is shared with the initial
# experiment, and appending a second JSON object would leave invalid JSON.
# Note that this overwrites any earlier predictions1.json.
with open("predictions1.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}
# Apply the function to each row
# NOTE(review): this is the same 0:2500 slice as the preceding cell —
# confirm whether a repeat run of the first chunk is intended here.
df[0:2500].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object to an existing
# file would leave invalid JSON. This overwrites any earlier predictions2.json.
with open("predictions2.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}
# Apply the function to each row
df[2500:5000].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object to an existing
# file would leave invalid JSON. This overwrites any earlier predictions3.json.
with open("predictions3.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}
# Apply the function to each row
df[5000:7500].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object to an existing
# file would leave invalid JSON. This overwrites any earlier predictions4.json.
with open("predictions4.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}
# Apply the function to each row
df[7500:10000].progress_apply(print_qa, axis=1)

# Write with "w", not "a": appending a second JSON object to an existing
# file would leave invalid JSON. This overwrites any earlier predictions5.json.
with open("predictions5.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# Fresh prediction store for this chunk (print_qa writes into the global my_dict).
my_dict = {}
# Apply the function to each row
# Final chunk. The slice previously started at 7500, which re-processed the
# rows already covered by the 7500:10000 cell; it now starts at 10000, the
# same final-chunk boundary used in the initial experiment.
df[10000:11873].progress_apply(print_qa, axis=1)

# This cell previously appended to predictions1.json, which the first chunk
# of this experiment also writes; two JSON objects in one file cannot be
# parsed. The final chunk therefore gets its own file.
# NOTE(review): confirm the file name and include it when merging results.
with open("predictions6.json", "w") as json_file:
    json.dump(my_dict, json_file, indent=4)

In [None]:
# x = df.iloc[11]
# question = x['question']
# print("question:", question)
# answer = x['answers']['text']
# print("answer:", answer)

# qg = QueryGraph()
# req = qg.get_requirements(question)
# res = qg.answer_question(question, req.content)
# res