In [1]:
# load the original qrecc test set with human answers
import json

path = '../dataset/qrecc-test.json'

# Read explicitly as UTF-8: open() otherwise falls back to the platform's
# locale encoding, which can mis-decode non-ASCII text in the dataset.
with open(path, encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# load evidence spans and passage ids for human answers
import pandas as pd
from collections import defaultdict

df = pd.read_csv('../annotations/human_evidence.csv', sep=';')
print(len(df), 'grounding sentences')

# Nested mapping: value of the 'id' column -> value of the 'pid' column ->
# list of distinct 'predicted' values, kept in first-seen order.
evidence = defaultdict(dict)
for row_idx, row in df.iterrows():
    # setdefault creates the per-passage list on first sight of this pid
    spans = evidence[row['id']].setdefault(row['pid'], [])
    if row['predicted'] not in spans:
        spans.append(row['predicted'])
print(len(evidence), 'questions grounded')

13352 grounding sentences
6991 questions grounded


In [3]:
# load (un)faithful generated answers and plausible extracted spans
def _collect_answers(csv_path, answer_column, answers_by_question):
    """Add every distinct answer found in a CSV file to its question's list.

    Prints the number of rows read, then walks the rows in file order and
    appends the value of ``answer_column`` to the list stored under the row's
    'Input.id', skipping values already present in that list.

    Args:
        csv_path: path of the CSV file to read (default comma separator).
        answer_column: name of the column holding the answer text.
        answers_by_question: defaultdict(list) keyed by 'Input.id'; it is
            updated in place.
    """
    frame = pd.read_csv(csv_path)
    print(len(frame), 'answers')
    for row_idx, row in frame.iterrows():
        answer = row[answer_column]
        question_answers = answers_by_question[row['Input.id']]
        if answer not in question_answers:
            question_answers.append(answer)


# Faithful answers and plausible spans are pooled into the same mapping.
correct_answers = defaultdict(list)
_collect_answers('../annotations/faithful_answers.csv', 'Input.answer', correct_answers)
_collect_answers('../annotations/plausible_spans.csv', 'Input.predicted', correct_answers)

incorrect_answers = defaultdict(list)
_collect_answers('../annotations/unfaithful_answers.csv', 'Input.answer', incorrect_answers)

print(len(correct_answers), 'questions with correct answers')
print(len(incorrect_answers), 'questions with incorrect answers')

386 answers
192 answers
21 answers
156 questions with correct answers
20 questions with incorrect answers


In [4]:
# Add evidence spans and passage ids for the generated answers.
# `evidence` is deliberately NOT re-created here: the mapping built from
# human_evidence.csv is extended in place, so that cell must have run first.
df = pd.read_csv('../annotations/extracted_answers.csv')
print(len(df), 'grounding sentences')

for row_idx, row in df.iterrows():
    # one list of distinct spans per (question id, passage id) pair
    spans = evidence[row['id']].setdefault(row['pid'], [])
    if row['predicted'] not in spans:
        spans.append(row['predicted'])
print(len(evidence), 'questions grounded')

242 grounding sentences
7044 questions grounded


In [5]:
# Spot-check the merged evidence for a single question id; the result maps
# each passage id to its list of evidence spans.
# NOTE(review): `evidence` is a defaultdict, so indexing an id that is absent
# would insert an empty entry rather than raise a KeyError.
evidence["3_6"]

{'http://web.archive.org/web/20200523205052id_/https://www.bhf.org.uk/informationsupport/heart-matters-magazine/medical/ask-the-experts/atrial-flutter_p9': ['In atrial fibrillation, the atria beat irregularly. In atrial flutter, the atria beat regularly, but faster than usual and more often than the ventricles, so you may have four atrial beats to every',
  'Atrial flutter is less common than atrial fibrillation Atrial flutter is less common, but has similar symptoms (feeling faint, tiredness, palpitations, shortness of breath or dizziness).']}

In [6]:
import json

# Assemble the output records: one per test question that has at least one
# additional correct answer or at least one grounding passage.
scai_qrecc = []
for sample in data:
    # Same "<conversation>_<turn>" format as the annotation ids.
    question_id = "%s_%s" % (sample["Conversation_no"], sample["Turn_no"])

    # Use .get() rather than indexing: both mappings are defaultdicts, and
    # indexing would insert an empty entry for every question in the test
    # set, silently inflating len(evidence) / len(correct_answers).
    extra_answers = correct_answers.get(question_id, [])
    grounding = evidence.get(question_id, {})
    if not (extra_answers or grounding):
        continue

    scai_qrecc.append({
        'ID': question_id,
        'Question': sample['Rewrite'],
        # The human answer comes first, followed by the additional correct answers.
        'Answers': [sample['Answer']] + extra_answers,
        'Evidence': grounding,
    })

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
with open('../dataset/scai_qrecc.json', 'w', encoding='utf-8') as f:
    json.dump(scai_qrecc, f, ensure_ascii=False, indent=4)