In [31]:
# Load the raw plausibility annotations.
# The file is available from https://zenodo.org/record/5752542#.Yau-Li8RppQ
import pandas as pd

raw_annotations_path = '../annotations/answer_plausibility_annotations_original.csv'
df = pd.read_csv(raw_annotations_path)
print(len(df), 'rows')

5507 rows


In [32]:
# Drop the judgements that the quality control evaluator rejected as invalid,
# i.e. every row whose Approve value is "random answer".
keep_rows = df['Approve'] != "random answer"
df = df[keep_rows]
print(len(df), 'rows')

5387 rows


In [33]:
# For every answer that has at least one row marked "correct answer" by the
# quality control evaluator, keep only those marked rows and drop its other rows.
# NOTE: despite its name, remove_answers holds the approved answer texts.
remove_answers = set(df.loc[df['Approve'] == "correct answer", 'Input.predicted'])
has_approved_row = df['Input.predicted'].isin(remove_answers)
is_approved_row = df['Approve'] == "correct answer"
# De Morgan form of: not (answer has an approved row and this row is not approved)
df = df[~has_approved_row | is_approved_row]
print(len(df), 'rows')

5317 rows


In [34]:
# Drop every answer that has at least one row marked "confusing answer",
# i.e. the quality control evaluator judged the sample hard to decide upon.
is_confusing = df['Approve'] == "confusing answer"
remove_answers = df.loc[is_confusing, 'Input.predicted']
is_confusing_answer = df['Input.predicted'].isin(remove_answers)
df = df[~is_confusing_answer]
print(len(df), 'rows')

5015 rows


In [35]:
# Collapse the three radio-button columns into a single Annotation column.
# The labels are assigned in this order, so a later label overrides an earlier
# one on any row where more than one flag is set.
radio_labels = [
    ('Answer.answer.on', 'Plausible'),
    ('Answer.noanswer.on', 'Implausible'),
    ('Answer.malformed.on', 'Malformed'),
]
for flag_column, label in radio_labels:
    df.loc[df[flag_column] == True, 'Annotation'] = label
# the radio-button columns and Reject are dropped once Annotation is filled in
df = df.drop(columns=[flag_column for flag_column, _ in radio_labels] + ['Reject'])
df

Unnamed: 0,WorkTimeInSeconds,Input.id,Input.question,Input.predicted,Input.model,Answer.comments,Approve,Annotation
0,18,1_1,What is a physician's assistant?,A physician assistant is a person who has succ...,ultron-scai-qrecc21-test-dataset-rewritten-202...,NICE,,Plausible
1,20,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,,,Plausible
2,83,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
3,31,1_1,What is a physician's assistant?,A physician's assistant is a person who assist...,gpt3-scai-qrecc21-test-dataset-2021-07-20-scai...,It is very interesting to the survey.,,Plausible
4,56,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,It's really interesting,,Plausible
...,...,...,...,...,...,...,...,...
5502,19,99_5,Was Glamazon a nickname for Beth Phoenix?,glamazon is so-named because of beth phoenix's...,ultron-scai-qrecc21-test-dataset-rewritten-202...,,,Plausible
5503,25,99_5,Was Glamazon a nickname for Beth Phoenix?,Beth Phoenix's nickname was Glamazon.,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
5504,19,99_5,Was Glamazon a nickname for Beth Phoenix?,Beth Phoenix's nickname was Glamazon.,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
5505,7,99_5,Was Glamazon a nickname for Beth Phoenix?,"Glamazon was a nickname for Beth Phoenix, whic...",torch-scai-qrecc21-test-dataset-2021-07-20-tor...,,,Plausible


In [46]:
# Propagate the quality evaluator's "malformed answer" verdicts by Input.id.
# NOTE(review): the match is on Input.id only, so every row sharing a flagged id
# is relabelled whatever its Input.predicted value -- confirm this is intended.
flagged_by_evaluator = df['Approve'] == "malformed answer"
malformed_answers_ids = set(df.loc[flagged_by_evaluator, 'Input.id'])
print(len(malformed_answers_ids))
shares_flagged_id = df['Input.id'].isin(malformed_answers_ids)
df.loc[shares_flagged_id, 'Annotation'] = 'Malformed'
len(df)

14


2078

In [47]:
# Unique malformed (Input.id, Input.predicted) pairs; for a repeated pair the
# last row is the one kept.
df_malformed = (
    df[df['Annotation'] == 'Malformed']
    .drop_duplicates(subset=['Input.id', 'Input.predicted'], keep='last')
)
print(len(df_malformed))

# number of malformed answers per model, largest first
df_malformed.groupby(['Input.model']).size().sort_values(ascending=False)

108


Input.model
torch-scai-qrecc21-test-dataset-2021-07-20-torch-usi_T5_raw2-2021-09-20-05-42-29                                                                  16
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-BART-large-top1BM25-2021-09-13-17-47-43                                              11
rali-scai-qrecc21-test-dataset-rewritten-2021-07-20-rali-qa-no-run-identifier-2021-09-09-13-01-07                                                 11
gpt3-scai-qrecc21-test-dataset-2021-07-20-scai-qrecc21-gpt3-baseline-no-run-identifier-2021-07-21-08-56-42                                         8
ultron-scai-qrecc21-test-dataset-2021-07-20-ultron-no-run-identifier-2021-09-04-17-16-58                                                           7
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-15-09-07-49                                                         6
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-no-run-identifier-2021-09-08-15-0

In [45]:
from collections import defaultdict, Counter

# Count, per model, the distinct malformed answers of each question.
# NOTE: correct_answers is built from df_malformed here, so it holds malformed answers.
correct_answers = defaultdict(list)
models = Counter()
for _, record in df_malformed.iterrows():
    candidate = record['Input.predicted']
    seen_answers = correct_answers[record['Input.question']]
    if candidate in seen_answers:
        continue
    # first occurrence of this answer for the question: credit its model
    seen_answers.append(candidate)
    models[record['Input.model']] += 1

for model_name, count in models.most_common(200):
    print(model_name, count)

torch-scai-qrecc21-test-dataset-2021-07-20-torch-usi_T5_raw2-2021-09-20-05-42-29 16
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-BART-large-top1BM25-2021-09-13-17-47-43 11
rali-scai-qrecc21-test-dataset-rewritten-2021-07-20-rali-qa-no-run-identifier-2021-09-09-13-01-07 11
gpt3-scai-qrecc21-test-dataset-2021-07-20-scai-qrecc21-gpt3-baseline-no-run-identifier-2021-07-21-08-56-42 8
ultron-scai-qrecc21-test-dataset-2021-07-20-ultron-no-run-identifier-2021-09-04-17-16-58 7
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-rag-bm25_100-ground_truth_rewrites-submission_correct_turn_no-2021-09-19-18-40-28 6
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-15-09-07-49 6
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-no-run-identifier-2021-09-08-15-04-28 6
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-08-07-07-57 5
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-

In [7]:
# Collapse repeated judgments that agree: rows with the same id, answer and
# annotation are reduced to one (the last occurrence is kept).
agreement_key = ['Input.id', 'Input.predicted', 'Annotation']
df = df.drop_duplicates(subset=agreement_key, keep='last')
len(df)

2618

In [8]:
# display the current table (the recorded output shows only the first and last rows)
df

Unnamed: 0,WorkTimeInSeconds,Input.id,Input.question,Input.predicted,Input.model,Answer.comments,Approve,Annotation
0,18,1_1,What is a physician's assistant?,A physician assistant is a person who has succ...,ultron-scai-qrecc21-test-dataset-rewritten-202...,NICE,,Plausible
1,20,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,,,Plausible
2,83,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
3,31,1_1,What is a physician's assistant?,A physician's assistant is a person who assist...,gpt3-scai-qrecc21-test-dataset-2021-07-20-scai...,It is very interesting to the survey.,,Plausible
4,56,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,It's really interesting,,Plausible
...,...,...,...,...,...,...,...,...
5500,5,99_5,Was Glamazon a nickname for Beth Phoenix?,Glamazon is a nickname for WWE Hall of Famer B...,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
5501,16,99_5,Was Glamazon a nickname for Beth Phoenix?,glamazon is so-named because of beth phoenix's...,ultron-scai-qrecc21-test-dataset-rewritten-202...,,,Malformed
5502,19,99_5,Was Glamazon a nickname for Beth Phoenix?,glamazon is so-named because of beth phoenix's...,ultron-scai-qrecc21-test-dataset-rewritten-202...,,,Plausible
5504,19,99_5,Was Glamazon a nickname for Beth Phoenix?,Beth Phoenix's nickname was Glamazon.,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible


In [9]:
# Save the table that still contains the disagreements which the quality
# evaluator did not resolve manually.
with_disagreements_path = 'answer_plausibility_annotations_clean_with_disagreements.csv'
df.to_csv(with_disagreements_path)

In [14]:
# Resolve the remaining disagreements automatically: every (Input.id, Input.predicted)
# pair that still occurs more than once is dropped entirely (keep=False), and the
# result is saved as a separate table.
answer_key = ['Input.id', 'Input.predicted']
df = df.drop_duplicates(subset=answer_key, keep=False)
df.to_csv('answer_plausibility_annotations_clean_without_disagreements.csv')
len(df)

2078

In [51]:
# Load the generated answers with their cleaned plausibility annotations.
# The file is available from https://zenodo.org/record/5752542#.Yau-Li8RppQ
import pandas as pd

clean_annotations_path = '../annotations/answer_plausibility_annotations_clean_without_disagreements.csv'
df = pd.read_csv(clean_annotations_path)
print(len(df), 'rows')

# keep only the answers annotated as plausible
df_plausible = df.loc[df['Annotation'] == 'Plausible']
print(len(df_plausible), 'rows')

2078 rows
1863 rows


In [52]:
# Per-model breakdown of the plausible answers.
# The filter selects the 'Plausible' rows, so the subset is bound to
# df_plausible (not df_implausible, which would mislabel the data).
df_plausible = df[df['Annotation']=='Plausible']
print(len(df_plausible), 'rows')
# number of plausible answers per model, largest first
df_plausible.groupby(['Input.model']).size().sort_values(ascending=False)

1863 rows


Input.model
rachael-scai-qrecc21-test-dataset-rewritten-2021-07-20-rachael-no-run-identifier-2021-09-04-10-39-42                                              183
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-rag-bm25_100-ground_truth_rewrites-submission_correct_turn_no-2021-09-19-18-40-28    173
rachael-scai-qrecc21-test-dataset-rewritten-2021-07-20-rachael-no-run-identifier-2021-09-06-09-21-43                                              158
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-no-run-identifier-2021-09-08-15-04-28                                                149
gpt3-scai-qrecc21-test-dataset-2021-07-20-scai-qrecc21-gpt3-baseline-no-run-identifier-2021-07-21-08-56-42                                        149
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-08-21-49-44                                                        133
rachael-scai-qrecc21-test-dataset-rewritten-2021-07-20-rachael-no-run-identifier-2021-09

In [54]:
from collections import defaultdict, Counter

# Group the plausible answers by question and count them per model.
# Repeated answers are not de-duplicated in this cell.
correct_answers = defaultdict(list)
models = Counter()
for _, record in df_plausible.iterrows():
    correct_answers[record['Input.question']].append(record['Input.predicted'])
    models[record['Input.model']] += 1

for model_name, count in models.most_common(200):
    print(model_name, count)

rachael-scai-qrecc21-test-dataset-rewritten-2021-07-20-rachael-no-run-identifier-2021-09-04-10-39-42 183
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-rag-bm25_100-ground_truth_rewrites-submission_correct_turn_no-2021-09-19-18-40-28 173
rachael-scai-qrecc21-test-dataset-rewritten-2021-07-20-rachael-no-run-identifier-2021-09-06-09-21-43 158
gpt3-scai-qrecc21-test-dataset-2021-07-20-scai-qrecc21-gpt3-baseline-no-run-identifier-2021-07-21-08-56-42 149
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-no-run-identifier-2021-09-08-15-04-28 149
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-08-21-49-44 133
rachael-scai-qrecc21-test-dataset-rewritten-2021-07-20-rachael-no-run-identifier-2021-09-15-19-36-31 132
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-08-07-07-57 120
ultron-scai-qrecc21-test-dataset-2021-07-20-ultron-no-run-identifier-2021-09-04-17-16-58 117
rachael-scai-qrecc21-test-dataset-2021-07

In [48]:
# subset of all implausible answers
is_implausible = df['Annotation'] == 'Implausible'
df_implausible = df[is_implausible]
print(len(df_implausible), 'rows')
# number of implausible answers per model, largest first
df_implausible.groupby(['Input.model']).size().sort_values(ascending=False)

107 rows


Input.model
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-no-run-identifier-2021-09-08-15-04-28                                                16
ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-rag-bm25_100-ground_truth_rewrites-submission_correct_turn_no-2021-09-19-18-40-28    15
ultron-scai-qrecc21-test-dataset-2021-07-20-ultron-no-run-identifier-2021-09-04-17-16-58                                                          13
torch-scai-qrecc21-test-dataset-2021-07-20-torch-usi_T5_raw2-2021-09-20-05-42-29                                                                   7
rali-scai-qrecc21-test-dataset-rewritten-2021-07-20-rali-qa-no-run-identifier-2021-09-09-13-01-07                                                  6
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-08-15-40-34                                                         6
rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-08-21-49-44    

In [7]:
# number of keys (questions) currently held in correct_answers
print(len(correct_answers))

494


In [10]:
import numpy as np

# Size of each question's answer list: min / mean / max, plus the number of
# questions that have more than one answer.
cluster_lens = [len(cluster) for cluster in correct_answers.values()]
n = sum(1 for size in cluster_lens if size > 1)
print(min(cluster_lens), np.mean(cluster_lens), max(cluster_lens))
print(n, 'questions with more than one plausible answers')

1 3.7712550607287447 13
465 questions with more than one plausible answers


In [15]:
# samples
from collections import defaultdict

# Distinct plausible answers per question, in first-seen order.
correct_answers = defaultdict(list)
for _, record in df_plausible.iterrows():
    candidate = record['Input.predicted']
    seen_answers = correct_answers[record['Input.question']]
    if candidate not in seen_answers:
        seen_answers.append(candidate)
print(len(correct_answers))

# Show the first question with all of its answers; prints nothing when
# correct_answers is empty.
first_sample = next(iter(correct_answers.items()), None)
if first_sample is not None:
    question, answers = first_sample
    print(question)
    for answer in answers:
        print(answer)

492
What is a physician's assistant?
A physician assistant is a person who has successfully completed an accredited education program for physician assistant, is licensed by the state and is practicing within the scope of that license.
A physician's assistant (PA) is a medical assistant who works under the supervision of a physician and is licensed to practice medicine in the state in which the patient resides.
A physician's assistant (PA) is a medical assistant.
A physician's assistant is a person who assists a physician in the performance of his or her duties.
A physician's assistant (PA) is a medical assistant who works for a supervising physician who is responsible for patients, cared for by the physician assistant.
A physician assistant is a medical professional who assists a doctor in the diagnosis and treatment of a patient.
A physician's assistant is a medical assistant.
A physician's assistant is a medical professional who provides medical care to patients.
A physician's assis