In [1]:
# load plausibility annotations
# download file from https://zenodo.org/record/5752542#.Yau-Li8RppQ
import pandas as pd

df = pd.read_csv('answer_plausibility_annotations_original.csv')
# print(df.head())
print(len(df), 'rows')

5507 rows


In [2]:
# remove all rows where Approve=="random answer" ie classified as invalid judgements
# which means the worker's answer was rejected by the quality control evaluator
df = df[df.Approve != "random answer"]
print(len(df), 'rows')

5387 rows


In [3]:
# remove rows with the answers that were not judged as "correct answer" by the quality control evaluator
remove_answers = set(df[df.Approve == "correct answer"]["Input.predicted"])
# filter out by those ids and not marked as correct
df = df[~((df['Input.predicted'].isin(remove_answers)) & (df.Approve!="correct answer"))]
print(len(df), 'rows')

5317 rows


In [4]:
# remove all samples where at least one of the rows was marked as "confusing answer"
# which means that the sample is hard to decide upon as judged by the quality control evaluator
remove_answers = df[df.Approve == "confusing answer"]["Input.predicted"]
df = df[~df['Input.predicted'].isin(remove_answers)]
print(len(df), 'rows')

5015 rows


In [5]:
# replace the radio button results with a single column
df.loc[df['Answer.answer.on'] == True, 'Annotation'] = 'Plausible'
df.loc[df['Answer.noanswer.on'] == True, 'Annotation'] = 'Implausible'
df.loc[df['Answer.malformed.on'] == True, 'Annotation'] = 'Malformed'
# drop old columns
df = df.drop(['Answer.answer.on', 'Answer.noanswer.on', 'Answer.malformed.on', 'Reject'], axis = 1)
df

Unnamed: 0,WorkTimeInSeconds,Input.id,Input.question,Input.predicted,Input.model,Answer.comments,Approve,Annotation
0,18,1_1,What is a physician's assistant?,A physician assistant is a person who has succ...,ultron-scai-qrecc21-test-dataset-rewritten-202...,NICE,,Plausible
1,20,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,,,Plausible
2,83,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
3,31,1_1,What is a physician's assistant?,A physician's assistant is a person who assist...,gpt3-scai-qrecc21-test-dataset-2021-07-20-scai...,It is very interesting to the survey.,,Plausible
4,56,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,It's really interesting,,Plausible
...,...,...,...,...,...,...,...,...
5502,19,99_5,Was Glamazon a nickname for Beth Phoenix?,glamazon is so-named because of beth phoenix's...,ultron-scai-qrecc21-test-dataset-rewritten-202...,,,Plausible
5503,25,99_5,Was Glamazon a nickname for Beth Phoenix?,Beth Phoenix's nickname was Glamazon.,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
5504,19,99_5,Was Glamazon a nickname for Beth Phoenix?,Beth Phoenix's nickname was Glamazon.,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
5505,7,99_5,Was Glamazon a nickname for Beth Phoenix?,"Glamazon was a nickname for Beth Phoenix, whic...",torch-scai-qrecc21-test-dataset-2021-07-20-tor...,,,Plausible


In [6]:
# mark all malformed answer judged by the quality evaluator by ids
malformed_answers_ids = set(df[df.Approve == "malformed answer"]["Input.id"])
print(len(malformed_answers_ids))
df.loc[df['Input.id'].isin(malformed_answers_ids), 'Annotation'] = 'Malformed'
len(df)

16


5015

In [7]:
# remove all duplicate rows where the judgments agreed for the same id, answer and annotation
df = df.drop_duplicates(subset=['Input.id', 'Input.predicted', 'Annotation'], keep='last')
len(df)

2618

In [8]:
df

Unnamed: 0,WorkTimeInSeconds,Input.id,Input.question,Input.predicted,Input.model,Answer.comments,Approve,Annotation
0,18,1_1,What is a physician's assistant?,A physician assistant is a person who has succ...,ultron-scai-qrecc21-test-dataset-rewritten-202...,NICE,,Plausible
1,20,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,,,Plausible
2,83,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
3,31,1_1,What is a physician's assistant?,A physician's assistant is a person who assist...,gpt3-scai-qrecc21-test-dataset-2021-07-20-scai...,It is very interesting to the survey.,,Plausible
4,56,1_1,What is a physician's assistant?,A physician's assistant (PA) is a medical assi...,rachael-scai-qrecc21-test-dataset-rewritten-20...,It's really interesting,,Plausible
...,...,...,...,...,...,...,...,...
5500,5,99_5,Was Glamazon a nickname for Beth Phoenix?,Glamazon is a nickname for WWE Hall of Famer B...,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible
5501,16,99_5,Was Glamazon a nickname for Beth Phoenix?,glamazon is so-named because of beth phoenix's...,ultron-scai-qrecc21-test-dataset-rewritten-202...,,,Malformed
5502,19,99_5,Was Glamazon a nickname for Beth Phoenix?,glamazon is so-named because of beth phoenix's...,ultron-scai-qrecc21-test-dataset-rewritten-202...,,,Plausible
5504,19,99_5,Was Glamazon a nickname for Beth Phoenix?,Beth Phoenix's nickname was Glamazon.,rachael-scai-qrecc21-test-dataset-2021-07-20-r...,,,Plausible


In [9]:
# save resulting table with disagreements that were not resolved manually by the quality evaluator
df.to_csv('answer_plausibility_annotations_clean_with_disagreements.csv')

In [14]:
# remove all rows with disagreements automatically and save as a new table
# ie all duplicate rows on the first two columns that were not removed at the previous step
df = df.drop_duplicates(subset=['Input.id', 'Input.predicted'], keep=False)
df.to_csv('answer_plausibility_annotations_clean_without_disagreements.csv')
len(df)

2078