In [1]:
pip install datasets



In [7]:
from datasets import load_dataset
from collections import Counter

# Load the "ailsntua/QEvasion" dataset and keep handles on its two splits.
dataset = load_dataset("ailsntua/QEvasion")
train, test = dataset["train"], dataset["test"]

In [8]:
# Show the available splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label'],
        num_rows: 3448
    })
    test: Dataset({
        features: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label'],
        num_rows: 308
    })
})


In [9]:
# Show the declared feature type of every column in the training split.
print(train.features)

{'title': Value('string'), 'date': Value('string'), 'president': Value('string'), 'url': Value('string'), 'question_order': Value('int64'), 'interview_question': Value('string'), 'interview_answer': Value('string'), 'gpt3.5_summary': Value('string'), 'gpt3.5_prediction': Value('string'), 'question': Value('string'), 'annotator_id': Value('string'), 'annotator1': Value('string'), 'annotator2': Value('string'), 'annotator3': Value('string'), 'inaudible': Value('bool'), 'multiple_questions': Value('bool'), 'affirmative_questions': Value('bool'), 'index': Value('int64'), 'clarity_label': Value('string'), 'evasion_label': Value('string')}


In [11]:
# Number of rows in the training split.
len(train)

3448

In [12]:
# Number of rows in the test split.
len(test)

308

In [13]:
# Preview the first three training rows as a mapping of column name -> list of values.
# Slicing the Dataset directly returns the same batch as `select(range(3))[:]`.
# Only a cell's final expression is displayed, so a bare `train[0]` placed before
# this line would be evaluated and silently discarded; it is therefore omitted.
train[:3]

{'title': ["The President's News Conference in Hanoi, Vietnam",
  "The President's News Conference in Hanoi, Vietnam",
  "The President's News Conference in Hanoi, Vietnam"],
 'date': ['September 10, 2023', 'September 10, 2023', 'September 10, 2023'],
 'president': ['Joseph R. Biden', 'Joseph R. Biden', 'Joseph R. Biden'],
 'url': ['https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0',
  'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0',
  'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0'],
 'question_order': [1, 1, 2],
 'interview_question': ['Q. Of the Biden administration. And accused the United States of containing China while pushing for diplomatic talks.How would you respond to that? And do you think President Xi is being sincere about getting the relationship back on track as he bans Apple in China?',
  'Q. Of the Biden administration. And accused the United Sta

In [16]:
def count_missing(ds, col):
  """Count the missing entries in one column.

  An entry is treated as missing when it is ``None`` or when it is a string
  that is empty after stripping surrounding whitespace. Non-string values
  other than ``None`` (numbers, booleans, ...) are never counted as missing.

  Args:
    ds: Any object where ``ds[col]`` yields an iterable of column values.
    col: Key of the column to inspect.

  Returns:
    The number of missing entries as an int (0 for an empty column).
  """
  missing = 0
  for value in ds[col]:
    if value is None:
      missing += 1
    elif isinstance(value, str) and not value.strip():
      missing += 1
  return missing

# Print the per-column missing-value count for each split: train first, then
# test, with the two reports separated by blank lines.
reports = (("missing in train:", train), ("missing in test:", test))
for position, (label, split) in enumerate(reports):
  if position > 0:
    print("\n")
  for col in split.column_names:
    print(col, label, count_missing(split, col))

title missing in train: 0
date missing in train: 0
president missing in train: 0
url missing in train: 0
question_order missing in train: 0
interview_question missing in train: 0
interview_answer missing in train: 0
gpt3.5_summary missing in train: 0
gpt3.5_prediction missing in train: 0
question missing in train: 0
annotator_id missing in train: 0
annotator1 missing in train: 3448
annotator2 missing in train: 3448
annotator3 missing in train: 3448
inaudible missing in train: 0
multiple_questions missing in train: 0
affirmative_questions missing in train: 0
index missing in train: 0
clarity_label missing in train: 0
evasion_label missing in train: 0


title missing in test: 308
date missing in test: 308
president missing in test: 308
url missing in test: 0
question_order missing in test: 0
interview_question missing in test: 0
interview_answer missing in test: 0
gpt3.5_summary missing in test: 308
gpt3.5_prediction missing in test: 308
question missing in test: 0
annotator_id missing i

In [19]:
# Distinct clarity labels in the training split. The values are sorted because a
# raw set of strings has no stable iteration order between kernel sessions, which
# would make the printed output differ from run to run.
print("Unique labels:", sorted(set(train["clarity_label"])))

Unique labels: {'Clear Non-Reply', 'Clear Reply', 'Ambivalent'}


In [20]:
# Distinct evasion labels in the training split. The values are sorted because a
# raw set of strings has no stable iteration order between kernel sessions, which
# would make the printed output differ from run to run.
print("Unique labels:", sorted(set(train["evasion_label"])))

Unique labels: {'Partial/half-answer', 'Declining to answer', 'Deflection', 'Dodging', 'Implicit', 'Explicit', 'Claims ignorance', 'General', 'Clarification'}


In [22]:
# Frequency of each clarity label, computed separately for the two splits.
train_clarity_counts = Counter(train["clarity_label"])
test_clarity_counts = Counter(test["clarity_label"])
print("Train label counts:", train_clarity_counts)
print("Test label counts:", test_clarity_counts)

Train label counts: Counter({'Ambivalent': 2040, 'Clear Reply': 1052, 'Clear Non-Reply': 356})
Test label counts: Counter({'Ambivalent': 206, 'Clear Reply': 79, 'Clear Non-Reply': 23})


In [23]:
# Frequency of each evasion label, computed separately for the two splits.
# NOTE: in the recorded run every test-split value was the empty string, so the
# test counter collapses to a single '' key (the test split carries no evasion labels).
train_evasion_counts = Counter(train["evasion_label"])
test_evasion_counts = Counter(test["evasion_label"])
print("Train label counts:", train_evasion_counts)
print("Test label counts:", test_evasion_counts)

Train label counts: Counter({'Explicit': 1052, 'Dodging': 706, 'Implicit': 488, 'General': 386, 'Deflection': 381, 'Declining to answer': 145, 'Claims ignorance': 119, 'Clarification': 92, 'Partial/half-answer': 79})
Test label counts: Counter({'': 308})


In [24]:
# Count repeated (question, interview_answer) pairs in the training split.
# Every occurrence after the first of a given pair counts as one duplicate,
# which equals the total number of pairs minus the number of distinct pairs.
qa_pairs = list(zip(train["question"], train["interview_answer"]))
seen = set(qa_pairs)
dups = len(qa_pairs) - len(seen)
print("Train duplicate QA pairs:", dups)

Train duplicate QA pairs: 58
