In [1]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting numpy>=1.17 (from datasets)
  Using cached numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading mult

In [22]:
import pandas as pd
from collections import Counter

train = pd.read_csv("../data/clarity_train.csv")
test = pd.read_csv("../data/clarity_test.csv")


In [23]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Train columns:", list(train.columns))
print("Test columns:", list(test.columns))

Train shape: (3448, 20)
Test shape: (308, 20)
Train columns: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']
Test columns: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']


In [25]:
train.iloc[0]
train.head(3).to_dict(orient="list")

{'title': ["The President's News Conference in Hanoi, Vietnam",
  "The President's News Conference in Hanoi, Vietnam",
  "The President's News Conference in Hanoi, Vietnam"],
 'date': ['September 10, 2023', 'September 10, 2023', 'September 10, 2023'],
 'president': ['Joseph R. Biden', 'Joseph R. Biden', 'Joseph R. Biden'],
 'url': ['https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0',
  'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0',
  'https://www.presidency.ucsb.edu/documents/the-presidents-news-conference-hanoi-vietnam-0'],
 'question_order': [1, 1, 2],
 'interview_question': ['Q. Of the Biden administration. And accused the United States of containing China while pushing for diplomatic talks.How would you respond to that? And do you think President Xi is being sincere about getting the relationship back on track as he bans Apple in China?',
  'Q. Of the Biden administration. And accused the United Sta

In [26]:
def count_missing(df, col):
    s = df[col]
    return s.isna().sum() + ((s.notna()) & (s.astype(str).str.strip() == "")).sum()

for col in train.columns:
    print(col, "missing in train:", count_missing(train, col))

print()

for col in test.columns:
    print(col, "missing in test:", count_missing(test, col))

title missing in train: 0
date missing in train: 0
president missing in train: 0
url missing in train: 0
question_order missing in train: 0
interview_question missing in train: 0
interview_answer missing in train: 0
gpt3.5_summary missing in train: 0
gpt3.5_prediction missing in train: 0
question missing in train: 0
annotator_id missing in train: 0
annotator1 missing in train: 3448
annotator2 missing in train: 3448
annotator3 missing in train: 3448
inaudible missing in train: 0
multiple_questions missing in train: 0
affirmative_questions missing in train: 0
index missing in train: 0
clarity_label missing in train: 0
evasion_label missing in train: 0

title missing in test: 308
date missing in test: 308
president missing in test: 308
url missing in test: 0
question_order missing in test: 0
interview_question missing in test: 0
interview_answer missing in test: 0
gpt3.5_summary missing in test: 308
gpt3.5_prediction missing in test: 308
question missing in test: 0
annotator_id missing in

In [27]:
print("Unique labels:", set(train["clarity_label"]))

Unique labels: {'Clear Non-Reply', 'Clear Reply', 'Ambivalent'}


In [28]:
print("Unique labels:", set(train["evasion_label"]))

Unique labels: {'Explicit', 'Clarification', 'Claims ignorance', 'Implicit', 'Partial/half-answer', 'General', 'Dodging', 'Deflection', 'Declining to answer'}


In [29]:
print("Train label counts:", Counter(train["clarity_label"]))
print("Test label counts:", Counter(test["clarity_label"]))

Train label counts: Counter({'Ambivalent': 2040, 'Clear Reply': 1052, 'Clear Non-Reply': 356})
Test label counts: Counter({'Ambivalent': 206, 'Clear Reply': 79, 'Clear Non-Reply': 23})


In [30]:
print("Train label counts:", Counter(train["evasion_label"]))
print("Test label counts:", Counter(test["evasion_label"]))

Train label counts: Counter({'Explicit': 1052, 'Dodging': 706, 'Implicit': 488, 'General': 386, 'Deflection': 381, 'Declining to answer': 145, 'Claims ignorance': 119, 'Clarification': 92, 'Partial/half-answer': 79})
Test label counts: Counter({nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan

In [None]:
seen = set()
dups = 0
for q, a in zip(train["question"], train["interview_answer"]):
    key = (q, a)
    if key in seen:
        dups += 1
    else:
        seen.add(key)

print("Train duplicate QA pairs:", dups)

Train duplicate QA pairs: 58
