In [None]:
from datasets import load_dataset

# Load the QASPER dataset from the Hugging Face Hub, caching files under ./hf_cache.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script to
# execute locally -- only keep it for repositories you trust.
load_kwargs = {
    "cache_dir": "./hf_cache",
    "trust_remote_code": True,
}
qasper_dataset = load_dataset("allenai/qasper", **load_kwargs)

# Report how many examples the train split holds.
train_size = len(qasper_dataset['train'])
print(train_size)


Generating train split: 100%|██████████| 888/888 [00:00<00:00, 6417.46 examples/s]
Generating validation split: 100%|██████████| 281/281 [00:00<00:00, 5650.40 examples/s]
Generating test split: 100%|██████████| 416/416 [00:00<00:00, 6046.01 examples/s]

888





In [25]:
train_set = qasper_dataset['train']
print("数据集加载完成！")

# --- 2. Random sampling ---
num_samples = 200
# Shuffle with a fixed seed (42) so the same rows are drawn on every run,
# then keep the first `num_samples` rows of the shuffled split.
print(f"\n正在从训练集中随机抽取 {num_samples} 行数据...")
shuffled_train = train_set.shuffle(seed=42)
sampled_dataset = shuffled_train.select(range(num_samples))
print(f"成功抽取 {len(sampled_dataset)} 行数据。")

# --- 3. Count the questions ---
# Each example stores its questions as the list under 'qas' -> 'question';
# add up those list lengths across the sample.
total_questions = sum(
    len(example['qas']['question']) for example in sampled_dataset
)

print(f"\n这 {num_samples} 行数据中总共包含 {total_questions} 个问题。")

# --- 4. Convert to a DataFrame without the 'full_text' column ---
print("\n正在将数据转换为 Pandas DataFrame 并移除 'full_text' 字段...")
# Drop the column on the dataset side first, then materialise as pandas.
trimmed_dataset = sampled_dataset.remove_columns(['full_text'])
df = trimmed_dataset.to_pandas()

print(df.shape)


数据集加载完成！

正在从训练集中随机抽取 200 行数据...
成功抽取 200 行数据。

这 200 行数据中总共包含 590 个问题。

正在将数据转换为 Pandas DataFrame 并移除 'full_text' 字段...
(200, 5)


In [27]:
# Collect the ids of the sampled rows as a plain Python list.
id_list = df['id'].tolist()

# Persist them, one id per line, so the sample can be identified later.
save_id_path = "./qasper_id_list.txt"
with open(save_id_path, "w") as id_file:
    id_file.writelines(f"{_id}\n" for _id in id_list)
print(f"\n已将 ID 列表保存到 {save_id_path}，共 {len(id_list)} 个 ID。")


已将 ID 列表保存到 ./qasper_id_list.txt，共 200 个 ID。


In [43]:
import pandas as pd

# Assumes `df` (one row per paper, with a nested 'qas' column) was created by an
# earlier cell; run that cell first if it is not defined.

# Fields inside 'qas' that hold one value per question and are copied to a
# column of the same name.
qas_scalar_fields = [
    'question', 'question_id', 'nlp_background', 'topic_background',
    'paper_read', 'search_query', 'question_writer',
]

# Flatten to one record per question. Records are plain dicts and the DataFrame
# is built once at the end: this avoids enlarging a copied Series field by field
# for every question, and it gives the result a unique 0..n-1 index instead of
# repeating the parent paper's index label on each of its question rows.
expanded_rows = []
for paper in df.to_dict(orient='records'):
    # Remove the nested column from the paper-level fields; its content is
    # replaced by the flattened per-question columns below.
    qas = paper.pop('qas')
    for i in range(len(qas['question'])):
        record = dict(paper)  # paper-level columns are repeated on every question row
        for field in qas_scalar_fields:
            record[field] = qas[field][i]
        # Keep the whole annotation entry, plus its 'answer' item as its own column.
        record['answers_ori'] = qas['answers'][i]
        record['answer'] = qas['answers'][i]['answer']
        expanded_rows.append(record)

expanded_df = pd.DataFrame(expanded_rows)

print(f"展开后的DataFrame形状: {expanded_df.shape}")
expanded_df.head(2)

展开后的DataFrame形状: (590, 13)


Unnamed: 0,id,title,abstract,figures_and_tables,question,question_id,nlp_background,topic_background,paper_read,search_query,question_writer,answers_ori,answer
0,1909.08402,Enriching BERT with Knowledge Graph Embeddings...,"In this paper, we focus on the classification ...",{'caption': ['Table 1: Availability of additio...,By how much do they outperform standard BERT?,f5cf8738e8d211095bb89350ed05ee7f9997eb19,five,familiar,no,,5053f146237e8fc8859ed3984b5d3f02f39266b7,"{'answer': [{'unanswerable': False, 'extractiv...","[{'unanswerable': False, 'extractive_spans': [..."
0,1909.08402,Enriching BERT with Knowledge Graph Embeddings...,"In this paper, we focus on the classification ...",{'caption': ['Table 1: Availability of additio...,What dataset do they use?,bed527bcb0dd5424e69563fba4ae7e6ea1fca26a,five,familiar,no,,5053f146237e8fc8859ed3984b5d3f02f39266b7,"{'answer': [{'unanswerable': False, 'extractiv...","[{'unanswerable': False, 'extractive_spans': [..."


In [None]:
import uuid
import os
# Directory that holds one "<paper id>.pdf" file per paper (placeholder value).
pdf_path = "xxxx/qasper/documents"

paper_ids = expanded_df['id']

# Name-based UUID (version 5, DNS namespace) computed from the paper id, so the
# same id always yields the same UUID.
expanded_df['doc_uuid'] = [
    str(uuid.uuid5(uuid.NAMESPACE_DNS, paper_id)) for paper_id in paper_ids
]
# Location of the paper's PDF inside `pdf_path`.
expanded_df['doc_path'] = [
    os.path.join(pdf_path, paper_id + ".pdf") for paper_id in paper_ids
]
print(expanded_df.shape)
expanded_df.head(2)


(590, 15)


Unnamed: 0,id,title,abstract,figures_and_tables,question,question_id,nlp_background,topic_background,paper_read,search_query,question_writer,answers_ori,answer,doc_uuid,doc_path
0,1909.08402,Enriching BERT with Knowledge Graph Embeddings...,"In this paper, we focus on the classification ...",{'caption': ['Table 1: Availability of additio...,By how much do they outperform standard BERT?,f5cf8738e8d211095bb89350ed05ee7f9997eb19,five,familiar,no,,5053f146237e8fc8859ed3984b5d3f02f39266b7,"{'answer': [{'unanswerable': False, 'extractiv...","[{'unanswerable': False, 'extractive_spans': [...",df4a3d95-a9f7-58ee-8584-48b8ff161556,/mnt/data/wangshu/mmrag/qasper/documents/1909....
0,1909.08402,Enriching BERT with Knowledge Graph Embeddings...,"In this paper, we focus on the classification ...",{'caption': ['Table 1: Availability of additio...,What dataset do they use?,bed527bcb0dd5424e69563fba4ae7e6ea1fca26a,five,familiar,no,,5053f146237e8fc8859ed3984b5d3f02f39266b7,"{'answer': [{'unanswerable': False, 'extractiv...","[{'unanswerable': False, 'extractive_spans': [...",df4a3d95-a9f7-58ee-8584-48b8ff161556,/mnt/data/wangshu/mmrag/qasper/documents/1909....


In [45]:
# Columns that should lead the table, in exactly this order.
prior_cols = ['id', 'question_id', 'question', 'answer', 'doc_uuid', 'doc_path']

# All other columns follow; Index.difference hands them back sorted by name.
trailing_cols = expanded_df.columns.difference(prior_cols).tolist()
reorder = prior_cols + trailing_cols

expanded_df = expanded_df.loc[:, reorder]

print(expanded_df.shape)
expanded_df.head(2)

(590, 15)


Unnamed: 0,id,question_id,question,answer,doc_uuid,doc_path,abstract,answers_ori,figures_and_tables,nlp_background,paper_read,question_writer,search_query,title,topic_background
0,1909.08402,f5cf8738e8d211095bb89350ed05ee7f9997eb19,By how much do they outperform standard BERT?,"[{'unanswerable': False, 'extractive_spans': [...",df4a3d95-a9f7-58ee-8584-48b8ff161556,/mnt/data/wangshu/mmrag/qasper/documents/1909....,"In this paper, we focus on the classification ...","{'answer': [{'unanswerable': False, 'extractiv...",{'caption': ['Table 1: Availability of additio...,five,no,5053f146237e8fc8859ed3984b5d3f02f39266b7,,Enriching BERT with Knowledge Graph Embeddings...,familiar
0,1909.08402,bed527bcb0dd5424e69563fba4ae7e6ea1fca26a,What dataset do they use?,"[{'unanswerable': False, 'extractive_spans': [...",df4a3d95-a9f7-58ee-8584-48b8ff161556,/mnt/data/wangshu/mmrag/qasper/documents/1909....,"In this paper, we focus on the classification ...","{'answer': [{'unanswerable': False, 'extractiv...",{'caption': ['Table 1: Availability of additio...,five,no,5053f146237e8fc8859ed3984b5d3f02f39266b7,,Enriching BERT with Knowledge Graph Embeddings...,familiar


In [None]:
# Output file for the flattened, question-level table (placeholder value).
save_path = "xxxxx/data/qasper.json"

# Write a JSON array with one object per row, indented by two spaces.
expanded_df.to_json(path_or_buf=save_path, indent=2, orient='records')

## dataset info

In [17]:
# Look at the first training example only: print every field except the
# 'full_text' field, and keep the 'qas' field in `qas` for the next cell.
for example in qasper_dataset['train']:
    for field_name, field_value in example.items():
        if field_name != "full_text":
            print(f"{field_name}: {field_value}")
            if field_name == "qas":
                qas = field_value
    break  # stop after the first example

id: 1909.00694
title: Minimally Supervised Learning of Affective Events Using Discourse Relations
abstract: Recognizing affective events that trigger positive or negative sentiment has a wide range of natural language processing applications but remains a challenging problem mainly because the polarity of an event is not necessarily predictable from its constituent words. In this paper, we propose to propagate affective polarity using discourse relations. Our method is simple and only requires a very small seed lexicon and a large raw corpus. Our experiments using Japanese data show that our method learns affective events effectively without manually labeled data. It also improves supervised learning results when labeled data are small.
qas: {'question': ['What is the seed lexicon?', 'What are the results?', 'How are relations used to propagate polarity?', 'How big is the Japanese data?', 'What are labels available in dataset for supervision?', 'How big are improvements of supervszed l

In [16]:
# Every entry of `qas` is a sequence; print how long each one is.
for field_name, values in qas.items():
    print(f"{field_name}, len {len(values)}")

# Show which keys a single answers entry carries.
first_answers_entry = qas["answers"][0]
print(first_answers_entry.keys())

# Print each question next to its answers entry.
for question_text, answers_entry in zip(qas['question'], qas['answers']):
    print("Q:", question_text)
    print("A:", answers_entry)

question, len 9
question_id, len 9
nlp_background, len 9
topic_background, len 9
paper_read, len 9
search_query, len 9
question_writer, len 9
answers, len 9
dict_keys(['answer', 'annotation_id', 'worker_id'])
Q: What is the seed lexicon?
A: {'answer': [{'unanswerable': False, 'extractive_spans': [], 'yes_no': None, 'free_form_answer': 'a vocabulary of positive and negative predicates that helps determine the polarity score of an event', 'evidence': ['The seed lexicon consists of positive and negative predicates. If the predicate of an extracted event is in the seed lexicon and does not involve complex phenomena like negation, we assign the corresponding polarity score ($+1$ for positive events and $-1$ for negative events) to the event. We expect the model to automatically learn complex phenomena through label propagation. Based on the availability of scores and the types of discourse relations, we classify the extracted event pairs into the following three types.'], 'highlighted_evide