In [6]:
import random
from datasets import load_dataset

random.seed(42)  # Fix the random seed so results are reproducible
# Download the training split of the XSum dataset
# Natural question answering, text summarization
xsum = load_dataset("xsum", split="train")
print(xsum)
trivaqa = load_dataset("mandarjoshi/trivia_qa", "rc.nocontext", split="train")
print(trivaqa)

# Math reasoning
gsm8k = load_dataset("openai/gsm8k","main", split="train")
print(gsm8k)
# Coding ability (only the "test" split is requested here)
humaneval = load_dataset("openai/openai_humaneval", split="test")
print(humaneval)

# Multilingual, translation
# NOTE(review): the variable is named `xglue`, but the dataset loaded here is
# "gsarti/flores_101" (config "all", split "dev").
xglue = load_dataset("gsarti/flores_101", "all", split="dev")
print(xglue)


Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 204045
})
Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 138384
})
Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})
Dataset({
    features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
    num_rows: 164
})
Dataset({
    features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence_afr', 'sentence_amh', 'sentence_ara', 'sentence_hye', 'sentence_asm', 'sentence_ast', 'sentence_azj', 'sentence_bel', 'sentence_ben', 'sentence_bos', 'sentence_bul', 'sentence_mya', 'sentence_cat', 'sentence_ceb', 'sentence_zho_simpl', 'sentence_zho_trad', 'sentence_hrv', 'sentence_ces', 'sentence_dan', 'sentence_nld', 'sentence_eng', 'sentence_est', 'sentence_tgl', 'sentence_fin', 'sentence_fra', 'sentence_ful', 'sentence_glg', 'sentence_lug', 'sentence_kat', 'sentence_deu', 'sentence_ell', 'sen

In [3]:
SAMPLE_SIZE = 100  # number of rows drawn from each dataset


def _sample_rows(dataset, k):
    """Return ``k`` rows of ``dataset`` drawn without replacement.

    Row positions are sampled from ``range(len(dataset))`` and only those rows
    are fetched, so the whole dataset is never materialized as a Python list.
    ``random.sample`` selects by position, so for a given RNG state this is
    expected to pick the same rows as ``random.sample(list(dataset), k)``.

    Args:
        dataset: A sized, integer-indexable collection of rows.
        k: Number of rows to draw.

    Returns:
        A list with ``k`` rows, in the order they were drawn.

    Raises:
        ValueError: If ``k`` is larger than ``len(dataset)`` (raised by
            ``random.sample``).
    """
    chosen_positions = random.sample(range(len(dataset)), k)
    return [dataset[position] for position in chosen_positions]


# Draw the samples in a fixed order: each call advances the shared RNG state,
# so reordering these lines would change which rows are selected.
xsum_sample = _sample_rows(xsum, SAMPLE_SIZE)
triviaqa_sample = _sample_rows(trivaqa, SAMPLE_SIZE)
gsm8k_sample = _sample_rows(gsm8k, SAMPLE_SIZE)
humaneval_sample = _sample_rows(humaneval, SAMPLE_SIZE)
flores_sample = _sample_rows(xglue, SAMPLE_SIZE)

In [5]:
# Display the `document` field of the first sampled XSum record as a sanity check.
xsum_sample[0]['document']

'Private Harry Vasey, who was part of the 1st Airborne Battalion, The Border Regiment, was killed during Operation Market Garden in Oosterbeek in 1944.\nNow his identity has been confirmed, the Ministry of Defence (MoD) want to trace his family so his grave can be rededicated in the Netherlands.\nThe MoD said plans were also in place to change his headstone.\nBorn in Durham in May 1916 to Harry Vasey and Annie Young, he enlisted in April 1940 when he lived in Bowburn, County Durham.\nAn MoD spokesman said: "Unfortunately that is about all we know about Private Vasey and his family and that\'s where the trail goes cold.\n"We are hoping that there are some of his family still living in that area."\nSince WW2, a section of the Royal Netherlands Army has been working to identify the graves of unknown soldiers killed in battle.\nThe exhumation reports were scrutinised for clues to the identities of these men and the research was presented to the MoD.\nMr Vasey is one of six Border Regiment 

In [9]:
import os

import re

def sanitize_filename(name):
    """Return ``str(name)`` with illegal file-name characters replaced by ``_``.

    The characters replaced are: backslash, forward slash, ``*``, ``?``, ``:``,
    double quote, ``<``, ``>`` and ``|``. Each occurrence becomes a single
    underscore; all other characters are kept unchanged.

    Args:
        name: Any value; it is converted with ``str()`` before substitution.

    Returns:
        The sanitized string.
    """
    illegal_chars = re.compile(r'[\\/*?:"<>|]')
    text = str(name)
    return illegal_chars.sub('_', text)

# 保存函数
def save_text_samples(samples, key, name_fn, folder):
    """Write one UTF-8 ``.txt`` file per sample into ``folder``.

    Args:
        samples: Iterable of records; each record is indexed with ``key``.
        key: Field of each record whose value is written to the file.
        name_fn: Callable ``(sample, index) -> file id``. The result is passed
            through ``sanitize_filename`` and used as the file name stem.
        folder: Destination directory; created if it does not exist.

    Files are opened in ``"w"`` mode, so an existing file with the same name
    is overwritten.
    """
    os.makedirs(folder, exist_ok=True)
    for index, record in enumerate(samples):
        stem = sanitize_filename(name_fn(record, index))
        target = os.path.join(folder, f"{stem}.txt")
        with open(target, "w", encoding="utf-8") as out:
            out.write(record[key])

# Output locations: one sub-folder of the base directory per dataset.
base_dir = "./prompt_datasets"
paths = {
    _name: os.path.join(base_dir, _name)
    for _name in ("xsum", "triviaqa", "gsm8k", "humaneval", "flores_101")
}

# Export table: (samples, field to write, file-name function, key into `paths`).
_export_plan = (
    (xsum_sample, "document", lambda s, i: s["id"], "xsum"),
    (triviaqa_sample, "question", lambda s, i: s["question_id"], "triviaqa"),
    (gsm8k_sample, "question", lambda s, i: str(i), "gsm8k"),
    (humaneval_sample, "prompt", lambda s, i: s["task_id"], "humaneval"),
    (flores_sample, "sentence_zho_simpl", lambda s, i: s["id"], "flores_101"),
)

# Run the export, one dataset at a time, in the order listed above.
for _samples, _text_key, _name_fn, _dataset in _export_plan:
    save_text_samples(_samples, _text_key, _name_fn, paths[_dataset])

In [11]:
import os

def list_prompt_files_by_dataset(base_dir="./prompt_datasets"):
    """Map each dataset sub-folder of ``base_dir`` to its ``.txt`` file paths.

    Args:
        base_dir: Directory whose immediate sub-directories are treated as
            datasets. Entries that are not directories are ignored.

    Returns:
        A dict ``{dataset_name: [path, ...]}``. Dataset names are inserted in
        sorted order and each path list is sorted, so the result (and anything
        printed from it) does not depend on the order in which the filesystem
        returns directory entries. Paths are ``base_dir/dataset_name/filename``.
        A sub-folder without ``.txt`` entries maps to an empty list.

    Raises:
        OSError: If ``base_dir`` cannot be listed (for example, it does not
            exist).
    """
    dataset_files = {}

    # os.listdir() yields entries in arbitrary order; sort for reproducibility.
    for dataset_name in sorted(os.listdir(base_dir)):
        dataset_path = os.path.join(base_dir, dataset_name)
        if not os.path.isdir(dataset_path):
            continue

        dataset_files[dataset_name] = sorted(
            os.path.join(dataset_path, filename)
            for filename in os.listdir(dataset_path)
            if filename.endswith(".txt")
        )

    return dataset_files

# Example usage: index the prompt files under the default base directory.
files_by_dataset = list_prompt_files_by_dataset()

# Example output: one header line per dataset (name and file count),
# followed by one bullet line per file path.
for dataset, files in files_by_dataset.items():
    print(f"\n📂 Dataset: {dataset} ({len(files)} files)")
    for path in files:
        print(f"  - {path}")


📂 Dataset: triviaqa (100 files)
  - ./prompt_datasets/triviaqa/bb_1294.txt
  - ./prompt_datasets/triviaqa/bb_4237.txt
  - ./prompt_datasets/triviaqa/bb_5366.txt
  - ./prompt_datasets/triviaqa/bb_5933.txt
  - ./prompt_datasets/triviaqa/bb_6469.txt
  - ./prompt_datasets/triviaqa/bt_1205.txt
  - ./prompt_datasets/triviaqa/bt_1809.txt
  - ./prompt_datasets/triviaqa/bt_1876.txt
  - ./prompt_datasets/triviaqa/bt_2568.txt
  - ./prompt_datasets/triviaqa/bt_2575.txt
  - ./prompt_datasets/triviaqa/bt_2586.txt
  - ./prompt_datasets/triviaqa/bt_3003.txt
  - ./prompt_datasets/triviaqa/bt_3627.txt
  - ./prompt_datasets/triviaqa/bt_3912.txt
  - ./prompt_datasets/triviaqa/bt_923.txt
  - ./prompt_datasets/triviaqa/dpql_3288.txt
  - ./prompt_datasets/triviaqa/dpql_4065.txt
  - ./prompt_datasets/triviaqa/dpql_4110.txt
  - ./prompt_datasets/triviaqa/dpql_5185.txt
  - ./prompt_datasets/triviaqa/dpql_6211.txt
  - ./prompt_datasets/triviaqa/jp_1483.txt
  - ./prompt_datasets/triviaqa/jp_1960.txt
  - ./prompt