HD stands for hallucination detection.

We utilize 
- the development split of CoQA with 7983 QA pairs, 
- the validation split of NQ with 3610 QA pairs, 
- the validation split of the TriviaQA (rc.nocontext subset) with 9,960 deduplicated QA pairs.
- For the SQuAD dataset, we filter out the QA pairs whose `is_impossible` flag is `True` and use the remaining subset of the development v2.0 split, which contains 5928 QA pairs.

In [1]:
%cd ..

/home/qwj/code/HDInstruct


In [2]:
import os
os.environ["HF_HOME"] = "/home/qwj/hfcache"
os.environ['DATASETS_OFFLINE'] = '1'
import json

In [3]:
# from datasets import Features, GeneratorBasedBuilder, DatasetInfo, SplitGenerator
import datasets

## CoQA

In [4]:
def generate_coqa_examples(filepath = "./data/raw/coqa-dev-v1.0.json"):
    """Yield one QA example per question turn of a CoQA-format JSON file.

    Args:
        filepath: Path to a JSON file whose top-level ``data`` list holds
            stories with ``id``, ``story``, ``questions``, ``answers`` and,
            optionally, ``additional_answers`` (a mapping from annotator key
            to a per-turn answer list).

    Yields:
        dict with the keys ``id`` (``coqa_<story id>_<turn index>``),
        ``context`` (the story text), ``question`` and ``ground_truth``
        (the distinct reference answers, primary answer first).
    """
    # Close the handle deterministically and decode as UTF-8 regardless of
    # the platform's locale default.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    for story in data['data']:
        story_id = story['id']
        context = story['story']
        # A story without extra annotations simply contributes no extra
        # references instead of raising KeyError.
        extra_annotations = list(story.get('additional_answers', {}).values())
        for i, (q, a) in enumerate(zip(story['questions'], story['answers'])):
            candidates = [a['input_text']]
            candidates.extend(turns[i]['input_text'] for turns in extra_annotations)
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so the output is identical from run to run (a set is not).
            answers = list(dict.fromkeys(candidates))
            yield {
                "id": f"coqa_{story_id}_{i}",
                "context": context,
                "question": q['input_text'],
                "ground_truth": answers,
            }

In [5]:
def save_preprocessed_data_coqa(filepath, output_path):
    """Preprocess a raw CoQA file and write the examples as a JSON list.

    Args:
        filepath: Path of the raw CoQA JSON file to read.
        output_path: Path of the JSON file to write.
    """
    processed_data = list(generate_coqa_examples(filepath))
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of depending on the locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)
    print(f"save preprocessed data to {output_path}, total {len(processed_data)} examples")

# Run the CoQA preprocessing on the raw dev file.
save_preprocessed_data_coqa("./data/raw/coqa-dev-v1.0.json", "./data/processed/coqa.json")

save preprocessed data to ./data/processed/coqa.json, total 7983 examples


## natural questions

In [7]:
# Load the validation split of NQ-Open. If the Hugging Face Hub is reachable,
# load_dataset can fetch it directly; here the copy under cache_dir is used.
nq_ds = datasets.load_dataset("nq_open", split='validation', cache_dir="/home/qwj/hfcache/datasets")

Using the latest cached version of the dataset since nq_open couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'nq_open' at /home/qwj/hfcache/datasets/nq_open/nq_open/0.0.0/3e24b5c209e8f578bd6f5ee795167a3577674383 (last modified on Fri Mar  1 09:15:18 2024).


In [8]:
def generate_nq_examples(nq_ds):
    """Yield one example per NQ record, in iteration order.

    Args:
        nq_ds: Iterable of records exposing ``question`` and ``answer`` keys.

    Yields:
        dict with the keys ``id`` (``nq_<position>``), ``question`` and
        ``ground_truth`` (the record's ``answer`` value, passed through as is).
    """
    position = 0
    for record in nq_ds:
        item = {}
        item["id"] = "nq_" + str(position)
        item["question"] = record['question']
        item["ground_truth"] = record['answer']
        yield item
        position += 1

def save_preprocessed_data_nq(nq_ds, output_path):
    """Preprocess the NQ records and write the examples as a JSON list.

    Args:
        nq_ds: Iterable of NQ records, forwarded to ``generate_nq_examples``.
        output_path: Path of the JSON file to write.
    """
    processed_data = list(generate_nq_examples(nq_ds))
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of depending on the locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)
    print(f"save preprocessed data to {output_path}, total {len(processed_data)} examples")

# Run the NQ preprocessing on the loaded validation split.
save_preprocessed_data_nq(nq_ds, "./data/processed/nq.json")

save preprocessed data to ./data/processed/nq.json, total 3610 examples


## TriviaQA

In [4]:
# Load the validation split of TriviaQA's "rc.nocontext" configuration, using the local cache directory.
triviaqa_ds = datasets.load_dataset("trivia_qa", "rc.nocontext", split='validation', cache_dir="/home/qwj/hfcache/datasets")
# Bare expression so the notebook displays the dataset summary (features and row count).
triviaqa_ds

Using the latest cached version of the dataset since trivia_qa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'rc.nocontext' at /home/qwj/hfcache/datasets/trivia_qa/rc.nocontext/0.0.0/0f7faf33a3908546c6fd5b73a660e0f8ff173c2f (last modified on Wed Mar  6 12:47:32 2024).


Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 17944
})

In [13]:
from collections import Counter

# Tally how many rows carry each question_id, then show the ten most frequent
# ids to check whether the split contains duplicates.
qid_counter = Counter(row['question_id'] for row in triviaqa_ds)
qid_counter.most_common(10)

[('tc_33', 2),
 ('tc_40', 2),
 ('tc_49', 2),
 ('tc_56', 2),
 ('tc_106', 2),
 ('tc_137', 2),
 ('tc_217', 2),
 ('tc_219', 2),
 ('tc_241', 2),
 ('tc_261', 2)]

In [5]:
from collections import defaultdict

def generate_triviaqa_examples(triviaqa_ds):
    """Merge TriviaQA rows that share a question_id, reporting conflicting ids.

    Rows are grouped by ``question_id``. A group whose rows disagree on the
    question text is printed and skipped; every other group yields one
    example.

    Args:
        triviaqa_ds: Indexable, iterable collection of rows with the keys
            ``question_id``, ``question`` and ``answer`` (the latter holding
            ``value`` and ``aliases``).

    Yields:
        dict with the keys ``id`` (``triviaqa_<question_id>``), ``question``
        (the single question text, as a string) and ``ground_truth`` (the
        distinct answer values and aliases in first-seen order).
    """
    qid_record = defaultdict(list)
    for i, example in enumerate(triviaqa_ds):
        qid_record[example['question_id']].append(i)
    for qid, indices in qid_record.items():
        # Dicts are used as ordered sets so that both the printed report and
        # the emitted answer list are the same on every run.
        all_questions, all_answers = {}, {}
        for i in indices:
            # Fetch the row once instead of re-indexing the dataset for
            # every field.
            row = triviaqa_ds[i]
            answer = row['answer']
            all_questions[row['question']] = None
            all_answers[answer['value']] = None
            for alias in answer['aliases']:
                all_answers[alias] = None
        if len(all_questions) != 1:
            print(f"question {qid}, {indices} has multiple questions:\n",'\n'.join(all_questions))
            continue
        yield {
            "id": f"triviaqa_{qid}",
            # Emit the text itself rather than a one-element list, matching
            # the other dataset generators in this file.
            "question": next(iter(all_questions)),
            "ground_truth": list(all_answers),
        }

# Materialize the generator so that any question_id with conflicting question texts gets printed.
shit = list(generate_triviaqa_examples(triviaqa_ds))

question bb_956, [2767, 12171] has multiple questions:
 Name Microsoft's hands-free gaming system launched in June 2010, a made-up word alluding to joining?
Name Microsoft's hands-free gaming system launched in November 2010, a made-up word alluding to joining?


In [25]:
# Compare the answers of the two rows reported for question_id bb_956 (indices 2767 and 12171).
triviaqa_ds[2767]['answer']['value'], triviaqa_ds[12171]['answer']['value']

('Kinect', 'Kinect')

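According to Wikipedia, Kinect for Xbox 360 was released in November 2010 in every listed region, so the "June 2010" wording of row 2767 is the erroneous one: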

Release date	Xbox 360
NA: November 4, 2010[2]
EU: November 10, 2010[1]
COL: November 14, 2010[3]
AU: November 18, 2010[4]
JP: November 20, 2010[5]

In [7]:
from collections import defaultdict

def generate_triviaqa_examples(triviaqa_ds, skip_indices=(2767,)):
    """Yield one deduplicated example per TriviaQA question_id.

    Rows sharing a ``question_id`` are merged into a single example whose
    ``ground_truth`` collects every answer value and alias of the group.

    Args:
        triviaqa_ds: Indexable, iterable collection of rows with the keys
            ``question_id``, ``question`` and ``answer`` (the latter holding
            ``value`` and ``aliases``).
        skip_indices: Row positions to drop before grouping. The default
            drops row 2767 of the rc.nocontext validation split, whose
            question text conflicts with the other row of the same
            question_id.

    Yields:
        dict with the keys ``id`` (``triviaqa_<question_id>``), ``question``
        and ``ground_truth`` (distinct answers in first-seen order).

    Raises:
        ValueError: If the rows of one question_id still disagree on the
            question text after the skipped rows are removed.
    """
    skipped = frozenset(skip_indices)
    qid_record = defaultdict(list)
    for i, example in enumerate(triviaqa_ds):
        if i in skipped:
            print('skip one unexpected error question:', i, example['question'])
            continue
        qid_record[example['question_id']].append(i)
    for qid, indices in qid_record.items():
        # Dicts are used as ordered sets so the emitted answer list is the
        # same on every run.
        all_questions, all_answers = {}, {}
        for i in indices:
            # Fetch the row once instead of re-indexing the dataset for
            # every field.
            row = triviaqa_ds[i]
            answer = row['answer']
            all_questions[row['question']] = None
            all_answers[answer['value']] = None
            for alias in answer['aliases']:
                all_answers[alias] = None
        # Explicit check rather than `assert`, which is stripped under -O.
        if len(all_questions) != 1:
            raise ValueError(
                f"question_id {qid} (rows {indices}) has "
                f"{len(all_questions)} distinct question texts"
            )
        yield {
            "id": f"triviaqa_{qid}",
            "question": next(iter(all_questions)),
            "ground_truth": list(all_answers),
        }

# Build the deduplicated TriviaQA examples and write them as a JSON list.
processed_data = list(generate_triviaqa_examples(triviaqa_ds))
print(f"total {len(processed_data)} examples")
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of depending on the locale default.
with open("./data/processed/triviaqa.json", 'w', encoding='utf-8') as f:
    json.dump(processed_data, f, ensure_ascii=False, indent=4)
print("save preprocessed data to ./data/processed/triviaqa.json")

skip one unexpected error question: 2767 Name Microsoft's hands-free gaming system launched in June 2010, a made-up word alluding to joining?
total 9960 examples
save preprocessed data to ./data/processed/triviaqa.json


## Squad

In [12]:
def generate_squad_examples(filepath = "./data/raw/squad-dev-v2.0.json"):
    """Yield one example per answerable question of a SQuAD-format JSON file.

    Args:
        filepath: Path to a JSON file whose top-level ``data`` list holds
            articles with ``paragraphs``; each paragraph has a ``context``
            and a ``qas`` list of questions.

    Yields:
        dict with the keys ``id`` (``squad_<question id>``), ``context``,
        ``question`` and ``ground_truth`` (the distinct answer texts in
        first-seen order). Questions flagged ``is_impossible`` are skipped.
    """
    # Close the handle deterministically and decode as UTF-8 regardless of
    # the platform's locale default.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                # A file without the flag is read as if every question were
                # answerable, instead of raising KeyError.
                if qa.get('is_impossible', False):
                    continue
                # dict.fromkeys removes duplicates while keeping first-seen
                # order, so the output is identical from run to run.
                answers = list(dict.fromkeys(a['text'] for a in qa['answers']))
                yield {
                    "id": f"squad_{qa['id']}",
                    "context": context,
                    "question": qa['question'],
                    "ground_truth": answers,
                }

# Build the answerable SQuAD examples and write them as a JSON list.
processed_data = list(generate_squad_examples())
print(f"total {len(processed_data)} examples")
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of depending on the locale default.
with open("./data/processed/squad.json", 'w', encoding='utf-8') as f:
    json.dump(processed_data, f, ensure_ascii=False, indent=4)
print("save preprocessed data to ./data/processed/squad.json")

total 5928 examples
save preprocessed data to ./data/processed/squad.json


---

Draw 100 random samples from each processed dataset for debugging.

In [14]:
import random

# Fixed seed so the debug subsets are reproducible.
random.seed(0)

# Create the output directory up front so the write below cannot fail on a
# fresh checkout.
os.makedirs("./data/debug", exist_ok=True)

# for dname in ['coqa', 'nq', 'triviaqa', 'squad']:
for dname in ['triviaqa']:
    with open(f"./data/processed/{dname}.json", encoding='utf-8') as f:
        data = json.load(f)
    # select 100 examples for each dataset (all of them if fewer exist,
    # since random.sample raises when asked for more than is available)
    selected = random.sample(data, min(100, len(data)))
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8.
    with open(f"./data/debug/{dname}_sample.json", 'w', encoding='utf-8') as f:
        json.dump(selected, f, ensure_ascii=False, indent=4)