In [1]:
%cd ..

/home/qwj/code/HDInstruct


In [2]:
import os
os.environ["HF_HOME"] = "/home/qwj/hfcache"
os.environ['DATASETS_OFFLINE'] = '1'
from datasets import load_dataset
import pandas as pd
import json

## CoQA

In [3]:
# download https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json to ./data/raw
coqa_df = pd.DataFrame.from_dict(json.load(open('./data/raw/coqa-dev-v1.0.json'))['data'])
coqa_df.head()

Unnamed: 0,source,id,filename,story,questions,answers,additional_answers,name
0,mctest,3dr23u6we5exclen4th8uq9rb42tel,mc160.test.41,"Once upon a time, in a barn near a farm house,...","[{'input_text': 'What color was Cotton?', 'tur...","[{'span_start': 59, 'span_end': 93, 'span_text...","{'0': [{'span_start': 68, 'span_end': 93, 'spa...",mc160.test.41
1,mctest,3azhrg4cu4ktme1zh7c2ro3pn2430d,mc500.test.4,Once there was a beautiful fish named Asta. As...,[{'input_text': 'what was the name of the fish...,"[{'span_start': 37, 'span_end': 43, 'span_text...","{'0': [{'span_start': 38, 'span_end': 42, 'spa...",mc500.test.4
2,race,3ioen3p9s7jsqm9zwse0cwyj3kq612,high15012.txt,"My doorbell rings. On the step, I find the eld...","[{'input_text': 'Who is at the door?', 'turn_i...","[{'span_start': 19, 'span_end': 115, 'span_tex...","{'0': [{'span_start': 43, 'span_end': 63, 'spa...",high15012.txt
3,cnn,36v4q8r5zk0iwte84nbw2t3d0qzqmg,cnn_8847b303dd9f9ec6f82a50567b6c3505e5238a09.s...,"(CNN) -- Dennis Farina, the dapper, mustachioe...","[{'input_text': 'Is someone in showbiz?', 'tur...","[{'span_start': 8, 'span_end': 181, 'span_text...","{'0': [{'span_start': 8, 'span_end': 172, 'spa...",cnn_8847b303dd9f9ec6f82a50567b6c3505e5238a09.s...
4,mctest,3a1pq49wvhh8nbtgsb549nn9bzth12,mc500.train.257,Kendra and Quinton travel to and from school e...,[{'input_text': 'Where do Quinton and Kendra t...,"[{'span_start': 0, 'span_end': 54, 'span_text'...","{'0': [{'span_start': 0, 'span_end': 54, 'span...",mc500.train.257


### Restructure

In [4]:
def expand_row(row):
    # 对于每行，生成与问题数相等的行数
    rows = []
    for question_index, question in enumerate(row['questions']):
        # 创建一个字典来表示新的行
        new_row = {
            'id': f"coqa_{row['id']}_{question_index}",
            'story': row['story'],
            'question': question['input_text'],
            'answer': row['answers'][question_index]['input_text'],
            'additional_answers': [row['additional_answers'][str(i)][question_index]['input_text'] for i in range(3)]
        }
        rows.append(new_row)
    return rows

expanded_rows = coqa_df.apply(expand_row, axis=1)
flattened_rows = [row for sublist in expanded_rows for row in sublist]
new_coqa_df = pd.DataFrame(flattened_rows)
new_coqa_df.head()

Unnamed: 0,id,story,question,answer,additional_answers
0,coqa_3dr23u6we5exclen4th8uq9rb42tel_0,"Once upon a time, in a barn near a farm house,...",What color was Cotton?,white,"[white, white, white]"
1,coqa_3dr23u6we5exclen4th8uq9rb42tel_1,"Once upon a time, in a barn near a farm house,...",Where did she live?,in a barn,"[in a barn, in a barn, in a barn near]"
2,coqa_3dr23u6we5exclen4th8uq9rb42tel_2,"Once upon a time, in a barn near a farm house,...",Did she live alone?,no,"[no, No, no]"
3,coqa_3dr23u6we5exclen4th8uq9rb42tel_3,"Once upon a time, in a barn near a farm house,...",Who did she live with?,with her mommy and 5 sisters,"[her mommy and 5 other sisters, her mommy and ..."
4,coqa_3dr23u6we5exclen4th8uq9rb42tel_4,"Once upon a time, in a barn near a farm house,...",What color were her sisters?,orange and white,"[orange with white tiger stripes, orange, orange]"


### Deduplicate Additional Answers

In [5]:
def deduplicate_additional_answers(row):
    # 从额外的答案中删除重复的答案
    additional_answers = set(row['additional_answers'])
    additional_answers.discard(row['answer'])
    return list(additional_answers)

new_coqa_df['additional_answers'] = new_coqa_df.apply(deduplicate_additional_answers, axis=1)
new_coqa_df.head()
    

Unnamed: 0,id,story,question,answer,additional_answers
0,coqa_3dr23u6we5exclen4th8uq9rb42tel_0,"Once upon a time, in a barn near a farm house,...",What color was Cotton?,white,[]
1,coqa_3dr23u6we5exclen4th8uq9rb42tel_1,"Once upon a time, in a barn near a farm house,...",Where did she live?,in a barn,[in a barn near]
2,coqa_3dr23u6we5exclen4th8uq9rb42tel_2,"Once upon a time, in a barn near a farm house,...",Did she live alone?,no,[No]
3,coqa_3dr23u6we5exclen4th8uq9rb42tel_3,"Once upon a time, in a barn near a farm house,...",Who did she live with?,with her mommy and 5 sisters,[her mommy and 5 other sisters]
4,coqa_3dr23u6we5exclen4th8uq9rb42tel_4,"Once upon a time, in a barn near a farm house,...",What color were her sisters?,orange and white,"[orange, orange with white tiger stripes]"


### Unify Column Names

In [7]:
def unify_column_names(row):
    # 将列名统一为小写
    return {
        'id': f"coqa_{row['id']}",
        'context': row['story'],
        'question': row['question'],
        'ground_truth': [row['answer']] + row['additional_answers']
    }

unified_coqa_df = new_coqa_df.apply(unify_column_names, axis=1, result_type='expand')
unified_coqa_df.head()

Unnamed: 0,id,context,question,ground_truth
0,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_0,"Once upon a time, in a barn near a farm house,...",What color was Cotton?,[white]
1,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_1,"Once upon a time, in a barn near a farm house,...",Where did she live?,"[in a barn, in a barn near]"
2,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_2,"Once upon a time, in a barn near a farm house,...",Did she live alone?,"[no, No]"
3,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_3,"Once upon a time, in a barn near a farm house,...",Who did she live with?,"[with her mommy and 5 sisters, her mommy and 5..."
4,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_4,"Once upon a time, in a barn near a farm house,...",What color were her sisters?,"[orange and white, orange, orange with white t..."


### save 

In [8]:
!mkdir -p ./data/processed

In [10]:
unified_coqa_df.to_json('./data/processed/coqa.json', orient='records', lines=True)

## natural questions

In [13]:
# if you can reach huggingface , you can load_dataset directly
nq_ds = load_dataset("nq_open", split='validation', cache_dir="/home/qwj/hfcache/datasets")

Using the latest cached version of the dataset since nq_open couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'nq_open' at /home/qwj/hfcache/datasets/nq_open/nq_open/0.0.0/3e24b5c209e8f578bd6f5ee795167a3577674383 (last modified on Fri Mar  1 09:15:18 2024).


In [14]:
nq_ds

Dataset({
    features: ['question', 'answer'],
    num_rows: 3610
})

### Unify col names and save

In [15]:
nq_df = pd.DataFrame(nq_ds)
nq_df.head()


Unnamed: 0,question,answer
0,when was the last time anyone was on the moon,"[14 December 1972 UTC, December 1972]"
1,who wrote he ain't heavy he's my brother lyrics,"[Bobby Scott, Bob Russell]"
2,how many seasons of the bastard executioner ar...,"[one, one season]"
3,when did the eagles win last super bowl,[2017]
4,who won last year's ncaa women's basketball,[South Carolina]


In [16]:
new_nq = []
for idx, row in nq_df.iterrows():
    new_nq.append({
        'id': f"nq_{idx}",
        'question': row['question'],
        'ground_truth': row['answer']
    })

new_nq_df = pd.DataFrame(new_nq)
new_nq_df.head()

Unnamed: 0,id,question,ground_truth
0,nq_0,when was the last time anyone was on the moon,"[14 December 1972 UTC, December 1972]"
1,nq_1,who wrote he ain't heavy he's my brother lyrics,"[Bobby Scott, Bob Russell]"
2,nq_2,how many seasons of the bastard executioner ar...,"[one, one season]"
3,nq_3,when did the eagles win last super bowl,[2017]
4,nq_4,who won last year's ncaa women's basketball,[South Carolina]


In [17]:
new_nq_df.to_json('./data/processed/nq.json', orient='records', lines=True)

## TriviaQA

In [18]:
triviaqa_ds = load_dataset("trivia_qa", "rc.nocontext", split='validation', cache_dir="/home/qwj/hfcache/datasets")
triviaqa_ds

Using the latest cached version of the dataset since trivia_qa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'rc.nocontext' at /home/qwj/hfcache/datasets/trivia_qa/rc.nocontext/0.0.0/0f7faf33a3908546c6fd5b73a660e0f8ff173c2f (last modified on Fri Mar  1 09:39:04 2024).


Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 17944
})

In [19]:
import pandas as pd
id_mem = set()
def remove_dups(batch):
    if batch['question_id'][0] in id_mem:
        return {_:[] for _ in batch.keys()}
    id_mem.add(batch['question_id'][0])
    return batch

triviaqa_ds = triviaqa_ds.map(remove_dups, batch_size=1, batched=True, load_from_cache_file=False)
assert pd.Series([_['question_id'] for _ in triviaqa_ds]).value_counts().max() == 1
trivia_df = pd.DataFrame(triviaqa_ds)
trivia_df.head()

Map:   0%|          | 0/17944 [00:00<?, ? examples/s]

Unnamed: 0,question,question_id,question_source,entity_pages,search_results,answer
0,Who was the man behind The Chipmunks?,tc_2,http://www.triviacountry.com/,"{'doc_source': [], 'filename': [], 'title': []...","{'description': [], 'filename': [], 'rank': []...","{'aliases': ['David Seville'], 'normalized_ali..."
1,Which Lloyd Webber musical premiered in the US...,tc_33,http://www.triviacountry.com/,"{'doc_source': [], 'filename': [], 'title': []...","{'description': [], 'filename': [], 'rank': []...","{'aliases': ['Sunset Blvd', 'West Sunset Boule..."
2,Who was the next British Prime Minister after ...,tc_40,http://www.triviacountry.com/,"{'doc_source': [], 'filename': [], 'title': []...","{'description': [], 'filename': [], 'rank': []...","{'aliases': ['Sir Henry Campbell-Bannerman', '..."
3,Who had a 70s No 1 hit with Kiss You All Over?,tc_49,http://www.triviacountry.com/,"{'doc_source': [], 'filename': [], 'title': []...","{'description': [], 'filename': [], 'rank': []...","{'aliases': ['Internal exile', 'Exiles', 'Tran..."
4,What claimed the life of singer Kathleen Ferrier?,tc_56,http://www.triviacountry.com/,"{'doc_source': [], 'filename': [], 'title': []...","{'description': [], 'filename': [], 'rank': []...","{'aliases': ['Cancer pathology', 'Deaths by ca..."


### filter columns

In [20]:
def make_trivia_row(row):
    answers = set(row['answer']['aliases'])
    answers.add(row['answer']['value'])
    return {
        'id': f"trivia_{row['question_id']}",
        'question': row['question'],
        'ground_truth': list(answers)
    }

new_trivia_df = trivia_df.apply(make_trivia_row, axis=1, result_type='expand')
new_trivia_df.head()

Unnamed: 0,id,question,ground_truth
0,trivia_tc_2,Who was the man behind The Chipmunks?,[David Seville]
1,trivia_tc_33,Which Lloyd Webber musical premiered in the US...,"[Sunset Blvd., Sunset Blvd, Sunset Bulevard, W..."
2,trivia_tc_40,Who was the next British Prime Minister after ...,"[Campbell-Bannerman, Henry Campbell-Bannerman,..."
3,trivia_tc_49,Who had a 70s No 1 hit with Kiss You All Over?,"[Exile in Greek tragedy, Voluntary exile, Inte..."
4,trivia_tc_56,What claimed the life of singer Kathleen Ferrier?,"[Cancer signs, Epithelial cancers, Cancer, Can..."


### save

In [21]:
new_trivia_df.to_json('./data/processed/trivia.json', orient='records', lines=True)

## Squad

In [23]:
# download https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json to ./data/raw/squad-dev-v2.0.json

squad_df = pd.DataFrame.from_dict(json.load(open('./data/raw/squad-dev-v2.0.json'))['data'])
squad_df.head()

Unnamed: 0,title,paragraphs
0,Normans,[{'qas': [{'question': 'In what country is Nor...
1,Computational_complexity_theory,[{'qas': [{'question': 'What branch of theoret...
2,Southern_California,[{'qas': [{'question': 'What is Southern Calif...
3,Sky_(United_Kingdom),[{'qas': [{'question': 'What company was forme...
4,Victoria_(Australia),[{'qas': [{'question': 'What kind of economy d...


### restructure

In [24]:
def expand_row(row):
    # 对于每行，生成与问题数相等的行数
    rows = []
    for paragraph in row['paragraphs']:
        for qa in paragraph['qas']:
            if qa['is_impossible']:
                continue
            new_row = {
                'id': f"squad_{qa['id']}",
                "question": qa['question'],
                "answers": list(set([answer['text'] for answer in qa['answers']])),
                "context": paragraph['context']
            }
            rows.append(new_row)
    return rows

expanded_rows = squad_df.apply(expand_row, axis=1)
flattened_rows = [row for sublist in expanded_rows for row in sublist]
new_squad_df = pd.DataFrame(flattened_rows)
new_squad_df.head()

Unnamed: 0,id,question,answers,context
0,squad_56ddde6b9a695914005b9628,In what country is Normandy located?,[France],The Normans (Norman: Nourmands; French: Norman...
1,squad_56ddde6b9a695914005b9629,When were the Normans in Normandy?,"[10th and 11th centuries, in the 10th and 11th...",The Normans (Norman: Nourmands; French: Norman...
2,squad_56ddde6b9a695914005b962a,From which countries did the Norse originate?,"[Denmark, Iceland and Norway]",The Normans (Norman: Nourmands; French: Norman...
3,squad_56ddde6b9a695914005b962b,Who was the Norse leader?,[Rollo],The Normans (Norman: Nourmands; French: Norman...
4,squad_56ddde6b9a695914005b962c,What century did the Normans first gain their ...,"[the first half of the 10th century, 10th cent...",The Normans (Norman: Nourmands; French: Norman...


In [25]:
len(new_squad_df)

5928

### unify col names

In [26]:
def unify_column_names(row):
    # 将列名统一为小写
    return {
        'id': f"squad_{row['id']}",
        'context': row['context'],
        'question': row['question'],
        'ground_truth': list(set(row['answers']))
    }

unified_squad_df = new_squad_df.apply(unify_column_names, axis=1, result_type='expand')
unified_squad_df.head()

Unnamed: 0,id,context,question,ground_truth
0,squad_squad_56ddde6b9a695914005b9628,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,[France]
1,squad_squad_56ddde6b9a695914005b9629,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[10th and 11th centuries, in the 10th and 11th..."
2,squad_squad_56ddde6b9a695914005b962a,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"[Denmark, Iceland and Norway]"
3,squad_squad_56ddde6b9a695914005b962b,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,[Rollo]
4,squad_squad_56ddde6b9a695914005b962c,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"[the first half of the 10th century, 10th cent..."


### save

In [27]:
unified_squad_df.to_json('./data/processed/squad.json', orient='records', lines=True)