In [1]:
import os
os.environ["HF_HOME"] = "/home/qwj/hfcache"
os.environ['DATASETS_OFFLINE'] = '1'
from datasets import load_dataset
import pandas as pd
import json

# CoQA

In [2]:
# download https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json to ./data
coqa_df = pd.DataFrame.from_dict(json.load(open('./data/coqa-dev-v1.0.json'))['data'])
coqa_df.head()

Unnamed: 0,source,id,filename,story,questions,answers,additional_answers,name
0,mctest,3dr23u6we5exclen4th8uq9rb42tel,mc160.test.41,"Once upon a time, in a barn near a farm house,...","[{'input_text': 'What color was Cotton?', 'tur...","[{'span_start': 59, 'span_end': 93, 'span_text...","{'0': [{'span_start': 68, 'span_end': 93, 'spa...",mc160.test.41
1,mctest,3azhrg4cu4ktme1zh7c2ro3pn2430d,mc500.test.4,Once there was a beautiful fish named Asta. As...,[{'input_text': 'what was the name of the fish...,"[{'span_start': 37, 'span_end': 43, 'span_text...","{'0': [{'span_start': 38, 'span_end': 42, 'spa...",mc500.test.4
2,race,3ioen3p9s7jsqm9zwse0cwyj3kq612,high15012.txt,"My doorbell rings. On the step, I find the eld...","[{'input_text': 'Who is at the door?', 'turn_i...","[{'span_start': 19, 'span_end': 115, 'span_tex...","{'0': [{'span_start': 43, 'span_end': 63, 'spa...",high15012.txt
3,cnn,36v4q8r5zk0iwte84nbw2t3d0qzqmg,cnn_8847b303dd9f9ec6f82a50567b6c3505e5238a09.s...,"(CNN) -- Dennis Farina, the dapper, mustachioe...","[{'input_text': 'Is someone in showbiz?', 'tur...","[{'span_start': 8, 'span_end': 181, 'span_text...","{'0': [{'span_start': 8, 'span_end': 172, 'spa...",cnn_8847b303dd9f9ec6f82a50567b6c3505e5238a09.s...
4,mctest,3a1pq49wvhh8nbtgsb549nn9bzth12,mc500.train.257,Kendra and Quinton travel to and from school e...,[{'input_text': 'Where do Quinton and Kendra t...,"[{'span_start': 0, 'span_end': 54, 'span_text'...","{'0': [{'span_start': 0, 'span_end': 54, 'span...",mc500.train.257


### Restructure

In [3]:
def expand_row(row):
    # 对于每行，生成与问题数相等的行数
    rows = []
    for question_index, question in enumerate(row['questions']):
        # 创建一个字典来表示新的行
        new_row = {
            'id': f"coqa_{row['id']}_{question_index}",
            'story': row['story'],
            'question': question['input_text'],
            'answer': row['answers'][question_index]['input_text'],
            'additional_answers': [row['additional_answers'][str(i)][question_index]['input_text'] for i in range(3)]
        }
        rows.append(new_row)
    return rows

expanded_rows = coqa_df.apply(expand_row, axis=1)
flattened_rows = [row for sublist in expanded_rows for row in sublist]
new_coqa_df = pd.DataFrame(flattened_rows)
new_coqa_df.head()

Unnamed: 0,id,story,question,answer,additional_answers
0,coqa_3dr23u6we5exclen4th8uq9rb42tel_0,"Once upon a time, in a barn near a farm house,...",What color was Cotton?,white,"[white, white, white]"
1,coqa_3dr23u6we5exclen4th8uq9rb42tel_1,"Once upon a time, in a barn near a farm house,...",Where did she live?,in a barn,"[in a barn, in a barn, in a barn near]"
2,coqa_3dr23u6we5exclen4th8uq9rb42tel_2,"Once upon a time, in a barn near a farm house,...",Did she live alone?,no,"[no, No, no]"
3,coqa_3dr23u6we5exclen4th8uq9rb42tel_3,"Once upon a time, in a barn near a farm house,...",Who did she live with?,with her mommy and 5 sisters,"[her mommy and 5 other sisters, her mommy and ..."
4,coqa_3dr23u6we5exclen4th8uq9rb42tel_4,"Once upon a time, in a barn near a farm house,...",What color were her sisters?,orange and white,"[orange with white tiger stripes, orange, orange]"


### Deduplicate Additional Answers

In [4]:
def deduplicate_additional_answers(row):
    # 从额外的答案中删除重复的答案
    additional_answers = set(row['additional_answers'])
    additional_answers.discard(row['answer'])
    return list(additional_answers)

new_coqa_df['additional_answers'] = new_coqa_df.apply(deduplicate_additional_answers, axis=1)
new_coqa_df.head()
    

Unnamed: 0,id,story,question,answer,additional_answers
0,coqa_3dr23u6we5exclen4th8uq9rb42tel_0,"Once upon a time, in a barn near a farm house,...",What color was Cotton?,white,[]
1,coqa_3dr23u6we5exclen4th8uq9rb42tel_1,"Once upon a time, in a barn near a farm house,...",Where did she live?,in a barn,[in a barn near]
2,coqa_3dr23u6we5exclen4th8uq9rb42tel_2,"Once upon a time, in a barn near a farm house,...",Did she live alone?,no,[No]
3,coqa_3dr23u6we5exclen4th8uq9rb42tel_3,"Once upon a time, in a barn near a farm house,...",Who did she live with?,with her mommy and 5 sisters,[her mommy and 5 other sisters]
4,coqa_3dr23u6we5exclen4th8uq9rb42tel_4,"Once upon a time, in a barn near a farm house,...",What color were her sisters?,orange and white,"[orange with white tiger stripes, orange]"


### Unify Column Names

In [6]:
def unify_column_names(row):
    # 将列名统一为小写
    return {
        'id': f"coqa_{row['id']}",
        'context': row['story'],
        'question': row['question'],
        'ground_truth': row['answer'],
        'gt_aliases': row['additional_answers']
    }

unified_coqa_df = new_coqa_df.apply(unify_column_names, axis=1, result_type='expand')
unified_coqa_df.head()

Unnamed: 0,id,context,question,ground_truth,gt_aliases
0,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_0,"Once upon a time, in a barn near a farm house,...",What color was Cotton?,white,[]
1,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_1,"Once upon a time, in a barn near a farm house,...",Where did she live?,in a barn,[in a barn near]
2,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_2,"Once upon a time, in a barn near a farm house,...",Did she live alone?,no,[No]
3,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_3,"Once upon a time, in a barn near a farm house,...",Who did she live with?,with her mommy and 5 sisters,[her mommy and 5 other sisters]
4,coqa_coqa_3dr23u6we5exclen4th8uq9rb42tel_4,"Once upon a time, in a barn near a farm house,...",What color were her sisters?,orange and white,"[orange with white tiger stripes, orange]"


### save 

In [7]:
!mkdir -p ./data/processed

In [8]:
unified_coqa_df.to_json('./data/processed/coqa.json', orient='records', lines=True)

## natural questions