In [2]:
import os
import re
import json
import numpy as np
import random
from sklearn.model_selection import train_test_split

SEED=42

def set_seed(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed,np.random.seed):
        seeder(seed)

# Seed once at start-up so everything below is reproducible.
set_seed(SEED)

In [12]:
# Input: SQuAD training-set JSON (opened and parsed with json.load further down).
reqa_json='../../../data/SQuAD/train.json'
# Input: SQuAD test-set JSON (opened and parsed with json.load further down).
reqa_test_json='../../../data/SQuAD/test.json'
# Output: directory that receives the generated corpus / query / qrels files.
output_dir='../../../data/squad_formatted/'

In [4]:
# Load the training JSON. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default locale (which is not UTF-8 everywhere).
with open(reqa_json,encoding='utf-8') as f:
    data=json.load(f)
# Show the top-level keys as the cell output.
data.keys()

dict_keys(['questions', 'question_ids', 'answers'])

In [5]:
# Replace every line break inside string entries with a space, in place, so
# each value can later be written as a single line. Non-string entries are
# left untouched.
for entries in data.values():
    for pos,item in enumerate(entries):
        if not isinstance(item,str):
            continue
        entries[pos]=item.replace('\n',' ')

In [6]:
# Report the number of question-answer pairs and how many distinct answers
# and distinct questions they contain.
all_questions,all_answers=data['questions'],data['answers']
print('size of q-a pairs:',len(all_questions))
print('size of unique answers:',len({*all_answers}))
print('size of unique questions:',len({*all_questions}))

size of q-a pairs: 87599
size of unique answers: 58934
size of unique questions: 87355


In [7]:
# Reseed so that the shuffle below gives the same order on every re-run of this cell.
set_seed(SEED)
data_arr=[pair for pair in zip(data['questions'],data['answers'])]
random.shuffle(data_arr)
# shuffle=False: the list was already shuffled above, so the split is a plain
# cut of that order with 10% held out as dev.
train,dev=train_test_split(data_arr,shuffle=False,test_size=0.1)
print('train size',len(train),'dev size',len(dev))
# Turn each list of (question, answer) pairs back into two parallel lists.
train={
    'questions':[question for question,_ in train],
    'answers':[answer for _,answer in train],
}
dev={
    'questions':[question for question,_ in dev],
    'answers':[answer for _,answer in dev],
}


train size 78839 dev size 8760


In [13]:
conf={
    'train':train,
    'dev':dev
}
# open(..., 'w') does not create missing directories, so make sure the
# destination exists before writing anything.
os.makedirs(output_dir,exist_ok=True)
# The loop variable is deliberately not called `data`: rebinding that name
# here would replace the raw dataset loaded earlier with the dev split, and
# re-running an earlier cell would then silently operate on the wrong data.
for name,split in conf.items():
    # Answer corpus: de-duplicate while keeping first-seen order (dict keys are
    # insertion-ordered), then use each answer's position as its id.
    unique_answers=list(dict.fromkeys(split['answers']))
    answer_dict={answer:idx for idx,answer in enumerate(unique_answers)}
    print(f'[{name}] size of unique answers:',len(unique_answers))
    # Explicit UTF-8 so the files do not depend on the platform's default encoding.
    with open(os.path.join(output_dir,f'{name}_corpus.tsv'),'w',encoding='utf-8') as f:
        for idx,answer in enumerate(unique_answers):
            f.write(f'{idx}\t-\t{answer}\n')
    # Queries: same de-duplication and numbering scheme as for the answers.
    unique_questions=list(dict.fromkeys(split['questions']))
    question_dict={q:idx for idx,q in enumerate(unique_questions)}
    print(f'[{name}] size of unique questions:',len(unique_questions))
    with open(os.path.join(output_dir,f'{name}_query.txt'),'w',encoding='utf-8') as f:
        for idx,q in enumerate(unique_questions):
            f.write(f'{idx}\t{q}\n')
    # qrels: one (question id, answer id) pair per line; a question with
    # several answers takes several lines (see 178627 in the MS MARCO dev set
    # for an example).
    with open(os.path.join(output_dir,f'qrels_{name}.tsv'),'w',encoding='utf-8') as f:
        for q,a in zip(split['questions'],split['answers']):
            f.write(f'{question_dict[q]}\t{answer_dict[a]}\n')
    print(f'[{name}] size of q-a pairs:',len(split['questions']))

[train] size of unique answers: 54970
[train] size of unique questions: 78633
[train] size of q-a pairs: 78839
[dev] size of unique answers: 8351
[dev] size of unique questions: 8755
[dev] size of q-a pairs: 8760


## 接下来处理测试集（注意：为了方便，变量名与上面重名，会覆盖之前的值）

In [14]:
# Load the test JSON. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default locale. Note that this rebinds `data`.
with open(reqa_test_json,encoding='utf-8') as f:
    data=json.load(f)
# Show the top-level keys as the cell output.
data.keys()

dict_keys(['questions', 'candidates', 'ground_truths'])

In [15]:
# Replace every line break inside string entries with a space, in place, so
# each value can later be written as a single line. Entries that are not
# strings are skipped by the isinstance check.
for entries in data.values():
    for pos,item in enumerate(entries):
        if not isinstance(item,str):
            continue
        entries[pos]=item.replace('\n',' ')

In [16]:
# Test corpus: de-duplicate the candidate answers while keeping first-seen
# order (dict keys are insertion-ordered); an answer's position is its id.
unique_answers=list(dict.fromkeys(data['candidates']))
answer_dict={answer:idx for idx,answer in enumerate(unique_answers)}
print('size of unique answers:',len(unique_answers))
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open(os.path.join(output_dir,'test_corpus.tsv'),'w',encoding='utf-8') as f:
    for idx,answer in enumerate(unique_answers):
        f.write(f'{idx}\t-\t{answer}\n')

size of unique answers: 10246


In [17]:
# Test queries: de-duplicate the questions while keeping first-seen order
# (dict keys are insertion-ordered); a question's position is its id.
unique_questions=list(dict.fromkeys(data['questions']))
question_dict={q:idx for idx,q in enumerate(unique_questions)}
print('size of unique questions:',len(unique_questions))
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open(os.path.join(output_dir,'test_query.txt'),'w',encoding='utf-8') as f:
    for idx,q in enumerate(unique_questions):
        f.write(f'{idx}\t{q}\n')

size of unique questions: 10539


In [18]:
# qrels format: one (question id, answer id) pair per line; a question with
# several answers takes several lines (see 178627 in the MS MARCO dev set for
# an example).
used_answers=[]
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open(os.path.join(output_dir,'qrels_test.tsv'),'w',encoding='utf-8') as f:
    cnt=0
    for q,ground_truths in zip(data['questions'],data['ground_truths']):
        for c_id in ground_truths:
            # Each ground-truth entry is an index into data['candidates'];
            # map it to the de-duplicated answer id via the answer text.
            a=data['candidates'][c_id]
            used_answers.append(c_id)
            q_id=question_dict[q]
            a_id=answer_dict[a]
            f.write(f'{q_id}\t{a_id}\n')
            cnt+=1
    print('size of q-a pairs:',cnt)
    # NOTE(review): this counts distinct candidate indices, not distinct answer
    # ids; the two differ if data['candidates'] contains repeated texts.
    # Confirm which one is intended.
    print('size of used answers:',len(set(used_answers)))


size of q-a pairs: 11396
size of used answers: 7087
