In [1]:
import os
import re
import json
import numpy as np
import random
from sklearn.model_selection import train_test_split, KFold

SEED = 42


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)


set_seed(SEED)

In [2]:
dataset='6b'
reqa_json=f'../../../data/BioASQ/{dataset}/train.json'
reqa_test_json=f'../../../data/BioASQ/{dataset}/test.json'
output_dir=f'../../../data/bioasq_formatted/{dataset}'

In [3]:
with open(reqa_json) as f:
    data=json.load(f)
data.keys()

dict_keys(['questions', 'answers', 'question_ids'])

In [4]:
for key in data:
    for i in range(len(data[key])):
        if isinstance(data[key][i],str):
            data[key][i]=data[key][i].replace('\r','').replace('\n',' ')

In [5]:
print('size of q-a pairs:',len(data['questions']))
print(f'size of unique answers:',len(set(data['answers'])))
print(f'size of unique questions:',len(set(data['questions'])))

size of q-a pairs: 3093
size of unique answers: 3071
size of unique questions: 2249


In [6]:
set_seed(SEED)
kf = KFold(n_splits=5)
data_arr = list(zip(data['questions'], data['answers']))
random.shuffle(data_arr)
fold = 1
for train_ids, dev_ids in kf.split(data_arr):
    train = [data_arr[i] for i in train_ids]
    dev = [data_arr[i] for i in dev_ids]
    print('train size', len(train), 'dev size', len(dev))
    train = {
        'questions': [x[0] for x in train],
        'answers': [x[1] for x in train],
    }
    dev = {
        'questions': [x[0] for x in dev],
        'answers': [x[1] for x in dev],
    }
    conf = {'train': train, 'dev': dev}
    
    fold_output_dir = os.path.join(output_dir, f'fold_{fold}')
    os.makedirs(fold_output_dir, exist_ok=True)
    for name, data in conf.items():
        # 处理答案
        answer_dict = {}
        unique_answers = []
        with open(os.path.join(fold_output_dir, f'{name}_corpus.tsv'),
                  'w') as f:
            # 答案语料
            for answer in data['answers']:
                if answer in answer_dict:
                    continue
                idx = len(answer_dict)
                unique_answers.append(answer)
                answer_dict[answer] = idx
            print(f'[fold: {fold}][{name}] size of unique answers:',
                  len(unique_answers))
            for idx, answer in enumerate(unique_answers):
                f.write(f'{idx}\t-\t{answer}\n')
            # 问题
            question_dict = {}
            unique_questions = []
            with open(os.path.join(fold_output_dir, f'{name}_query.txt'),
                      'w') as f:
                for q in data['questions']:
                    if q in question_dict:
                        continue
                    idx = len(question_dict)
                    unique_questions.append(q)
                    question_dict[q] = idx
                print(f'[fold: {fold}][{name}] size of unique questions:',
                      len(unique_questions))
                for idx, q in enumerate(unique_questions):
                    f.write(f'{idx}\t{q}\n')
            # rels 里的格式是一行一对，一对多的分多行，参考msmacro dev里的 178627
            with open(os.path.join(fold_output_dir, f'qrels_{name}.tsv'),
                      'w') as f:
                for q, a in zip(data['questions'], data['answers']):
                    q_id = question_dict[q]
                    a_id = answer_dict[a]
                    f.write(f'{q_id}\t{a_id}\n')
                print(f'[fold: {fold}][{name}] size of q-a pairs:',
                      len(data['questions']))
    fold = fold + 1

train size 2474 dev size 619
[fold: 1][train] size of unique answers: 2457
[fold: 1][train] size of unique questions: 1883
[fold: 1][train] size of q-a pairs: 2474
[fold: 1][dev] size of unique answers: 618
[fold: 1][dev] size of unique questions: 556
[fold: 1][dev] size of q-a pairs: 619
train size 2474 dev size 619
[fold: 2][train] size of unique answers: 2458
[fold: 2][train] size of unique questions: 1878
[fold: 2][train] size of q-a pairs: 2474
[fold: 2][dev] size of unique answers: 618
[fold: 2][dev] size of unique questions: 557
[fold: 2][dev] size of q-a pairs: 619
train size 2474 dev size 619
[fold: 3][train] size of unique answers: 2458
[fold: 3][train] size of unique questions: 1872
[fold: 3][train] size of q-a pairs: 2474
[fold: 3][dev] size of unique answers: 617
[fold: 3][dev] size of unique questions: 564
[fold: 3][dev] size of q-a pairs: 619
train size 2475 dev size 618
[fold: 4][train] size of unique answers: 2456
[fold: 4][train] size of unique questions: 1872
[fold: 

## 接下来处理测试集，注意变量跟上面是重名的，为了方便

In [7]:
with open(reqa_test_json) as f:
    data=json.load(f)
data.keys()

dict_keys(['questions', 'candidates', 'ground_truths'])

In [8]:
for key in data:
    for i in range(len(data[key])):
        if isinstance(data[key][i],str):
            data[key][i]=data[key][i].replace('\r','').replace('\n',' ')

In [9]:
answer_dict={}
unique_answers=[]
with open(os.path.join(output_dir,'test_corpus.tsv'),'w') as f:
    for answer in data['candidates']:
        if answer in answer_dict:
            continue
        idx=len(answer_dict)
        unique_answers.append(answer)
        answer_dict[answer]=idx
    print('size of unique answers:',len(unique_answers))
    for idx,answer in enumerate(unique_answers):
        f.write(f'{idx}\t-\t{answer}\n')

size of unique answers: 26176


In [10]:
question_dict={}
unique_questions=[]
with open(os.path.join(output_dir,'test_query.txt'),'w') as f:
    for q in data['questions']:
        if q in question_dict:
            continue
        idx=len(question_dict)
        unique_questions.append(q)
        question_dict[q]=idx
    print('size of unique questions:',len(unique_questions))
    for idx,q in enumerate(unique_questions):
        f.write(f'{idx}\t{q}\n')

size of unique questions: 500


In [11]:
# rels 里的格式是一行一对，一对多的分多行，参考msmacro dev里的 178627
used_answers=[]
with open(os.path.join(output_dir,'qrels_test.tsv'),'w') as f:
    cnt=0
    for q,ground_truths in zip(data['questions'],data['ground_truths']):
        for c_id in ground_truths:
            a=data['candidates'][c_id]
            used_answers.append(c_id)
            q_id=question_dict[q]
            a_id=answer_dict[a]
            f.write(f'{q_id}\t{a_id}\n')
            cnt+=1
    print('size of q-a pairs:',cnt)
    print('size of used answers:',len(set(used_answers)))


size of q-a pairs: 897
size of used answers: 887
