In [2]:
import json

def read_jsonl(filename: str, encoding="utf-8"):
    with open(filename, mode="r", encoding=encoding) as fp:
        content = [json.loads(line.rstrip("\n").strip()) for line in fp]

    return content

def dump_jsonl(filename: str, data, encoding="utf-8"):
    with open(filename, mode="w", encoding=encoding) as fp:
        for line in data:
            json.dump(line, fp)
            fp.write("\n")

In [3]:
# load the mt data and merge the selected sentences into the text spot.

import glob 

mt_data = "/dccstor/jlquinn-mt/qa/longqa/v3/*.jsonl"

mt_files = glob.glob(mt_data)

print(mt_files)

['/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_unanswerable.jsonl.tgt.enpt.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_unanswerable.jsonl.tgt.enpt.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_answerable.jsonl.tgt.ende.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_answerable_bool.jsonl.tgt.enfr.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_answerable_bool.jsonl.tgt.enpt.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_unanswerable.jsonl.tgt.ende.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_test_unanswerable.jsonl.tgt.enpt.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_test_answerable.jsonl.tgt.enfr.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_test_unanswerable.jsonl.tgt.enja.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_unanswerable.jsonl.tgt.enes.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_answerable.jsonl.tgt.enja.jsonl', '/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_answerable_bool.jsonl.tgt.enja.jsonl', '/dccstor/j

In [8]:
import os

for mt_file in mt_files:
    print(mt_file)
    language = mt_file[-8:-6]
    split = mt_file[mt_file.index("_")+1:mt_file.index("_", mt_file.index("_")+1)]
    answerable = "unanswerable" if "unanswerable" in mt_file else "answerable"
    print(answerable)
    os.makedirs(f"/dccstor/srosent3/long_nq_multilingual/annotated_data/{language}/{split}/",exist_ok=True)
    fname = f"/dccstor/srosent3/long_nq_multilingual/annotated_data/{language}/{split}/longNQ_{split}_{answerable}.jsonl"

    examples = read_jsonl(mt_file)
    for example in examples:
        example["passages"][0]["text"] = " ".join(example["passages"][0]["sentences"])

    dump_jsonl(fname, examples)

/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_unanswerable.jsonl.tgt.enpt.jsonl
unanswerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_unanswerable.jsonl.tgt.enpt.jsonl
unanswerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_answerable.jsonl.tgt.ende.jsonl
answerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_answerable_bool.jsonl.tgt.enfr.jsonl
answerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_answerable_bool.jsonl.tgt.enpt.jsonl
answerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_unanswerable.jsonl.tgt.ende.jsonl
unanswerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_test_unanswerable.jsonl.tgt.enpt.jsonl
unanswerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_test_answerable.jsonl.tgt.enfr.jsonl
answerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_test_unanswerable.jsonl.tgt.enja.jsonl
unanswerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_train_unanswerable.jsonl.tgt.enes.jsonl
unanswerable
/dccstor/jlquinn-mt/qa/longqa/v3/longNQ_dev_answerable.jsonl.tgt.enja.jsonl
answerabl

In [9]:
# multilingual passages and get training questions in tsv format with matching passage ids
import pandas as pd
import shutil
import os

# load multilingual json, load english tsv -> match question ids -> copy passage ids 
splits = ['train','dev','test']
answerables = ['answerable', 'unanswerable']
languages = ['es', 'de', 'fr', 'pt', 'ja']

for split in splits:
    questions = []
    for answerable in answerables:
        en_data = pd.read_csv(f"/dccstor/srosent2/generative/appen/final/longNQ/passages_for_index/{split}/question_{split}_{answerable}.tsv", delimiter="\t", dtype={'id':str})
        
        for language in languages:
            print(f"{split} {answerable} {language}")
            mt_data = read_jsonl(f"/dccstor/srosent3/long_nq_multilingual/annotated_data/{language}/{split}/longNQ_{split}_{answerable}.jsonl")
            
            for example in mt_data:
                answers = []
                for answer in example['output']:
                    answers.append(answer['answer'])
                doc_ids = en_data[en_data['id'] == example['id']].iloc[0]['doc-id-list']
                questions.append([example['id'], example['input'], doc_ids, "::".join(answers)])
            os.makedirs(f"/dccstor/srosent3/long_nq_multilingual/retrieval/{language}/{split}/",exist_ok=True)
            pd.DataFrame(questions).to_csv(f"/dccstor/srosent3/long_nq_multilingual/retrieval/{language}/{split}/question_{split}_{answerable}.tsv", index=False, sep='\t')
            shutil.copyfile(f"/dccstor/jlquinn-mt/qa/longqa/v2/passages.en{language}.tsv",f"/dccstor/srosent3/long_nq_multilingual/retrieval/{language}/passages.tsv")

train answerable es
train answerable de
train answerable fr
train answerable pt
train answerable ja
train unanswerable es
train unanswerable de
train unanswerable fr
train unanswerable pt
train unanswerable ja
dev answerable es
dev answerable de
dev answerable fr
dev answerable pt
dev answerable ja
dev unanswerable es
dev unanswerable de
dev unanswerable fr
dev unanswerable pt
dev unanswerable ja
test answerable es
test answerable de
test answerable fr
test answerable pt
test answerable ja
test unanswerable es
test unanswerable de
test unanswerable fr
test unanswerable pt
test unanswerable ja
