In [10]:
from datasets import load_dataset
import json, os
from tqdm.notebook import tqdm
# Hugging Face Hub repository that hosts the LongEmbed benchmark.
dataset = "dwzhu/LongEmbed"

# Dataset configurations (one per task) that get exported below.
tests = [
    'narrativeqa',
    'summ_screen_fd',
    'qmsum',
    '2wikimqa',
    'passkey',
    'needle',
]

# NOTE(review): not referenced by any cell visible here -- confirm before removing.
dataset_name = ""

In [8]:
def mkdir_if_not_there(outdir):
    """Create the directory `outdir`, including any missing parents.

    Does nothing when the directory already exists.

    Args:
        outdir: Path of the directory to create.
    """
    os.makedirs(name=outdir, exist_ok=True)


In [14]:
def process_dataset_part(dataset, part, outdir):
    """Export one configuration of a Hugging Face dataset to a BEIR-style folder.

    Loads `part` of `dataset` with `load_dataset` and writes three files:

    * ``<outdir>/corpus.jsonl``   -- one JSON object per document with the
      keys ``id``, ``text`` and ``qid``.
    * ``<outdir>/queries.jsonl``  -- one JSON object per query with the keys
      ``id`` and ``text``.
    * ``<outdir>/qrels/test.tsv`` -- tab-separated relevance judgements with a
      ``query-id / corpus-id / score`` header; every listed pair gets score 1.

    Args:
        dataset: Dataset path passed to `load_dataset` (e.g. a Hub repo id).
        part: Configuration name passed to `load_dataset`.
        outdir: Output directory; created (with parents) if missing.
    """
    data = load_dataset(path=dataset, name=part)

    # Creating the nested qrels directory also creates `outdir` itself.
    qrels_dir = os.path.join(outdir, "qrels")
    mkdir_if_not_there(qrels_dir)

    # Save the corpus
    with open(os.path.join(outdir, "corpus.jsonl"), "w", encoding="utf-8") as f:
        for d in tqdm(data['corpus'], desc="Processing corpus"):
            f.write(json.dumps({'id': d['doc_id'], 'text': d['text'], 'qid': d['qid']}) + "\n")

    # Save the queries
    with open(os.path.join(outdir, "queries.jsonl"), "w", encoding="utf-8") as f:
        for d in tqdm(data['queries'], desc="Saving queries: "):
            f.write(json.dumps({'id': d['qid'], 'text': d['text']}) + "\n")

    # Save the relevance judgements inside the qrels directory. Previously the
    # directory was created but test.tsv was written next to corpus.jsonl,
    # leaving qrels/ empty.
    with open(os.path.join(qrels_dir, "test.tsv"), "w", encoding="utf-8") as f:
        f.write("\t".join(["query-id", "corpus-id", "score"]) + "\n")
        for d in tqdm(data['qrels'], desc="Saving qrels: "):
            f.write("\t".join([d['qid'], d['doc_id'], "1"]) + "\n")

In [15]:
# Export every LongEmbed task into its own directory named
# "<outputdir>_<task>", e.g. "../benchmark/longembed_narrativeqa".
outputdir = "../benchmark/longembed"
for test in tests:
    print(f"Processing {test}")
    process_dataset_part(dataset=dataset, part=test, outdir=outputdir + "_" + test)

Processing narrativeqa


Processing corpus:   0%|          | 0/355 [00:00<?, ?it/s]

Saving queries:   0%|          | 0/10449 [00:00<?, ?it/s]

Saving qrels:   0%|          | 0/10449 [00:00<?, ?it/s]

Processing summ_screen_fd


corpus.jsonl:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

qrels.jsonl:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/336 [00:00<?, ? examples/s]

Generating queries split:   0%|          | 0/336 [00:00<?, ? examples/s]

Generating qrels split:   0%|          | 0/336 [00:00<?, ? examples/s]

Processing corpus:   0%|          | 0/336 [00:00<?, ?it/s]

Saving queries:   0%|          | 0/336 [00:00<?, ?it/s]

Saving qrels:   0%|          | 0/336 [00:00<?, ?it/s]

Processing qmsum


corpus.jsonl:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

qrels.jsonl:   0%|          | 0.00/82.0k [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/197 [00:00<?, ? examples/s]

Generating queries split:   0%|          | 0/1527 [00:00<?, ? examples/s]

Generating qrels split:   0%|          | 0/1527 [00:00<?, ? examples/s]

Processing corpus:   0%|          | 0/197 [00:00<?, ?it/s]

Saving queries:   0%|          | 0/1527 [00:00<?, ?it/s]

Saving qrels:   0%|          | 0/1527 [00:00<?, ?it/s]

Processing 2wikimqa


corpus.jsonl:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

qrels.jsonl:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/300 [00:00<?, ? examples/s]

Generating queries split:   0%|          | 0/300 [00:00<?, ? examples/s]

Generating qrels split:   0%|          | 0/300 [00:00<?, ? examples/s]

Processing corpus:   0%|          | 0/300 [00:00<?, ?it/s]

Saving queries:   0%|          | 0/300 [00:00<?, ?it/s]

Saving qrels:   0%|          | 0/300 [00:00<?, ?it/s]

Processing passkey


corpus.jsonl:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

qrels.jsonl:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/800 [00:00<?, ? examples/s]

Generating queries split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating qrels split:   0%|          | 0/400 [00:00<?, ? examples/s]

Processing corpus:   0%|          | 0/800 [00:00<?, ?it/s]

Saving queries:   0%|          | 0/400 [00:00<?, ?it/s]

Saving qrels:   0%|          | 0/400 [00:00<?, ?it/s]

Processing needle


corpus.jsonl:   0%|          | 0.00/28.3M [00:00<?, ?B/s]

queries.jsonl: 0.00B [00:00, ?B/s]

qrels.jsonl:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/800 [00:00<?, ? examples/s]

Generating queries split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating qrels split:   0%|          | 0/400 [00:00<?, ? examples/s]

Processing corpus:   0%|          | 0/800 [00:00<?, ?it/s]

Saving queries:   0%|          | 0/400 [00:00<?, ?it/s]

Saving qrels:   0%|          | 0/400 [00:00<?, ?it/s]

In [17]:
# Largest len(d['text']) over all entries of `corpus`.
# NOTE(review): `corpus` is not assigned in any cell visible here; it must
# already exist in the kernel for this cell to run.
max(len(d['text']) for d in corpus)

382998

In [3]:
import pandas as pd
# Add a "relevant" column to the generated-questions file: for every row it
# holds a one-element list with the row's docid minus its last "-<suffix>" part.
question_file = "../benchmark/mldr/may5_8_20_progress_1.jsonl"
# Output goes next to the input, e.g. "..._progress_1fixed.jsonl".
out_file = question_file.replace(".jsonl", "fixed.jsonl")
questions = pd.read_json(question_file, lines=True)

# rsplit keeps a docid without any "-" intact; the previous rfind/slice
# version silently chopped the last character off such ids (rfind returns -1).
# The comprehension variable also no longer shadows the builtin `id`.
relevant = [[docid.rsplit("-", 1)[0]] for docid in questions['docid']]

questions.insert(2, "relevant", relevant)
questions.to_json(out_file, orient="records", lines=True)

In [None]:
# English training split of MLDR.
train = load_dataset('Shitao/MLDR', 'en', split='train')

# docid -> position of that document in `corpus`.
# NOTE(review): `corpus` is not assigned in any cell visible here; it must
# already exist in the kernel for this cell to run.
corpusid = {doc['docid']: idx for idx, doc in enumerate(corpus)}

# docid -> (example index, index within that example's positive_passages).
trainid = {
    passage['docid']: (ex_idx, pos_idx)
    for ex_idx, example in enumerate(train)
    for pos_idx, passage in enumerate(example['positive_passages'])
}

# Dump every positive passage of the training split, one JSON object per line.
# NOTE(review): this path has no leading "../", unlike the other benchmark
# paths in this notebook -- confirm the intended working directory.
with open("benchmark/mldr/train_corpus.jsonl", "w") as out:
    for example in train:
        for passage in example['positive_passages']:
            out.write(json.dumps(passage) + "\n")

In [6]:
from datasets import load_dataset
import json
# English dev split of MLDR.
devel = load_dataset('Shitao/MLDR', 'en', split='dev')

# docid -> (example index, index within that example's positive_passages).
devid = {
    passage['docid']: (ex_idx, pos_idx)
    for ex_idx, example in enumerate(devel)
    for pos_idx, passage in enumerate(example['positive_passages'])
}

In [17]:
# Number of dev positive-passage docids that are absent from `corpusid`.
len(devid.keys() - corpusid.keys())

0

In [19]:
from datasets import load_dataset
import json
# English test split of MLDR.
test = load_dataset('Shitao/MLDR', 'en', split='test')

# docid -> (example index, index within that example's positive_passages).
testid = {
    passage['docid']: (ex_idx, pos_idx)
    for ex_idx, example in enumerate(test)
    for pos_idx, passage in enumerate(example['positive_passages'])
}

In [None]:
# Number of test positive-passage docids that are absent from `corpusid`.
len(testid.keys() - corpusid.keys())

In [20]:
# Number of examples in `test` (the Shitao/MLDR 'en' test split loaded in the cell above).
len(test)

800