In [130]:
import pandas as pd
from collections import defaultdict
import os

In [1]:
# Read in the NQ train split.
# Every example has either 0 or 1 entries in long_answers (validated); when present, that entry is the gold passage.

In [None]:
# - Remove queries that have no relevant passage, or whose relevant passage maps to a table or other non-text content.
# - Avoid two queries that map to the same Wikipedia page (based on the title): only the first query that maps to a given page is used.

In [3]:
# Load the raw NQ train file; lines=True reads it as JSON Lines (one record per line).
nq_raw = pd.read_json("nq_raw/train.jsonl", lines=True)

In [72]:
# Keep only the entries that have an answer: bool() of an empty long_answers
# value is False, so those rows are masked out.
bool_long_answers = nq_raw["long_answers"].map(bool)
nq_answer_df = nq_raw.loc[bool_long_answers.to_numpy()]

In [73]:
# Report how many queries were dropped for having no long answer.
removed = len(nq_raw) - len(nq_answer_df)
# Guard the ratio so an empty input frame reports 0% instead of raising ZeroDivisionError.
removed_frac = removed / len(nq_raw) if len(nq_raw) else 0.0
print(f"Original: {len(nq_raw)}")
# The `.2%` format scales by 100, rounds to two decimals and appends the percent
# sign, so the sign stays inside the parentheses.
print(f"Num queries removed: {removed} ({removed_frac:.2%})")

Original: 307373
Num queries removed: 174570 (56.794188168772145)%


In [74]:
# Convert the frame to {row_label: {column: value}}. orient="index" produces the
# same mapping as transposing and then calling to_dict(), but without building a
# transposed copy of the whole frame first. Requires unique row labels.
nq_answer = nq_answer_df.to_dict(orient="index")

In [117]:
# Accumulators filled by the conversion loop.
table_answers = []       # entries skipped because their gold passage is a table
dup_titles_entries = []  # entries skipped because their title was already used
corpus2id = {}           # passage text -> document id
queries2id = {}          # query text -> query id
qrels = []               # (query id, document id, score) tuples
corpus = []              # (document id, title, passage text) tuples
title2query = {}         # title -> first query kept for it (de-duplication by wiki article)
query2duplicate = defaultdict(list)  # kept query -> later queries dropped for sharing its title

In [118]:
# Walk every answered entry and build the queries, corpus and qrels.
for entry in nq_answer.values():
    query = entry["question"]
    passage_types = entry["passage_types"]
    # long_answers holds the index of the gold passage among the candidates.
    gold_idx = entry["long_answers"][0]

    # Filter 1: skip entries whose gold passage is a table.
    if passage_types[gold_idx] == "table":
        table_answers.append(entry)
        continue

    # Filter 2: keep only the first query seen per title; later ones are
    # recorded against that first query and skipped.
    title = entry["title"]
    if title in title2query:
        dup_titles_entries.append(entry)
        query2duplicate[title2query[title]].append(query)
        continue
    title2query[title] = query

    candidates = entry["candidates"]
    gold_text = candidates[gold_idx]

    # Assign a fresh id the first time a query text is seen, otherwise reuse it.
    query_id = queries2id.setdefault(query, f"train{len(queries2id)}")

    # Add every non-table candidate passage that is not in the corpus yet.
    for idx, passage in enumerate(candidates):
        if passage_types[idx] == "table" or passage in corpus2id:
            continue
        doc_id = f"doc{len(corpus2id)}"
        corpus2id[passage] = doc_id
        corpus.append((doc_id, title, passage))

    # The gold passage is not a table, so it has a document id by this point.
    qrels.append((query_id, corpus2id[gold_text], 1))

In [91]:
# Spot-check the last four qrels tuples.
# NOTE(review): this cell's execution count is lower than the loop cell's, so the
# recorded output comes from an earlier run of the notebook.
print(qrels[-4:])


[('train110262', 'doc1736505', 1), ('train110263', 'doc1107361', 1), ('train110264', 'doc3348650', 1), ('train110265', 'doc352737', 1)]


In [119]:
# Sizes of what is preserved after filtering: corpus2id, corpus, qrels, queries2id.
tuple(len(kept) for kept in (corpus2id, corpus, qrels, queries2id))

(2692478, 2692478, 50142, 50142)

In [120]:
# Number of entries filtered out: same-title duplicates, then table-answer entries.
tuple(len(dropped) for dropped in (dup_titles_entries, table_answers))

(60124, 22537)

In [127]:
# Inspect five kept queries (positions 5-9), each with the later same-title
# queries that were dropped in its favour.
list(query2duplicate.items())[5:10]

[('where did the bald eagle get its name',
  ['what is the national bird of the united states',
   'what kind of bird is a bald eagle',
   'when do bald eagles get their distinctive white head and tail feathers',
   'where do bald eagles live in the united states',
   'where do bald eagles live in the world',
   'where are bald eagles found in the world',
   'where do bald eagles live in the us',
   'how many years can a bald eagle live',
   'is the bald eagle a predator or prey']),
 ('who played the crooked man in the conjuring 2',
  ['who played the nun in the conjuring two',
   'does conjuring 2 take place before conjuring 1',
   'what is the name of the nun in conjuring 2']),
 ('who plays little sweet on dr pepper commercial',
  ['who plays lil sweet on the dr pepper commercial',
   'who played lil sweet in the diet dr pepper commercial',
   'who is in the diet dr. pepper commercial',
   'who did kelly clarkson beat on american idol',
   'who is the singer in diet dr pepper commerc

In [129]:
# Build the qrels, corpus and queries DataFrames that will be written to disk.
qrels_cols = ["query-id", "corpus-id", "score"]
queries_cols = ["_id", "text"]
corpus_cols = ["_id", "title", "text"]

qrels_df = pd.DataFrame(qrels, columns=qrels_cols)
corpus_df = pd.DataFrame(corpus, columns=corpus_cols)
# queries2id maps query text -> id; flip each pair so the id comes first.
queries_df = pd.DataFrame(
    [(query_id, query_text) for query_text, query_id in queries2id.items()],
    columns=queries_cols,
)


In [136]:
# Display the qrels frame (columns: query-id, corpus-id, score).
qrels_df

Unnamed: 0,query-id,corpus-id,score
0,train0,doc49,1
1,train1,doc482,1
2,train2,doc515,1
3,train3,doc556,1
4,train4,doc582,1
...,...,...,...
50137,train50137,doc2692198,1
50138,train50138,doc2692203,1
50139,train50139,doc2692231,1
50140,train50140,doc2692395,1


In [138]:
# Write the dataset under `base`: qrels as TSV, corpus and queries as JSON Lines.
# NOTE(review): the qrels for these train queries are written to test.tsv —
# presumably to match what the downstream loader expects; confirm the split name.
base = "beir/nq-train-new/"
os.makedirs(f"{base}qrels", exist_ok=True)
qrels_df.to_csv(f"{base}qrels/test.tsv", sep="\t", index=False, columns=qrels_cols)
corpus_df.to_json(f"{base}corpus.jsonl", orient="records", lines=True)
queries_df.to_json(f"{base}queries.jsonl", orient="records", lines=True)

In [18]:
# Show the question text of the example held in `y`.
# NOTE(review): `y` is not assigned in any visible cell — presumably a single-row
# selection from nq_raw; as written this cell fails on a fresh kernel. TODO confirm.
y["question"].values

array(['who elects the senators and house of representatives'],
      dtype=object)

In [20]:
# Print only the candidates of the example in `y` whose passage type is "text".
# NOTE(review): `y` is not assigned in any visible cell — TODO confirm its origin.
for passage, passage_type in zip(y["candidates"].values[0], y["passage_types"].values[0]):
    if passage_type != "text":
        continue
    print(passage)

535 voting members
The United States Congress is the bicameral legislature of the federal government of the United States consisting of two chambers: the Senate and the House of Representatives.
The Congress meets in the Capitol in Washington, D.C. Both senators and representatives are chosen through direct election, though vacancies in the Senate may be filled by a gubernatorial appointment. Congress has 535 voting members: 435 Representatives and 100 Senators. The House of Representatives has six non-voting members in addition to its 435 voting members. These members can, however, sit on congressional committees and introduce legislation. These members represent Puerto Rico, American Samoa, Guam, the Northern Mariana Islands, the U.S. Virgin Islands, and Washington, D.C.
The members of the House of Representatives serve two-year terms representing the people of a single constituency, known as a "district". Congressional districts are apportioned to states by population using the Unit

In [38]:
# Recompute the has-long-answer mask (same expression as the earlier filtering cell).
bool_long_answers = nq_raw["long_answers"].map(bool)

In [41]:
# Display the rows of nq_raw whose long_answers value is truthy (i.e. non-empty).
nq_raw[bool_long_answers.values]

Unnamed: 0,question,title,candidates,passage_types,long_answers
1,when did richmond last play in a preliminary f...,Richmond Football Club,[Richmond Football Club\n\n\n\n\n\nNames\n\n\n...,"[table, table, table, table, table, table, tab...",[77]
2,who sang what in the world's come over you,Jack Scott (singer),[Jack Scott\n\n\nBirth name\nGiovanni Domenico...,"[table, table, table, table, table, table, tab...",[14]
3,who produces the most wool in the world,Wool,[The examples and perspective in this article ...,"[table, table, text, text, text, text, text, t...",[58]
4,where does alaska the last frontier take place,Alaska: The Last Frontier,[Alaska: The Last Frontier\nTitle card (Season...,"[table, table, table, table, table, table, tab...",[23]
6,a day to remember all i want cameos,All I Want (A Day to Remember song),"[""All I Want""\n\n\n\nIllustration by Mike C. H...","[table, table, table, table, table, table, tab...",[35]
...,...,...,...,...,...
307362,when does new dragon ball super episode air,List of Dragon Ball Super episodes,[Dragon Ball Super is a Japanese anime televis...,"[text, text, text, text, table, table, table, ...",[4]
307363,who plays penny's father on big bang,Keith Carradine,[Keith Carradine\n\n\n\nCarradine at the 2006 ...,"[table, table, table, table, table, table, tab...",[23]
307365,who played lulu hogg on dukes of hazzard,Peggy Rea,[Peggy Rea\n\n\nBorn\nPeggy Jane Rea\n(1921-03...,"[table, table, table, table, table, table, tex...",[9]
307368,when did india last win the world cup in hockey,Hockey World Cup,"[Men's Hockey World Cup\n\nCurrent season, com...","[table, table, table, table, table, table, tab...",[22]
