In [1]:
#| default_exp 23_natural-questions-dataset

In [2]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [3]:
#| export
import os, json, pandas as pd, scipy.sparse as sp, numpy as np, argparse

from tqdm.auto import tqdm
from datasets import load_dataset
from dataclasses import dataclass
from huggingface_hub import snapshot_download

from sugar.core import *

# Download data

## `Kaggle`

### Load data

Source: https://www.kaggle.com/datasets/yuitc2502/dpr-dataset

In [73]:
#| export
def load_kaggle_data(fname, key='positive_ctxs'):
    """Read a DPR-style JSON dump and build a query-to-passage relevance matrix.

    The file must contain a JSON list of records. Each record has a
    `question` field and, optionally, a list of passage dicts under `key`.
    Every passage dict carries a `text` field and an identifier stored under
    either `id` or `passage_id`.

    Args:
        fname: Path to the JSON file (read as UTF-8).
        key: Record field holding the passages that become labels.

    Returns:
        A tuple `(queries, labels, lbl_id2idx, matrix)` where
        - `queries` lists the question text of every record, in file order;
        - `labels` lists the passage text of every distinct passage id, in
          order of first appearance;
        - `lbl_id2idx` maps a passage id to its column index in `matrix`;
        - `matrix` is a float32 CSR matrix of shape
          `(len(queries), len(labels))` with a 1 for every listed passage.

    A passage listed more than once for the same question produces repeated
    entries in that row; they are stored as-is (not merged here).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(fname, encoding='utf-8') as file:
        content = json.load(file)

    queries, labels, lbl_id2idx = [], [], {}
    data, indices, indptr = [], [], [0]
    for o in tqdm(content):
        queries.append(o['question'])
        # A missing (or null) passage list means the query has no labels.
        for lbl in (o.get(key) or ()):
            lbl_id = lbl['id'] if 'id' in lbl else lbl['passage_id']

            idx = lbl_id2idx.get(lbl_id)
            if idx is None:
                # First time this passage is seen: allocate the next column.
                idx = len(lbl_id2idx)
                lbl_id2idx[lbl_id] = idx
                labels.append(lbl['text'])
            data.append(1)
            indices.append(idx)
        # Close the row for EVERY query, labelled or not, so that row i of
        # the matrix always corresponds to queries[i].
        indptr.append(len(indices))

    # Pin the shape explicitly instead of letting scipy infer it from the indices.
    matrix = sp.csr_matrix((data, indices, indptr), shape=(len(queries), len(labels)), dtype=np.float32)
    return queries, labels, lbl_id2idx, matrix
    

In [56]:
#| export
@dataclass
class QueryInfo:
    """Queries bundled with their relevance matrix.

    `ids` and `txt` are indexed by the row indices of `mat`, i.e. entry `i`
    of each list belongs to row `i` of the matrix.
    """
    mat: sp.csr_matrix  # relevance matrix, indexed as mat[query_row, label_column]
    ids: list  # query identifiers, aligned with the rows of `mat`
    txt: list  # query text, aligned with the rows of `mat`

    def sample_labels(self, lbl_idx:list):
        """Keep only the label columns `lbl_idx` and drop queries left without labels.

        The object is updated in place: `mat` is restricted to the selected
        columns, then every row with no stored entry among those columns is
        removed, together with the matching entries of `ids` and `txt`.

        Args:
            lbl_idx: Column indices of the labels to keep, in the desired order.
        """
        # Restrict the columns FIRST: whether a query still has a label must be
        # judged on the sampled labels, not on the full label set.
        sampled = self.mat[:, lbl_idx]
        data_idx = np.where(sampled.getnnz(axis=1) > 0)[0]

        self.mat = sampled[data_idx, :]
        self.ids = [self.ids[i] for i in data_idx]
        self.txt = [self.txt[i] for i in data_idx]

@dataclass
class LabelInfo:
    """Label identifiers paired position-by-position with their text."""
    ids: list  # label identifiers
    txt: list  # label text, parallel to `ids`

    def sample(self, valid_idx:list):
        """Keep only the entries at positions `valid_idx`, in that order (in place)."""
        self.ids = list(map(self.ids.__getitem__, valid_idx))
        self.txt = list(map(self.txt.__getitem__, valid_idx))
    

In [57]:
#| export
def get_kaggle_dataset(fname, key='positive_ctxs'):
    """Load a Kaggle DPR JSON file and wrap the result in info containers.

    Args:
        fname: Path to the JSON file, forwarded to `load_kaggle_data`.
        key: Record field holding the passages to use as labels.

    Returns:
        `(qry_info, lbl_info)`: a `QueryInfo` holding the relevance matrix and
        the question text (used both as id and as text), and a `LabelInfo`
        holding the passage ids, ordered by column index, with their text.
    """
    questions, passages, id_to_col, relevance = load_kaggle_data(fname, key)

    # Passage ids arranged so that position i corresponds to column i.
    ordered_ids = sorted(id_to_col, key=id_to_col.get)

    # The questions carry no separate identifier, so the text doubles as the id.
    return QueryInfo(relevance, questions, questions), LabelInfo(ordered_ids, passages)
    

In [58]:
# NOTE: machine-specific location of the dataset; point `data_dir` at your local copy.
data_dir = '/home/scai/phd/aiz218323/scratch/datasets/nq/'
# Join with os.path so the trailing '/' of `data_dir` does not yield a doubled slash.
train_file = os.path.join(data_dir, 'nq-train.json')

In [46]:
# Parse the training dump into question text, passage text, the passage-id -> column map and the relevance matrix.
queries, labels, lbl_id2idx, matrix = load_kaggle_data(train_file, key='positive_ctxs')

  0%|          | 0/4 [00:00<?, ?it/s]

In [59]:
# Same load as above, but wrapped into the QueryInfo / LabelInfo containers.
qry_info, lbl_info = get_kaggle_dataset(train_file, key='positive_ctxs')

  0%|          | 0/4 [00:00<?, ?it/s]

In [75]:
# Merge repeated (row, column) entries by summing them; this modifies `matrix` in place.
matrix.sum_duplicates()

In [76]:
# Sort the column indices within each row; this modifies `matrix` in place.
matrix.sort_indices()

In [67]:
# Load the raw records for manual inspection.
# JSON text is UTF-8 and the passages contain non-ASCII characters, so do not
# rely on the platform's default encoding.
with open(train_file, encoding='utf-8') as file:
    train_data = json.load(file)

In [69]:
# Inspect the positive passages attached to the first training record.
train_data[0]['positive_ctxs']

[{'title': 'Big Little Lies (TV series)',
  'text': 'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
  'score': 1000,
  'title_score': 1,
  'passage_id': '18768923'},
 {'id': '18768923',
  'title': 'Big Little Lies (TV series)',
  'text': 'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio al

### Save data

In [63]:
#| export
def save_dataset(save_dir, lbl_info, tst_info=None, trn_info=None, suffix=''):
    """Write relevance matrices and raw id/text files under `save_dir`.

    Files produced (the suffix parts appear only when `suffix` is non-empty):
      - `<save_dir>/trn_X_Y[_<suffix>].npz` from `trn_info.mat`, if given;
      - `<save_dir>/tst_X_Y[_<suffix>].npz` from `tst_info.mat`, if given;
      - `<save_dir>/raw_data/train.raw.csv` from `trn_info`, if given;
      - `<save_dir>/raw_data/test.raw.csv` from `tst_info`, if given;
      - `<save_dir>/raw_data/label[.<suffix>].raw.csv` from `lbl_info` (always).

    Args:
        save_dir: Output directory; created if it does not exist.
        lbl_info: Object exposing `ids` and `txt` for the labels.
        tst_info: Optional object exposing `mat`, `ids` and `txt` for the test split.
        trn_info: Optional object exposing `mat`, `ids` and `txt` for the train split.
        suffix: Tag added to the matrix and label file names.
    """
    os.makedirs(save_dir, exist_ok=True)

    tagged = len(suffix) > 0
    x_suffix = f'_{suffix}' if tagged else ''
    y_suffix = f'.{suffix}' if tagged else ''

    has_trn = trn_info is not None
    has_tst = tst_info is not None

    # Sparse query-by-label matrices.
    if has_trn:
        sp.save_npz(f'{save_dir}/trn_X_Y{x_suffix}.npz', trn_info.mat)
    if has_tst:
        sp.save_npz(f'{save_dir}/tst_X_Y{x_suffix}.npz', tst_info.mat)

    # Raw id/text listings; `save_raw_file` is defined outside this block.
    raw_dir = f'{save_dir}/raw_data'
    os.makedirs(raw_dir, exist_ok=True)
    if has_trn:
        save_raw_file(f'{raw_dir}/train.raw.csv', trn_info.ids, trn_info.txt)
    if has_tst:
        save_raw_file(f'{raw_dir}/test.raw.csv', tst_info.ids, tst_info.txt)
    save_raw_file(f'{raw_dir}/label{y_suffix}.raw.csv', lbl_info.ids, lbl_info.txt)
    

In [64]:
#| export
def get_and_save_kaggle_dataset(fname:str, key='positive_ctxs', save_dir:str=None, suffix=''):
    """Build the query/label containers from a Kaggle DPR file and optionally save them.

    Args:
        fname: Path to the JSON file to load.
        key: Record field holding the passages to use as labels.
        save_dir: Output directory; when `None`, nothing is written.
        suffix: Tag forwarded to `save_dataset` for the output file names.

    Returns:
        The `(qry_info, lbl_info)` pair produced by `get_kaggle_dataset`.
    """
    queries, passages = get_kaggle_dataset(fname, key=key)

    if save_dir is None:
        return queries, passages

    # The loaded queries are stored as the training split.
    save_dataset(save_dir, passages, trn_info=queries, suffix=suffix)
    return queries, passages
    

In [70]:
#| export
def parse_args():
    """Parse the command-line options of the dataset builder.

    Options (all strings):
        --fname     required, path of the JSON file to load
        --key       record field to use as labels (default 'positive_ctxs')
        --save_dir  output directory (default None)
        --suffix    tag for the output file names (default '')

    Returns:
        The `argparse.Namespace` parsed from the process arguments.
    """
    parser = argparse.ArgumentParser()

    # (flag, extra keyword arguments); every option is a plain string.
    options = (
        ('--fname', {'required': True}),
        ('--key', {'default': 'positive_ctxs'}),
        ('--save_dir', {'default': None}),
        ('--suffix', {'default': ''}),
    )
    for flag, extra in options:
        parser.add_argument(flag, type=str, **extra)

    return parser.parse_args()
    

In [None]:
#| export
# Command-line entry point: parse the flags, build the dataset and, when
# --save_dir is given, write it to disk.
# NOTE(review): inside a notebook kernel `__name__` is also '__main__', so
# executing this cell interactively would make argparse read the kernel's own
# argv — presumably it is only meant to run from the exported script; confirm.
if __name__ == '__main__':
    args = parse_args()
    get_and_save_kaggle_dataset(args.fname, key=args.key, save_dir=args.save_dir, suffix=args.suffix)
                          

In [66]:
# NOTE: machine-specific absolute paths; adjust them before re-running elsewhere.
fname = '/home/scai/phd/aiz218323/scratch/datasets/nq/nq-train.json'
save_dir = '/home/scai/phd/aiz218323/scratch/datasets/nq/XC'

# Build the dataset from the training dump and write it under `save_dir`,
# tagging the matrix and label files with the 'kaggle' suffix.
trn_info, lbl_info = get_and_save_kaggle_dataset(fname, save_dir=save_dir, suffix='kaggle')

  0%|          | 0/4 [00:00<?, ?it/s]