In [14]:
#| default_exp 20_msmarco-hard-negatives

In [15]:
%reload_ext autoreload
%autoreload 2

In [35]:
#| export
import pickle, scipy.sparse as sp, numpy as np, argparse, os
from tqdm.auto import tqdm
from typing import Optional, List

from xcai.main import *
from sugar.core import *

## Setup

In [23]:
#| export
def load_msmarco_hard_negatives(fname:str, data_ids:Optional[List]=None):
    """Load pickled hard-negative scores and pack them into a sparse matrix.

    The pickle at `fname` must contain a mapping of the form
    `{data_id: {label_id: score}}`.

    Args:
        fname: Path of the pickle file to read.
        data_ids: Ids that define the row order of the result. Ids that are
            missing from the pickle produce an empty row. When `None`, every
            key of the pickle is used, in the pickle's iteration order.

    Returns:
        A tuple `(data_ids, lbl_ids, matrix)` where `data_ids` is the row
        order actually used, `lbl_ids` lists the label ids in column order
        (order of first appearance), and `matrix` is a float32
        `scipy.sparse.csr_matrix` of shape `(n_rows, len(lbl_ids))` whose
        stored values are the scores.
    """
    # NOTE(security): `pickle.load` can execute arbitrary code while
    # deserialising; only point this at files from a trusted source.
    with open(fname, 'rb') as file:
        negatives = pickle.load(file)

    data_ids = list(negatives) if data_ids is None else data_ids

    lbl_id2idx = dict()
    data, indices, indptr = [], [], [0]
    for idx in tqdm(data_ids):
        if idx in negatives:
            for lbl_id, score in negatives[idx].items():
                # Column index = order in which the label id was first seen.
                indices.append(lbl_id2idx.setdefault(lbl_id, len(lbl_id2idx)))
                data.append(score)
        # One `indptr` entry per requested id, so unmatched ids give empty rows.
        indptr.append(len(data))

    lbl_ids = sorted(lbl_id2idx, key=lbl_id2idx.get)

    # Pass the shape explicitly: without it SciPy infers the column count from
    # `indices.max()`, which raises when no id matched (or `data_ids` is empty).
    shape = (len(indptr) - 1, len(lbl_id2idx))
    return data_ids, lbl_ids, sp.csr_matrix((data, indices, indptr), shape=shape, dtype=np.float32)
    

In [24]:
# Root directory under which the block file (`pkl_file` below) is located.
pkl_dir = '/scratch/scai/phd/aiz218323/datasets/processed/'

# Dataset configuration file and the key handed to `build_block` with it.
config_file = '/scratch/scai/phd/aiz218323/datasets/msmarco/XC/configs/entity_gpt_exact.json'
config_key = 'data_entity-gpt_exact'

use_sxc_sampler = True

# NOTE: `pkl_dir` already ends with '/', so this path contains a double slash.
pkl_file = f'{pkl_dir}/mogicX/msmarco_data-meta_distilbert-base-uncased_sxc.joblib'

# Make sure the parent directory of `pkl_file` exists before calling `build_block`.
os.makedirs(os.path.dirname(pkl_file), exist_ok=True)
# NOTE(review): with do_build=False this presumably loads an existing block from
# `pkl_file` rather than rebuilding it — confirm against `build_block`.
block = build_block(pkl_file, config_file, use_sxc_sampler, config_key, do_build=False, only_test=False)

In [8]:
# Directory of the hard-negative files; the cells below also write their outputs here.
data_dir = "/home/scai/phd/aiz218323/scratch/datasets/msmarco/negatives"
# Pickle that is passed to `load_msmarco_hard_negatives` below.
fname = f"{data_dir}/cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl"

In [25]:
# Cast the train/test identifiers of the block to int.
# NOTE(review): presumably needed because the keys of the negatives pickle are ints — confirm.
trn_ids = list(map(int, block.train.dset.data.data_info['identifier']))
tst_ids = list(map(int, block.test.dset.data.data_info['identifier']))

In [27]:
# Rows of `data_neg` follow `trn_ids`; its columns follow `neg_ids`.
data_ids, neg_ids, data_neg = load_msmarco_hard_negatives(fname, trn_ids)
# Matrix of shape (block.n_lbl, number of negatives) with no stored entries.
lbl_neg = sp.csr_matrix((block.n_lbl, data_neg.shape[1]), dtype=np.float32)

  0%|          | 0/502939 [00:00<?, ?it/s]

In [28]:
# Persist both sparse matrices as .npz files inside `data_dir`.
sp.save_npz(f'{data_dir}/negatives_trn_X_Y.npz', data_neg)
sp.save_npz(f'{data_dir}/negatives_lbl_X_Y_exact.npz', lbl_neg)

In [29]:
# Use a dedicated name for the label file (as the driver does) instead of
# reassigning `fname`: `fname` holds the negatives pickle path, and overwriting
# it would break a re-run of the cell that loads the hard negatives.
lbl_file = '/home/scai/phd/aiz218323/scratch/datasets/msmarco/XC/raw_data/label.raw.txt'
lbl_ids, lbl_txt = load_raw_file(lbl_file)
# Lookup from label id to label text.
lbl_map = dict(zip(lbl_ids, lbl_txt))

In [30]:
# Text of every negative, in `neg_ids` order; `lbl_map` is looked up with the id as a string.
neg_txt = [lbl_map[str(neg_id)] for neg_id in neg_ids]

In [31]:
# Write the negative ids and their texts to a raw text file.
# NOTE(review): nothing in this notebook creates `{data_dir}/raw_data` — confirm it
# exists or that `save_raw_file` creates it.
save_raw_file(f'{data_dir}/raw_data/negatives.raw.txt', neg_ids, neg_txt)

## Driver

In [34]:
#| export
def parse_args():
    """Parse the command-line options of the driver.

    Both options are required strings:
        --pkl_dir:  directory under which the driver looks for the
                    `mogicX/...joblib` block file.
        --data_dir: directory under which the driver looks for the `XC/` and
                    `negatives/` sub-directories.

    Returns:
        The `argparse.Namespace` with attributes `pkl_dir` and `data_dir`.
    """
    parser = argparse.ArgumentParser()
    for flag in ('--pkl_dir', '--data_dir'):
        parser.add_argument(flag, type=str, required=True)
    return parser.parse_args()
    

In [None]:
#| export
if __name__ == '__main__':
    # Script entry point: rebuild the hard-negative matrices and the raw text
    # file of the negatives under `<data_dir>/XC`.
    args = parse_args()

    # --- block configuration -------------------------------------------------
    use_sxc_sampler = True
    config_key = 'data_entity-gpt_exact'
    config_file = f'{args.data_dir}/XC/configs/entity_gpt_exact.json'

    pkl_file = f'{args.pkl_dir}/mogicX/msmarco_data-meta_distilbert-base-uncased_sxc.joblib'
    os.makedirs(os.path.dirname(pkl_file), exist_ok=True)
    block = build_block(pkl_file, config_file, use_sxc_sampler, config_key, do_build=False, only_test=False)

    # --- hard negatives of the training points -------------------------------
    neg_file = f"{args.data_dir}/negatives/cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl"
    trn_ids = list(map(int, block.train.dset.data.data_info['identifier']))
    data_ids, neg_ids, data_neg = load_msmarco_hard_negatives(neg_file, trn_ids)

    # Matrix with no stored entries: one row per label, one column per negative.
    n_neg = data_neg.shape[1]
    lbl_neg = sp.csr_matrix((block.n_lbl, n_neg), dtype=np.float32)

    sp.save_npz(f'{args.data_dir}/XC/negatives_trn_X_Y.npz', data_neg)
    sp.save_npz(f'{args.data_dir}/XC/negatives_lbl_X_Y_exact.npz', lbl_neg)

    # --- raw text of the negatives -------------------------------------------
    lbl_file = f'{args.data_dir}/XC/raw_data/label.raw.txt'
    lbl_ids, lbl_txt = load_raw_file(lbl_file)
    lbl_map = dict(zip(lbl_ids, lbl_txt))

    # `lbl_map` is looked up with the negative id as a string.
    neg_txt = [lbl_map[str(neg_id)] for neg_id in neg_ids]
    save_raw_file(f'{args.data_dir}/XC/raw_data/negatives.raw.txt', neg_ids, neg_txt)
    