In [3]:
#| default_exp 22_amazon-gpt-category-overlap-with-test-data

In [4]:
%reload_ext autoreload
%autoreload 2

In [23]:
import torch.nn.functional as F, math, scipy.sparse as sp, os, numpy as np
from torch.utils.data import DataLoader, Dataset
from xclib.utils.sparse import retain_topk
from tqdm.auto import tqdm

In [12]:
from xcai.main import *
from xcai.analysis import *
from xcai.data import XCDataset
from xcai.analysis import *
from xcai.models.modeling_utils import Pooling

In [7]:
from transformers import AutoModel, AutoTokenizer, BatchEncoding

In [8]:
from sugar.core import *

## Inference

In [8]:
test_info_file = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/(mapped)LF-AmazonTitles-1.3M//raw_data/test.raw.txt'
meta_info_file = '/home/scai/phd/aiz218323/scratch/datasets/benchmarks/LF-AmazonTitles-1.3M_gpt-conflations/raw_data/category_gpt_conflated-1.raw.csv'

In [9]:
test_info = load_raw_file(test_info_file)[1]
meta_info = load_raw_file(meta_info_file)[1]

In [10]:
model = AutoModel.from_pretrained('distilbert-base-uncased')
tokz = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [11]:
test_tokens = tokz(test_info, return_tensors='pt', padding=True, truncation=True)
meta_tokens = tokz(meta_info, return_tensors='pt', padding=True, truncation=True)

In [12]:
class DataloaderLite(Dataset):

    def __init__(self, data, batch_size):
        self.data = data
        self.num_data = len(data['input_ids'])
        self.current_position = 0
        self.batch_size = batch_size

    def next_data(self):
        batch = BatchEncoding({k:v[self.current_position:self.current_position+self.batch_size] for k,v in self.data.items()})
        self.current_position += self.batch_size
        
        if self.current_position > self.num_data:
            self.current_position = 0

        return batch

    def __len__(self):
        return math.ceil(self.num_data/self.batch_size)
        

In [13]:
model = model.to('cuda')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
test_dataloader = DataloaderLite(test_tokens, batch_size=100)
meta_dataloader = DataloaderLite(meta_tokens, batch_size=100)

In [15]:
test_repr = []
for i in tqdm(range(len(test_dataloader))):
    batch = test_dataloader.next_data()
    batch = batch.to(model.device)
    output = model(**batch)
    repr = F.normalize(Pooling.mean_pooling(output[0], batch['attention_mask']), dim=1)
    test_repr.append(repr.cpu())
    

  0%|                                                                                                                                          | 2/9703 [00:02<3:59:34,  1.48s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 150.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 122.19 MiB is free. Process 24832 has 708.00 MiB memory in use. Including non-PyTorch memory, this process has 30.92 GiB memory in use. Of the allocated memory 30.48 GiB is allocated by PyTorch, and 79.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Predictions

In [9]:
pkl_dir = '/scratch/scai/phd/aiz218323/datasets/processed/'
pkl_file = f'{pkl_dir}/mogicX/amazontitles_data-gpt-category-conflated-1_distilbert-base-uncased_sxc.joblib'

config_file = '../configs/16_ngame-linker-for-amazontitles-003_gpt-category-conflated-1.json'
config_key = 'data_category'

block = build_block(pkl_file, config_file, use_sxc=True, config_key=config_key)
linker_block = block.linker_dset('cat_meta', remove_empty=False)

In [15]:
pred_dir = '/home/scai/phd/aiz218323/scratch/outputs/mogicX/16_ngame-linker-for-amazontitles-004/predictions/'
pred_lbl = sp.load_npz(f'{pred_dir}/test_predictions_zs.npz')

pred_block = get_pred_dset(retain_topk(pred_lbl, k=10), linker_block.test.dset)

In [17]:
dset = TextDataset(pred_block)

In [22]:
save_dir = '/home/scai/phd/aiz218323/scratch/outputs/mogicX/16_ngame-linker-for-amazontitles-004/examples'
os.makedirs(save_dir, exist_ok=True)

In [31]:
idxs = np.random.permutation(pred_lbl.shape[0])[:1000]
dset.dump(f'{save_dir}/zero_shot.txt', idxs)

In [30]:
dset.show()

[5m[7m[36mdata_input_text[0m [36m: New York City Transit Buses 1945-1975 Photo Archive[0m
[5m[7m[91mlbl2data_input_text[0m [91m: ['Urban Transportation History Book', 'New York City History Book', 'Public Transportation Book', 'Transportation Memorabilia', 'Vintage Transportation Documentary', 'Public Transportation', 'Urban Transit Merchandise', 'Toy Bus', 'Rail Transit Books', 'Urban Transport Book'][0m

[5m[7m[36mdata_input_text[0m [36m: AC/DC - Family Jewels[0m
[5m[7m[91mlbl2data_input_text[0m [91m: ['Family & Genealogy', 'Family Biography', 'Gifts for Family Members', 'Family Movie Collection', 'Family DVD', "Children's Gifts & Decorations", 'Family Clothing', 'DC Motor', 'Family Photo & Albums', 'DC Collectibles'][0m

[5m[7m[36mdata_input_text[0m [36m: NBA New Orleans Hornets Wool Blend Adjustable Snapback Hat, One Size,  Blue[0m
[5m[7m[91mlbl2data_input_text[0m [91m: ['Bowling Shoes', 'Wide Brim Hat', "Men's Basketball Shoes", 'Rain Boots', 'Ja