# RADGA

In [2]:
#| default_exp 48-encoder-parallel-radga-with-cross-attention-loss-component-for-wikiseealso

In [3]:
# Enable IPython's autoreload extension so edited modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# nbdev helpers. nbdev_export() is called every time this cell runs
# (presumably writing the `#| export` cells to the module named by `#| default_exp` — confirm).
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [4]:
#| export
import os,sys,torch,pickle,torch.multiprocessing as mp, pickle
from xcai.basics import *
from xcai.models.radga import RAD001

In [5]:
# Put Weights & Biases in 'disabled' mode for interactive runs (this cell carries no `#| export`).
os.environ['WANDB_MODE'] = 'disabled'

In [6]:
#| export
# Process-level settings: which GPUs are visible and which Weights & Biases project to log to.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0,1,2,3,4,5',
    'WANDB_PROJECT': 'xc-nlg_48-encoder-parallel-radga-with-cross-attention-loss-component-for-wikiseealso',
})

## Data

In [7]:
#| export
# Root directory holding the datasets (absolute, machine-specific path).
data_dir = '/home/aiscuser/scratch/datasets'

In [7]:
# Build the data block from the 'data_metas' configuration found under `data_dir`.
# NOTE(review): the meaning of each `smp_features` tuple is defined by XCBlock and is not visible here.
block = XCBlock.from_cfg(
    data_dir,
    'data_metas',
    valid_pct=0.001,
    tfm='rm',
    tokenizer='distilbert-base-uncased',
    smp_features=[
        ('lbl2data|cat2lbl2data|hlk2lbl2data', 1, (1, 1, 3)),
        ('cat2data', 1, 1),
        ('hlk2data', 1, 3),
    ],
    n_data_meta_samples=50,
    n_lbl_meta_samples=50,
)

  self._set_arrayXarray(i, j, x)


In [10]:
# Register one more collator transform, constructed from the column names listed below.
# NOTE(review): append() mutates the collator in place, so re-running this cell adds the transform again.
block.collator.tfms.tfms.append(
    RemoveColumnTfm([
        'phlk2data_idx',
        'phlk2data_data2ptr',
        'phlk2lbl_idx',
        'phlk2lbl_lbl2data2ptr',
        'phlk2lbl_data2ptr',
        'hlk2lbl_data2ptr',
    ])
)

In [8]:
#| export
# Directory that holds the pickled, pre-processed data block (note the trailing '/').
pkl_dir = f'{data_dir}/processed/'

In [12]:
# Cache the prepared block on disk; the exported cell below loads this same file.
# os.path.join avoids the doubled '/' that f'{pkl_dir}/...' produces, since pkl_dir already ends with '/'.
with open(os.path.join(pkl_dir, 'wikiseealso_data-metas_distilbert-base-uncased_rm_radga-final.pkl'), 'wb') as file:
    pickle.dump(block, file)

In [9]:
#| export
# Load the cached data block written by the (non-exported) dump cell.
# os.path.join avoids the doubled '/' that f'{pkl_dir}/...' produces, since pkl_dir already ends with '/'.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only load pickles you created yourself.
with open(os.path.join(pkl_dir, 'wikiseealso_data-metas_distilbert-base-uncased_rm_radga-final.pkl'), 'rb') as file:
    block = pickle.load(file)

## Training

In [10]:
#| export
# Training configuration. Keyword arguments are grouped by the subject suggested by their
# names; the values are exactly those of the original flat list.
args = XCLearningArguments(
    # output location, checkpointing and evaluation cadence
    output_dir='/home/aiscuser/outputs/48-encoder-parallel-radga-with-cross-attention-loss-component-for-wikiseealso-1-0',
    logging_first_step=True,
    save_strategy='steps',
    save_steps=1000,
    save_total_limit=5,
    evaluation_strategy='steps',
    eval_steps=1000,
    metric_for_best_model='P@1_REPR',

    # optimisation
    num_train_epochs=300,
    per_device_train_batch_size=200,
    per_device_eval_batch_size=100,
    learning_rate=2e-4,
    warmup_steps=100,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    fp16=True,

    # representation_* settings
    predict_with_representation=True,
    representation_search_type='INDEX',
    representation_num_beams=200,
    representation_accumulation_steps=1,

    # generation_* settings
    predict_with_generation=True,
    generation_num_beams=10,
    generation_length_penalty=1.5,
    output_concatenation_weight=1.0,

    # cluster-related settings
    group_by_cluster=True,
    clustering_type='EXPO',
    num_clustering_warmup_epochs=10,
    num_cluster_update_epochs=5,
    num_cluster_size_update_epochs=10,
    minimum_cluster_size=1,
    maximum_cluster_size=300,

    # encoder parallelism and the batch keys the learner reads
    use_encoder_parallel=True,
    target_indices_key='plbl2data_idx',
    target_pointer_key='plbl2data_data2ptr',
    label_names=[
        'cat2data_idx', 'cat2data_input_ids', 'cat2data_attention_mask',
        'cat2lbl2data_idx', 'cat2lbl2data_input_ids', 'cat2lbl2data_attention_mask',
        'hlk2data_idx', 'hlk2data_input_ids', 'hlk2data_attention_mask',
        'hlk2lbl2data_idx', 'hlk2lbl2data_input_ids', 'hlk2lbl2data_attention_mask',
    ],
)



In [11]:
#| export
# Evaluate during training on a seeded sample (n=2000) of the test split.
test_dset = block.test.dset.sample(n=2000, seed=50)

# Metric object handed to the learner as `compute_metrics`.
metric = PrecRecl(
    block.n_lbl,
    test_dset.data.data_lbl_filterer,
    prop=block.train.dset.data.data_lbl,
    pk=10,
    rk=200,
    rep_pk=[1, 3, 5, 10],
    rep_rk=[10, 100, 200],
)

In [12]:
#| export
# Global batch size: the larger per-device batch size times the number of visible GPUs.
# torch.cuda.device_count() is 0 on a CPU-only machine, so clamp it to 1 to avoid
# handing batch_size=0 to the model.
bsz = max(args.per_device_train_batch_size, args.per_device_eval_batch_size)*max(1, torch.cuda.device_count())

# Instantiate RAD001 starting from the 'distilbert-base-uncased' checkpoint.
model = RAD001.from_pretrained(
    'distilbert-base-uncased',
    num_batch_labels=5000,
    ignore_token=0,
    batch_size=bsz,
    margin=0.3,
    num_negatives=5,
    tau=0.1,
    apply_softmax=True,
    data_aug_meta_prefix='hlk2data',
    lbl2data_aug_meta_prefix='hlk2lbl',
    resize_length=5000,
    gen_loss_weight=0.001,
    meta_loss_weight=0.3,
    pred_meta_prefix='cat',
    fusion_loss_weight=0.1,
    tie_word_embeddings=False,
    use_fusion_loss=True,
    use_noise=True,
    use_encoder_parallel=True,
)

# Initialise both heads: retrieval first, then generation.
model.init_retrieval_head()
model.init_generation_head()

Some weights of RAD001 were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.cross_head.k.bias', 'encoder.cross_head.k.weight', 'encoder.cross_head.layer_norm.bias', 'encoder.cross_head.layer_norm.weight', 'encoder.cross_head.o.bias', 'encoder.cross_head.o.weight', 'encoder.cross_head.q.bias', 'encoder.cross_head.q.weight', 'encoder.cross_head.v.bias', 'encoder.cross_head.v.weight', 'encoder.dr_head.layer_norm.bias', 'encoder.dr_head.layer_norm.weight', 'encoder.dr_head.projector.bias', 'encoder.dr_head.projector.weight', 'encoder.dr_head.transform.bias', 'encoder.dr_head.transform.weight', 'encoder.gen_head.projector.weight', 'encoder.meta_head.layer_norm.bias', 'encoder.meta_head.layer_norm.weight', 'encoder.meta_head.projector.bias', 'encoder.meta_head.projector.weight', 'encoder.meta_head.transform.bias', 'encoder.meta_head.transform.weight', 'vocab_projector.weight']
You should probably TRAIN this model on a down-stream task

In [13]:
#| export
# Build the trie from the data block; it is passed to the learner below.
trie = XCTrie.from_block(block)

  0%|          | 0/312330 [00:00<?, ?it/s]

In [14]:
#| export
# Assemble the learner: model, arguments and trie, training on the train split and
# evaluating on the sampled test subset with `metric`.
learn = XCLearner(
    model=model, args=args, trie=trie,
    train_dataset=block.train.dset, eval_dataset=test_dset,
    data_collator=block.collator, compute_metrics=metric,
)

In [None]:
# Interactive training run (this cell carries no `#| export`).
# NOTE(review): the exported cell further down also calls learn.train() under a `__main__`
# guard, so a full top-to-bottom run of the notebook may train twice — confirm intent.
learn.train()

[2024-06-05 04:29:57,244] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)




  0%|          | 0/1154 [00:00<?, ?it/s]

node-0:988779:988779 [0] NCCL INFO Bootstrap : Using eth0:10.13.60.215<0>
node-0:988779:988779 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
node-0:988779:988779 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
node-0:988779:988779 [0] NCCL INFO Kernel version: 5.15.0-1042-azure
RCCL version 2.17.1+hip5.7 HEAD:cbbb3d8+

node-0:988779:992205 [0] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/misc/ibvwrap.cc:222 NCCL WARN Call to ibv_open_device failed

node-0:988779:992205 [0] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/transport/net_ib.cc:199 NCCL WARN NET/IB : Unable to open device mlx5_0

node-0:988779:992205 [0] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/misc/ibvwrap.cc:222 NCCL WARN Call to ibv_open_device failed

node-0:988779:992

node-0:988779:992211 [4] NCCL INFO Ring 3 : 5 -> 4 -> 0 comm 0x1aee07f0 nRanks 06 busId 500000
node-0:988779:992211 [4] NCCL INFO Ring 4 : 0 -> 4 -> 5 comm 0x1aee07f0 nRanks 06 busId 500000
node-0:988779:992211 [4] NCCL INFO Ring 5 : 5 -> 4 -> 0 comm 0x1aee07f0 nRanks 06 busId 500000
node-0:988779:992212 [5] NCCL INFO Ring 7 : 1 -> 5 -> 4 comm 0x18593f40 nRanks 06 busId 600000
node-0:988779:992211 [4] NCCL INFO Ring 6 : 0 -> 4 -> 5 comm 0x1aee07f0 nRanks 06 busId 500000
node-0:988779:992212 [5] NCCL INFO Trees [0] 1/-1/-1->5->4 [1] 4/-1/-1->5->1 [2] 1/-1/-1->5->4 [3] 4/-1/-1->5->1 [4] 1/-1/-1->5->4 [5] 4/-1/-1->5->1 [6] 1/-1/-1->5->4 [7] 4/-1/-1->5->1 comm 0x18593f40 nRanks 06 busId 600000
node-0:988779:992212 [5] NCCL INFO P2P Chunksize set to 524288
node-0:988779:992211 [4] NCCL INFO Ring 7 : 5 -> 4 -> 0 comm 0x1aee07f0 nRanks 06 busId 500000
node-0:988779:992211 [4] NCCL INFO Trees [0] 5/-1/-1->4->0 [1] -1/-1/-1->4->5 [2] 5/-1/-1->4->0 [3] -1/-1/-1->4->5 [4] 5/-1/-1->4->0 [5] -1/-1/

In [None]:
#| export
# Entry point of the exported cell: call multiprocessing's freeze_support(), then start training.
if __name__ == '__main__':
    mp.freeze_support()
    learn.train()
    

## Evaluate

In [9]:
# Evaluation-time configuration. Keyword arguments are grouped by the subject suggested by
# their names; the values are exactly those of the original flat list.
args = XCLearningArguments(
    # output location and step cadence
    output_dir='/home/aiscuser/outputs/48-encoder-parallel-radga-with-cross-attention-loss-component-for-wikiseealso-1-0',
    eval_steps=1000,
    save_steps=1000,
    metric_for_best_model='P@1_REPR',

    # batch sizes and precision
    per_device_train_batch_size=200,
    per_device_eval_batch_size=400,
    fp16=True,

    # representation_* settings
    predict_with_representation=True,
    representation_search_type='BRUTEFORCE',
    representation_num_beams=200,
    representation_accumulation_steps=1,

    # generation_* settings
    predict_with_generation=True,
    generation_num_beams=10,
    generation_length_penalty=1.5,
    output_concatenation_weight=1.0,

    # batch keys the learner reads
    target_indices_key='plbl2data_idx',
    target_pointer_key='plbl2data_data2ptr',
    label_names=[
        'cat2data_idx', 'cat2data_input_ids', 'cat2data_attention_mask',
        'cat2lbl2data_idx', 'cat2lbl2data_input_ids', 'cat2lbl2data_attention_mask',
        'hlk2data_idx', 'hlk2data_input_ids', 'hlk2data_attention_mask',
        'hlk2lbl2data_idx', 'hlk2lbl2data_input_ids', 'hlk2lbl2data_attention_mask',
    ],
)

In [10]:
# Metric object for the final evaluation, using the filterer exposed by `block.test`.
metric = PrecRecl(
    block.n_lbl,
    block.test.data_lbl_filterer,
    prop=block.train.dset.data.data_lbl,
    pk=10,
    rk=200,
    rep_pk=[1, 3, 5, 10],
    rep_rk=[10, 100, 200],
)

In [11]:
# Locate the best checkpoint of this run under the shared outputs directory.
# os.path.join replaces f-string concatenation: `output_dir` already ends with '/', so the
# old f"{output_dir}/..." form produced paths containing '//'.
output_dir = "/home/aiscuser/scratch/Projects/xc_nlg/outputs/"
mdir = os.path.join(output_dir, os.path.basename(args.output_dir))

# Only the checkpoint's directory name is taken from get_best_model(); it is re-rooted under `mdir`.
mname = os.path.join(mdir, os.path.basename(get_best_model(mdir)))

In [12]:
# Global batch size: the larger per-device batch size times the number of visible GPUs.
# torch.cuda.device_count() is 0 on a CPU-only machine, so clamp it to 1 to avoid
# handing batch_size=0 to the model.
bsz = max(args.per_device_train_batch_size, args.per_device_eval_batch_size)*max(1, torch.cuda.device_count())

# Rebuild RAD001 from the best checkpoint directory, then enable encoder parallelism.
# The checkpoint tensors are loaded explicitly from the safetensors file in the following cell.
model = RAD001.from_pretrained(
    mname,
    num_batch_labels=10_000,
    ignore_token=0,
    batch_size=bsz,
    margin=0.3,
    num_negatives=5,
    tau=0.1,
    apply_softmax=True,
    data_aug_meta_prefix='hlk2data',
    lbl2data_aug_meta_prefix='hlk2lbl',
    resize_length=10_000,
    gen_loss_weight=0.001,
    meta_loss_weight=0.3,
    pred_meta_prefix='cat',
    fusion_loss_weight=0.1,
    tie_word_embeddings=False,
)

model.activate_encoder_parallel()

Some weights of the model checkpoint at /home/aiscuser/scratch/Projects/xc_nlg/outputs//48-encoder-parallel-radga-with-cross-attention-loss-component-for-wikiseealso-1-0/checkpoint-2000 were not used when initializing RAD001: ['encoder.module.cross_head.k.bias', 'encoder.module.cross_head.k.weight', 'encoder.module.cross_head.layer_norm.bias', 'encoder.module.cross_head.layer_norm.weight', 'encoder.module.cross_head.o.bias', 'encoder.module.cross_head.o.weight', 'encoder.module.cross_head.q.bias', 'encoder.module.cross_head.q.weight', 'encoder.module.cross_head.v.bias', 'encoder.module.cross_head.v.weight', 'encoder.module.dr_head.layer_norm.bias', 'encoder.module.dr_head.layer_norm.weight', 'encoder.module.dr_head.projector.bias', 'encoder.module.dr_head.projector.weight', 'encoder.module.dr_head.transform.bias', 'encoder.module.dr_head.transform.weight', 'encoder.module.meta_head.layer_norm.bias', 'encoder.module.meta_head.layer_norm.weight', 'encoder.module.meta_head.projector.bias'

In [13]:
from safetensors import safe_open

# Read every tensor stored in the checkpoint's safetensors file into a plain dict.
with safe_open(f"{mname}/model.safetensors", framework="pt") as file:
    model_state_dict = {k: file.get_tensor(k) for k in file.keys()}

# strict=False tolerates key mismatches; the returned report (shown as the cell output)
# lists the missing/unexpected keys and should be inspected.
model.load_state_dict(model_state_dict, strict=False)

_IncompatibleKeys(missing_keys=['encoder.module.gen_head.transform.weight', 'encoder.module.gen_head.transform.bias', 'encoder.module.gen_head.layer_norm.weight', 'encoder.module.gen_head.layer_norm.bias', 'encoder.module.gen_head.projector.weight', 'encoder.module.gen_head.projector.bias', 'encoder.module.distilbert.embeddings.word_embeddings.weight', 'encoder.module.distilbert.embeddings.position_embeddings.weight', 'encoder.module.distilbert.embeddings.LayerNorm.weight', 'encoder.module.distilbert.embeddings.LayerNorm.bias', 'encoder.module.distilbert.transformer.layer.0.attention.q_lin.weight', 'encoder.module.distilbert.transformer.layer.0.attention.q_lin.bias', 'encoder.module.distilbert.transformer.layer.0.attention.k_lin.weight', 'encoder.module.distilbert.transformer.layer.0.attention.k_lin.bias', 'encoder.module.distilbert.transformer.layer.0.attention.v_lin.weight', 'encoder.module.distilbert.transformer.layer.0.attention.v_lin.bias', 'encoder.module.distilbert.transformer.l

In [14]:
# Build the trie from the data block; it is passed to the learner below.
trie = XCTrie.from_block(block)

  0%|          | 0/312330 [00:00<?, ?it/s]

In [15]:
# Assemble the evaluation learner: reloaded model, evaluation arguments and trie,
# with the complete test split as the evaluation dataset.
learn = XCLearner(
    model=model, args=args, trie=trie,
    train_dataset=block.train.dset, eval_dataset=block.test.dset,
    data_collator=block.collator, compute_metrics=metric,
)

In [17]:
# Switch off the model's `use_generation` flag before predicting
# (presumably so only representation-based predictions are scored — confirm).
model.use_generation = False

In [18]:
# Run prediction over the full test split; the result's `.metrics` are displayed below.
o = learn.predict(block.test.dset)

[2024-06-04 18:02:29,076] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  0%|          | 0/391 [00:00<?, ?it/s]

node-0:140492:140492 [0] NCCL INFO Bootstrap : Using eth0:10.13.51.163<0>
node-0:140492:140492 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
node-0:140492:140492 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
node-0:140492:140492 [0] NCCL INFO Kernel version: 5.15.0-1042-azure
RCCL version 2.17.1+hip5.7 HEAD:cbbb3d8+

node-0:140492:142160 [1] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/misc/ibvwrap.cc:222 NCCL WARN Call to ibv_open_device failed

node-0:140492:142160 [1] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/transport/net_ib.cc:199 NCCL WARN NET/IB : Unable to open device mlx5_0

node-0:140492:142160 [1] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/misc/ibvwrap.cc:222 NCCL WARN Call to ibv_open_device failed

node-0:140492:142

node-0:140492:142159 [0] NCCL INFO Ring 12 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 13 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 14 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 15 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 16 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 17 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 18 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 19 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 20 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 21 : 1 -> 0 -> 1 comm 0xebd8080 nRanks 02 busId 700000
node-0:140492:142159 [0] NCCL INFO Ring 22 : 1 -> 

  return torch.sparse_csr_tensor(data_ptr, data_idx, scores, device=data_ptr.device)


  self._set_arrayXarray(i, j, x)


In [20]:
# Display the metrics dictionary returned by the prediction run above.
o.metrics

{'test_loss': 0.15632189810276031,
 'test_P@1': 0.22487113765033942,
 'test_P@10': 0.07467932287420534,
 'test_P@3': 0.149833347416588,
 'test_P@5': 0.11438131988856777,
 'test_N@1': 0.22487114369869232,
 'test_N@10': 0.2578989565372467,
 'test_N@3': 0.2282901555299759,
 'test_N@5': 0.23904120922088623,
 'test_PSP@1': 0.19654354095747512,
 'test_PSP@10': 0.25414336451319947,
 'test_PSP@3': 0.20332590644527287,
 'test_PSP@5': 0.21838125210154405,
 'test_PSN@1': 0.19654352962970734,
 'test_PSN@10': 0.24589276313781738,
 'test_PSN@3': 0.21225321292877197,
 'test_PSN@5': 0.2263062298297882,
 'test_R@200': 0.5365164786256474,
 'test_R@10': 0.31042257210201535,
 'test_R@100': 0.4869897210391535,
 'test_runtime': 432.605,
 'test_samples_per_second': 410.34,
 'test_steps_per_second': 0.513}