# RADGA: encoder-parallel training on WikiSeeAlso

In [1]:
#| default_exp 47-encoder-parallel-radga-for-wikiseealso

In [2]:
%load_ext autoreload
%autoreload 2

from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [3]:
#| export
import os,sys,torch,pickle,torch.multiprocessing as mp, pickle
from xcai.basics import *
from xcai.models.PPP0XX import DBT020

In [4]:
# Notebook-only cell (it carries no `#| export` directive): switch Weights & Biases
# logging off for interactive runs.
os.environ['WANDB_MODE'] = 'disabled'

In [5]:
#| export
# Process-wide environment for the run:
#   * CUDA_VISIBLE_DEVICES restricts this process to four specific GPUs
#     (machine-specific ids; it has to be in place before CUDA is initialised).
#   * WANDB_PROJECT names the Weights & Biases project runs are logged under.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '12,13,14,15',
    'WANDB_PROJECT': 'xc-nlg_38-radga-training-pipeline',
})

## Data

In [6]:
#| export
# Root directory of the datasets. This is a machine-specific absolute path and
# has to be adjusted when the notebook is run elsewhere.
data_dir = '/home/aiscuser/scratch/datasets'

In [7]:
# Build the data block from `data_dir` using the 'data_metas' configuration name,
# tokenised with distilbert-base-uncased.
# NOTE(review): the meaning of each `smp_features` triple is defined inside xcai
# and is not visible from this notebook - confirm before changing the numbers.
block = XCBlock.from_cfg(
    data_dir,
    'data_metas',
    valid_pct=0.001,
    tfm='rm',
    tokenizer='distilbert-base-uncased',
    smp_features=[
        ('lbl2data|cat2lbl2data|hlk2lbl2data', 1, 1),
        ('cat2data', 1, 1),
        ('hlk2data', 1, 3),
    ],
    n_data_meta_samples=50,
    n_lbl_meta_samples=50,
)

  self._set_arrayXarray(i, j, x)


In [10]:
# Append a RemoveColumnTfm for the listed columns to the collator's transform list
# (presumably so these columns are dropped from collated batches - confirm in xcai).
# NOTE: this mutates `block` in place; re-running the cell appends the transform again.
block.collator.tfms.tfms.append(
    RemoveColumnTfm([
        'phlk2data_idx',
        'phlk2data_data2ptr',
        'phlk2lbl_idx',
        'phlk2lbl_lbl2data2ptr',
        'phlk2lbl_data2ptr',
        'hlk2lbl_data2ptr',
    ])
)

In [7]:
#| export
# Directory the pickled data block is written to and read back from.
# Note the value already ends with a path separator.
pkl_dir = f'{data_dir}/processed/'

In [12]:
# Cache the prepared block (including the collator change made above) so later
# runs can skip the expensive XCBlock construction.
# Create the cache directory first: on a fresh machine it may not exist yet and
# open(..., 'wb') would fail with FileNotFoundError.
os.makedirs(pkl_dir, exist_ok=True)

# os.path.join yields a clean path whether or not `pkl_dir` ends with a separator.
with open(
    os.path.join(pkl_dir, 'wikiseealso_data-metas_distilbert-base-uncased_rm_radga-encoder-parallel.pkl'),
    'wb',
) as file:
    pickle.dump(block, file)

In [8]:
#| export
# Load the cached data block instead of rebuilding it from the raw dataset.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load a cache that this pipeline wrote itself; never point this at an
# untrusted file.
# os.path.join yields a clean path whether or not `pkl_dir` ends with a separator.
with open(
    os.path.join(pkl_dir, 'wikiseealso_data-metas_distilbert-base-uncased_rm_radga-encoder-parallel.pkl'),
    'rb',
) as file:
    block = pickle.load(file)

In [9]:
#| export
# Training configuration. Keyword arguments are grouped by concern; the values
# are exactly those of the original run.
args = XCLearningArguments(
    # --- output, logging and checkpointing ---
    output_dir='/home/aiscuser/outputs/47-encoder-parallel-radga-for-wikiseealso-1-0',
    logging_first_step=True,
    save_strategy='steps',
    save_steps=1000,
    save_total_limit=5,

    # --- evaluation schedule and model selection ---
    evaluation_strategy='steps',
    eval_steps=1000,
    metric_for_best_model='P@1_REPR',

    # --- batch sizes and precision ---
    per_device_train_batch_size=600,
    per_device_eval_batch_size=100,
    fp16=True,

    # --- optimisation ---
    num_train_epochs=300,
    learning_rate=2e-4,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    warmup_steps=100,

    # --- prediction (representation- and generation-based) ---
    predict_with_representation=True,
    representation_num_beams=200,
    representation_accumulation_steps=1,
    representation_search_type='INDEX',
    predict_with_generation=True,
    generation_num_beams=10,
    generation_length_penalty=1.5,
    output_concatenation_weight=1.0,

    # --- cluster-grouped batching ---
    group_by_cluster=True,
    clustering_type='EXPO',
    num_clustering_warmup_epochs=10,
    num_cluster_update_epochs=5,
    num_cluster_size_update_epochs=10,
    minimum_cluster_size=1,
    maximum_cluster_size=300,

    # --- target keys and extra label columns ---
    target_indices_key='plbl2data_idx',
    target_pointer_key='plbl2data_data2ptr',
    # Order kept exactly as in the original list.
    label_names=[
        'cat2data_idx', 'cat2data_input_ids', 'cat2data_attention_mask',
        'cat2lbl2data_idx', 'cat2lbl2data_input_ids', 'cat2lbl2data_attention_mask',
        'hlk2data_input_ids', 'hlk2data_attention_mask', 'hlk2data_idx',
        'hlk2lbl2data_input_ids', 'hlk2lbl2data_attention_mask', 'hlk2lbl2data_idx',
    ],
)



In [10]:
#| export
# Evaluate on a seeded 2000-example sample of the test split rather than the full split.
test_dset = block.test.dset.sample(n=2000, seed=50)

# Precision/recall metric built from the label count, the sampled test set's
# label filterer and the training data-label matrix (passed as `prop`).
metric = PrecRecl(
    block.n_lbl,
    test_dset.data.data_lbl_filterer,
    prop=block.train.dset.data.data_lbl,
    pk=10,
    rk=200,
    rep_pk=[1, 3, 5, 10],
    rep_rk=[10, 100, 200],
)

In [11]:
#| export
# Batch size handed to the model: the larger of the per-device train/eval batch
# sizes, multiplied by the number of visible CUDA devices.
# NOTE: this evaluates to 0 when no CUDA device is visible.
bsz = torch.cuda.device_count() * max(
    args.per_device_train_batch_size,
    args.per_device_eval_batch_size,
)

# Start from the pretrained DistilBERT checkpoint; the remaining keyword
# arguments are DBT020-specific settings.
model = DBT020.from_pretrained(
    'distilbert-base-uncased',
    ig_tok=0,
    bsz=bsz,
    tn_targ=5000,
    tn_meta=5000,
    margin=0.3,
    tau=0.1,
    n_negatives=5,
    apply_softmax=True,
    lw=0.01,
    m_lw=0.3,
    pred_meta_prefix='cat',
    data_aug_meta_prefix='hlk2data',
    lbl2data_aug_meta_prefix='hlk2lbl',
    tie_word_embeddings=False,
)

model.init_dr_head()

Some weights of DBT020 were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.module.dr_layer_norm.bias', 'encoder.module.dr_layer_norm.weight', 'encoder.module.dr_projector.bias', 'encoder.module.dr_projector.weight', 'encoder.module.dr_transform.bias', 'encoder.module.dr_transform.weight', 'encoder.module.fuser.k.bias', 'encoder.module.fuser.k.weight', 'encoder.module.fuser.o.bias', 'encoder.module.fuser.o.weight', 'encoder.module.fuser.q.bias', 'encoder.module.fuser.q.weight', 'encoder.module.fuser.v.bias', 'encoder.module.fuser.v.weight', 'encoder.module.ln.bias', 'encoder.module.ln.weight', 'encoder.module.vocab_projector.weight', 'vocab_projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
#| export
# Build the trie from the data block; it is passed to XCLearner as `trie`.
trie = XCTrie.from_block(block)

  0%|          | 0/312330 [00:00<?, ?it/s]

In [13]:
#| export
# Assemble the learner from the pieces built above.
learn = XCLearner(
    # configuration and model
    args=args,
    model=model,
    trie=trie,
    # data: full training split, sampled test set for evaluation
    data_collator=block.collator,
    train_dataset=block.train.dset,
    eval_dataset=test_dset,
    # evaluation metric
    compute_metrics=metric,
)

In [14]:
# Interactive (non-exported) training run; the exported script trains from the
# `__main__` guard at the end of the notebook instead.
# NOTE(review): the output recorded for this cell ends in
# "TypeError: object of type 'NoneType' has no len()", raised under
# `get_representation` while the dataloader sampler clusters were being updated.
# The root cause is not visible from this notebook.
learn.train()

[2024-06-01 16:26:53,231] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)




  0%|          | 0/1731 [00:00<?, ?it/s]

node-0:4149074:4149074 [0] NCCL INFO Bootstrap : Using eth0:10.13.60.215<0>
node-0:4149074:4149074 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
node-0:4149074:4149074 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
node-0:4149074:4149074 [0] NCCL INFO Kernel version: 5.15.0-1042-azure
RCCL version 2.17.1+hip5.7 HEAD:cbbb3d8+

node-0:4149074:4151771 [3] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/misc/ibvwrap.cc:222 NCCL WARN Call to ibv_open_device failed

node-0:4149074:4151771 [3] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/transport/net_ib.cc:199 NCCL WARN NET/IB : Unable to open device mlx5_0

node-0:4149074:4151771 [3] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/misc/ibvwrap.cc:222 NCCL WARN Call to ibv_open_device failed

nod

node-0:4149074:4151768 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 3/-1/-1->1->0 [2] -1/-1/-1->1->3 [3] -1/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 3/-1/-1->1->0 [6] -1/-1/-1->1->3 [7] -1/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 3/-1/-1->1->0 [10] -1/-1/-1->1->3 [11] -1/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 3/-1/-1->1->0 [14] -1/-1/-1->1->3 [15] -1/-1/-1->1->0 comm 0xee1b730 nRanks 04 busId e00000
node-0:4149074:4151768 [1] NCCL INFO P2P Chunksize set to 524288
node-0:4149074:4151767 [0] NCCL INFO Ring 11 : 1 -> 0 -> 2 comm 0x106325f0 nRanks 04 busId d00000
node-0:4149074:4151769 [2] NCCL INFO Ring 15 : 0 -> 2 -> 3 comm 0x106f7dd0 nRanks 04 busId f00000
node-0:4149074:4151767 [0] NCCL INFO Ring 12 : 2 -> 0 -> 1 comm 0x106325f0 nRanks 04 busId d00000
node-0:4149074:4151769 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] -1/-1/-1->2->3 [2] 3/-1/-1->2->0 [3] 0/-1/-1->2->3 [4] 3/-1/-1->2->1 [5] -1/-1/-1->2->3 [6] 3/-1/-1->2->0 [7] 0/-1/-1->2->3 [8] 3/-1/-1->2->1 [9] -1/-1/-1->2->3 [10] 3/-1/-1->2->0 [11] 0/-1/-1-

node-0:4149074:4151768 [1] NCCL INFO Channel 03/0 : 1[e00000] -> 0[d00000] via P2P/direct pointer comm 0xee1b730 nRanks 04
node-0:4149074:4151771 [3] NCCL INFO Channel 04/0 : 3[1000000] -> 2[f00000] via P2P/direct pointer comm 0x11e226c0 nRanks 04
node-0:4149074:4151768 [1] NCCL INFO Channel 06/0 : 1[e00000] -> 0[d00000] via P2P/direct pointer comm 0xee1b730 nRanks 04
node-0:4149074:4151771 [3] NCCL INFO Channel 05/0 : 3[1000000] -> 2[f00000] via P2P/direct pointer comm 0x11e226c0 nRanks 04
node-0:4149074:4151768 [1] NCCL INFO Channel 07/0 : 1[e00000] -> 0[d00000] via P2P/direct pointer comm 0xee1b730 nRanks 04
node-0:4149074:4151771 [3] NCCL INFO Channel 08/0 : 3[1000000] -> 2[f00000] via P2P/direct pointer comm 0x11e226c0 nRanks 04
node-0:4149074:4151768 [1] NCCL INFO Channel 10/0 : 1[e00000] -> 0[d00000] via P2P/direct pointer comm 0xee1b730 nRanks 04
node-0:4149074:4151771 [3] NCCL INFO Channel 09/0 : 3[1000000] -> 2[f00000] via P2P/direct pointer comm 0x11e226c0 nRanks 04
node-0:4

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/.singularity/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3524, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_4149074/1093534927.py", line 1, in <cell line: 1>
    learn.train()
  File "/opt/conda/envs/ptca/lib/python3.9/site-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/home/aiscuser/scratch/Projects/xcai/xcai/learner.py", line 946, in _inner_training_loop
    self.update_dataloader_sampler(train_dataloader, epoch, num_train_epochs)
  File "/home/aiscuser/scratch/Projects/xcai/xcai/learner.py", line 660, in update_dataloader_sampler
    cluster = self._get_train_data_cluster(epochs_trained, num_train_epochs)
  File "/home/aiscuser/scratch/Projects/xcai/xcai/learner.py", line 653, in _get_train_data_cluster
    data_repr = self.get_representation(dataloader)
  File "/home/aiscuser/scratch/Projects/xcai/xcai/learner.py", line

TypeError: object of type 'NoneType' has no len()

In [None]:
#| export
# Script entry point of the exported module: train only when executed directly.
# NOTE(review): the exported cells above (data loading, model and trie
# construction) run at import time, outside this guard.
if __name__ == '__main__':
    # Only has an effect in frozen Windows executables; harmless elsewhere.
    mp.freeze_support()
    learn.train()
    