# RADGA training pipeline

In [1]:
#| default_exp 42-radga-training-pipeline-for-wikititles

In [2]:
# Reload imported modules before each cell runs, so edits to project code
# (e.g. the `xcai` / `xc_nlg` packages) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
#| hide
# nbdev housekeeping for this hidden cell: pull in the doc helpers, then export
# the notebook's `#| export` cells to the module named by `#| default_exp`.
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [4]:
#| export
import os,sys,torch,pickle,torch.multiprocessing as mp
from xcai.basics import *

In [5]:
# Turn Weights & Biases logging off for interactive runs. This cell carries no
# `#| export` directive, so the exported script does not inherit this setting.
os.environ['WANDB_MODE'] = 'disabled'

In [6]:
#| export
# Process-wide settings for this run: which GPUs are visible to CUDA and which
# Weights & Biases project the run is filed under.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '4,5',
    'WANDB_PROJECT': 'xc-nlg_42-radga-training-pipeline-for-wikititles',
})

# Make the local `xc_nlg` checkout importable.
sys.path.append('/home/aiscuser/scratch/Projects/xc_nlg')

In [7]:
#| export
from xc_nlg.models.radga import *

## Data

In [8]:
#| export
# Build the dataset block (train/test datasets plus collator) for `wikititles`.
block = XCBlock.from_cfg(
    '/home/aiscuser/scratch/datasets',   # dataset root directory
    'data_metas',                        # configuration name -- presumably "data + metadata"; confirm in xcai
    dset='wikititles',
    valid_pct=0.001,                     # presumably the fraction held out for validation -- confirm
    tfm='xcnlg',
    tokenizer='distilbert-base-uncased',
    # One entry per sampled feature; NOTE(review): the meaning of the two integers
    # in each tuple is defined by xcai and is not visible from this notebook.
    smp_features=[('lbl2data', 1, 2), ('sal2data', 1, 1), ('hlk2data', 1, 3)],
    n_data_meta_samples=50,
    n_lbl_meta_samples=5,
)

  self._set_arrayXarray(i, j, x)


In [9]:
# Cache the freshly built block on disk so later sessions can skip the rebuild.
# NOTE(review): the file is named "wikiseealso" although the block above is built
# with dset='wikititles' -- confirm this does not overwrite another dataset's cache.
pkl_dir = '/home/aiscuser/scratch/datasets/processed/'
with open(f'{pkl_dir}/wikiseealso-radga.pkl', 'wb') as fout:
    pickle.dump(block, fout)

In [29]:
# Restore the cached block instead of rebuilding it.
# `pickle.load` executes code embedded in the file: only load caches you wrote yourself.
# NOTE(review): the file is named "wikiseealso" although this notebook trains on
# 'wikititles' -- confirm the cache really holds the intended dataset.
pkl_dir = '/home/aiscuser/scratch/datasets/processed/'
with open(f'{pkl_dir}/wikiseealso-radga.pkl', 'rb') as fin:
    block = pickle.load(fin)

In [19]:
#| export
def remove_columns(x):
    """Strip the `phlk2data` index/pointer entries from a batch.

    `x` is modified in place: the keys 'phlk2data_idx' and
    'phlk2data_data2ptr' are popped when present, every other entry is left
    alone. The same object is returned so the function can be chained as a
    transform.
    """
    unwanted = ('phlk2data_idx', 'phlk2data_data2ptr')
    for key in unwanted:
        if key not in x:
            continue
        x.pop(key)
    return x

# Register `remove_columns` as a collator transform, keeping the cell idempotent.
# A plain `.append` stacks one more copy every time this cell is re-executed in a
# live kernel. Re-running also redefines the function, so an earlier registration
# is recognised by name (not identity) and swapped for the current definition;
# the transform is appended only when no registration exists yet.
_tfms = block.collator.tfms.tfms
for _i, _tfm in enumerate(_tfms):
    if getattr(_tfm, '__name__', None) == 'remove_columns':
        _tfms[_i] = remove_columns
        break
else:
    _tfms.append(remove_columns)

In [21]:
#| export
# Training configuration. Keyword arguments are grouped by concern; the grouping
# is purely cosmetic because every argument is passed by name.
args = XCLearningArguments(
    # --- output, checkpointing and evaluation cadence ---
    output_dir='/home/aiscuser/outputs/42-radga-training-pipeline-for-wikititles-2-0',
    logging_first_step=True,
    evaluation_strategy='steps',
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    metric_for_best_model='P@1_REPR',

    # --- schedule and batch sizes (per device; multiplied by the GPU count for the model's `bsz`) ---
    num_train_epochs=300,
    per_device_train_batch_size=800,
    per_device_eval_batch_size=800,
    fp16=True,

    # --- optimiser ---
    learning_rate=2e-4,
    weight_decay=0.1,
    adam_epsilon=1e-6,
    warmup_steps=100,

    # --- representation-based prediction ---
    predict_with_representation=True,
    representation_search_type='BRUTEFORCE',
    representation_num_beams=200,
    representation_accumulation_steps=100,

    # --- generation-based prediction ---
    predict_with_generation=True,
    generation_num_beams=10,
    generation_length_penalty=1.5,
    output_concatenation_weight=1.0,

    # --- cluster-based batching ---
    group_by_cluster=True,
    clustering_type='EXPO',
    num_clustering_warmup_epochs=1,
    num_cluster_update_epochs=2,
    num_cluster_size_update_epochs=4,
    minimum_cluster_size=1,
    maximum_cluster_size=300,

    # --- batch keys that carry the targets and the auxiliary metadata ---
    target_indices_key='plbl2data_idx',
    target_pointer_key='plbl2data_data2ptr',
    label_names=[
        'sal2data_input_ids', 'sal2data_attention_mask', 'sal2data_data2ptr', 'sal2data_idx',
        'psal2data_idx', 'psal2data_data2ptr',
        'hlk2data_input_ids', 'hlk2data_attention_mask', 'hlk2data_data2ptr', 'hlk2data_idx',
    ],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
#| export
# Evaluate on a fixed 2000-example sample of the test split (seeded, so the
# same subset is drawn on every run).
test_dset = block.test.dset.sample(n=2000, seed=50)

# Precision/recall metric over all `block.n_lbl` labels, using the sampled
# test set's filterer and the training label matrix as `prop`.
metric = PrecRecl(
    block.n_lbl,
    test_dset.data.data_lbl_filterer,
    prop=block.train.dset.data.data_lbl,
    pk=10,
    rk=200,
    rep_pk=[1, 3, 5, 10],
    rep_rk=[10, 100, 200],
)

In [23]:
#| export
# Global batch size handed to the model: number of visible GPUs times the
# larger of the per-device train/eval batch sizes.
bsz = torch.cuda.device_count() * max(args.per_device_train_batch_size, args.per_device_eval_batch_size)

# Start from the pretrained DistilBERT weights; the extra heads listed in the
# load warning are newly initialised and are learned during training.
model = DBT019.from_pretrained(
    'distilbert-base-uncased',
    ig_tok=0,
    bsz=bsz,
    tn_targ=10_000,
    tn_meta=10_000,
    margin=0.3,
    tau=0.1,
    n_negatives=5,
    apply_softmax=True,
    lw=0.01,
    m_lw=0.1,
    pred_meta_prefix='sal',
    aug_meta_prefix='hlk',
    init_drh=True,
)

Some weights of DBT019 were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dr_layer_norm.bias', 'dr_layer_norm.weight', 'dr_projector.bias', 'dr_projector.weight', 'dr_transform.bias', 'dr_transform.weight', 'fuser.k.bias', 'fuser.k.weight', 'fuser.layer_norm.bias', 'fuser.layer_norm.weight', 'fuser.o.bias', 'fuser.o.weight', 'fuser.q.bias', 'fuser.q.weight', 'fuser.v.bias', 'fuser.v.weight', 'gen_lfn.o', 'rep_lfn.u', 'rep_lfn.v']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
#| export
# Build the trie from the block; it is handed to `XCLearner` as `trie=`.
# Presumably it indexes the tokenised labels for generation-time decoding -- confirm in xcai.
trie = XCTrie.from_block(block)

  0%|          | 0/312330 [00:00<?, ?it/s]

In [25]:
#| export
# Assemble the learner from the pieces built in the cells above.
learn = XCLearner(
    # what to train and how
    model=model,
    args=args,
    # data: full training split, sampled test subset, and the block's collator
    train_dataset=block.train.dset,
    eval_dataset=test_dset,
    data_collator=block.collator,
    # evaluation helpers
    compute_metrics=metric,
    trie=trie,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
#| export
# Entry point: start training only when this file runs as the main program
# (also true inside a notebook kernel), not when it is imported.
if __name__ == '__main__':
    # `mp` is `torch.multiprocessing`. Per the Python multiprocessing docs,
    # freeze_support() only matters for frozen Windows executables and is a
    # no-op otherwise; it must be called before any worker process starts.
    mp.freeze_support()
    learn.train()