In [None]:
#| default_exp 64-ngame-ep-for-wikiseealso-with-entropy-loss

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import os,torch, torch.multiprocessing as mp, pickle
from xcai.basics import *
from xcai.models.PPP0XX import DBT012

comet_ml is installed but `COMET_API_KEY` is not set.


In [None]:
# Disable Weights & Biases logging for interactive runs (this cell has no `#| export`,
# so the setting is not part of the exported script).
os.environ['WANDB_MODE'] = 'disabled'

In [None]:
#| export
# Expose only GPUs 0 and 1 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# NOTE(review): the project name refers to experiment "66-radga-dr", whereas this notebook's
# default_exp is "64-ngame-ep-for-wikiseealso-with-entropy-loss" — confirm this is intended.
os.environ['WANDB_PROJECT']='xc-nlg_66-radga-dr-ep-for-wikiseealso'

In [None]:
#| export
# Dataset root handed to XCBlock.from_cfg when the data block is built.
# NOTE(review): absolute, user-specific path — must be changed to run elsewhere.
data_dir = '/home/scai/phd/aiz218323/Projects/XC_NLG/data'

In [None]:
# Build the data block with XCBlock.from_cfg using the 'xcnlg' transform and the
# distilbert-base-uncased tokenizer. Not exported: the exported script reads the pickled copy instead.
# presumably smp_features=[('lbl2data',1,1)] configures sampling of the 'lbl2data' feature — confirm in xcai.
block = XCBlock.from_cfg(data_dir, 'data', tfm='xcnlg', tokenizer='distilbert-base-uncased', smp_features=[('lbl2data',1,1)])

In [None]:
#| export
# Location of the pickled data block that is written by the dump cell and read by the load cell.
# NOTE(review): absolute, user-specific path — must be changed to run elsewhere.
pkl_dir = '/home/scai/phd/aiz218323/scratch/datasets'
pkl_file = f'{pkl_dir}/processed/wikiseealso_data_distilbert-base-uncased_xcnlg_ngame.pkl'

In [None]:
# Serialise the current `block` to pkl_file (not exported; overwrites any existing file).
with open(pkl_file, 'wb') as file: pickle.dump(block, file)

In [None]:
#| export
# Load the cached data block from pkl_file.
# pickle.load can execute arbitrary code from the file — only load pickles you created yourself.
with open(pkl_file, 'rb') as file: block = pickle.load(file)

## Training

In [None]:
#| export
# Training configuration consumed by the XCLearner built further down.
args = XCLearningArguments(
    output_dir='/home/scai/phd/aiz218323/scratch/outputs/64-ngame-ep-for-wikiseealso-with-entropy-loss-1-0',
    logging_first_step=True,
    per_device_train_batch_size=800,
    per_device_eval_batch_size=800,
    representation_num_beams=200,
    representation_accumulation_steps=10,
    # NOTE(review): both strategies are "epoch" while eval_steps/save_steps are also set;
    # presumably the step values are ignored (as in Hugging Face TrainingArguments) — confirm.
    save_strategy="epoch",
    evaluation_strategy="epoch",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=5,
    num_train_epochs=300,
    # Evaluation retrieves labels through representations using an index over inner-product ('ip') space.
    predict_with_representation=True,
    representation_search_type='INDEX',
    index_space='ip',
    adam_epsilon=1e-6,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-4,
    # Cluster-based batching options; exact semantics are defined in xcai — TODO confirm there.
    group_by_cluster=True,
    num_clustering_warmup_epochs=10,
    num_cluster_update_epochs=5,
    num_cluster_size_update_epochs=25,
    clustering_type='EXPO',
    minimum_cluster_size=2,
    maximum_cluster_size=1600,
    target_indices_key='plbl2data_idx',
    target_pointer_key='plbl2data_data2ptr',
    use_distributional_representation=True,
    use_encoder_parallel=True,
    # NOTE(review): presumably None disables gradient clipping, here combined with fp16 — confirm.
    max_grad_norm=None,
    fp16=True,
)

In [None]:
#| export
# Evaluation metric built from the label count, the test-set label filterer and the training
# label matrix (passed as `prop`; presumably used for the propensity-scored metrics — confirm in xcai).
metric = PrecRecl(block.n_lbl, block.test.data_lbl_filterer, prop=block.train.dset.data.data_lbl,
                  pk=10, rk=200, rep_pk=[1, 3, 5, 10], rep_rk=[10, 100, 200])

In [None]:
#| export
# Larger of the two per-device batch sizes times the number of visible CUDA devices.
# NOTE(review): `bsz` is not read by any cell visible in this notebook.
bsz = max(args.per_device_train_batch_size, args.per_device_eval_batch_size)*torch.cuda.device_count()

# Start from the msmarco DistilBERT checkpoint, then call init_dr_head() on the loaded model
# (the recorded load warning lists the encoder.dr_* weights as newly initialised).
model = DBT012.from_pretrained('sentence-transformers/msmarco-distilbert-base-v4', margin=0.01, tau=10,
                               n_negatives=10, apply_softmax=True, use_encoder_parallel=True)
model.init_dr_head()

Some weights of DBT012 were not initialized from the model checkpoint at sentence-transformers/msmarco-distilbert-base-v4 and are newly initialized: ['encoder.dr_layer_norm.bias', 'encoder.dr_layer_norm.weight', 'encoder.dr_projector.bias', 'encoder.dr_projector.weight', 'encoder.dr_transform.bias', 'encoder.dr_transform.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#| export
# Learner that ties together the model, training arguments, train/test datasets,
# the block's collator and the evaluation metric.
learn = XCLearner(
    model=model, 
    args=args,
    train_dataset=block.train.dset,
    eval_dataset=block.test.dset,
    data_collator=block.collator,
    compute_metrics=metric,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Interactive training run. This cell is not exported; the exported script trains from
# the `if __name__ == '__main__'` cell instead.
learn.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


  0%|          | 0/347 [00:00<?, ?it/s]

In [None]:
#| export
# Entry point of the exported script: train only when the module is executed as a program.
# In a notebook kernel __name__ is also '__main__', so running this cell starts training as well.
if __name__ == '__main__':
    # multiprocessing's freeze_support hook (only relevant for frozen executables).
    mp.freeze_support()
    learn.train()

## Prediction

In [None]:
# Inference-time arguments. Rebinds `args`: same output_dir and batch sizes as the training
# configuration, but brute-force representation search and no training-schedule options.
args = XCLearningArguments(
    output_dir='/home/scai/phd/aiz218323/scratch/outputs/64-ngame-ep-for-wikiseealso-with-entropy-loss-1-0',
    logging_first_step=True,
    per_device_train_batch_size=800,
    per_device_eval_batch_size=800,
    representation_num_beams=200,
    representation_accumulation_steps=100,
    predict_with_representation=True,
    representation_search_type='BRUTEFORCE',
    target_indices_key='plbl2data_idx',
    target_pointer_key='plbl2data_data2ptr',
    use_encoder_parallel=True,
    fp16=True,
)

In [None]:
# Resolve the checkpoint directory to evaluate: the basename of get_best_model(output_dir)
# re-rooted under output_dir.
# NOTE(review): output_dir is rebuilt from the basename of args.output_dir under the same
# outputs/ root, so with the args above it is the same path as args.output_dir.
output_dir = f"/home/scai/phd/aiz218323/scratch/outputs/{os.path.basename(args.output_dir)}"
mname = f'{output_dir}/{os.path.basename(get_best_model(output_dir))}'

In [None]:
# NOTE(review): `bsz` is not read by any cell visible in this notebook.
bsz = max(args.per_device_train_batch_size, args.per_device_eval_batch_size)*torch.cuda.device_count()

# Re-create the model from the base checkpoint; the fine-tuned weights are loaded from the
# saved checkpoint in the following cells. Unlike the training cell, init_dr_head() is not called here.
model = DBT012.from_pretrained('sentence-transformers/msmarco-distilbert-base-v4', margin=0.01, tau=10, 
                               n_negatives=10, apply_softmax=True, use_encoder_parallel=True)

Some weights of DBT012 were not initialized from the model checkpoint at sentence-transformers/msmarco-distilbert-base-v4 and are newly initialized: ['encoder.dr_layer_norm.bias', 'encoder.dr_layer_norm.weight', 'encoder.dr_projector.bias', 'encoder.dr_projector.weight', 'encoder.dr_transform.bias', 'encoder.dr_transform.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from safetensors import safe_open

model_weight_file = f'{mname}/model.safetensors'

# Read every tensor stored in the checkpoint into a plain {name: tensor} dictionary.
with safe_open(model_weight_file, framework="pt") as ckpt:
    model_weights = {name: ckpt.get_tensor(name) for name in ckpt.keys()}
        

In [None]:
# strict=False reports key mismatches instead of raising; the recorded output lists the
# distilbert.* parameters as missing from the checkpoint.
# NOTE(review): presumably those parameters are tied to encoder.* weights that were loaded —
# confirm, otherwise they silently keep the base-checkpoint values.
model.load_state_dict(model_weights, strict=False)

_IncompatibleKeys(missing_keys=['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.

In [None]:
# Metric using the test-set label filterer.
metric = PrecRecl(block.n_lbl, block.test.data_lbl_filterer, prop=block.train.dset.data.data_lbl,
                  pk=10, rk=200, rep_pk=[1, 3, 5, 10], rep_rk=[10, 100, 200])

In [None]:
# Metric using the train-set label filterer.
# NOTE(review): rebinds `metric`. On a top-to-bottom run this replaces the test-filterer metric
# before the prediction learner is built — run only the variant that matches the split being evaluated.
metric = PrecRecl(block.n_lbl, block.train.data_lbl_filterer, prop=block.train.dset.data.data_lbl,
                  pk=10, rk=200, rep_pk=[1, 3, 5, 10], rep_rk=[10, 100, 200])

In [None]:
# Learner used for prediction. It receives whichever `metric` and `args` are bound when this cell runs.
learn = XCLearner(
    model=model, 
    args=args,
    train_dataset=block.train.dset,
    eval_dataset=block.test.dset,
    data_collator=block.collator,
    compute_metrics=metric,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Predict on the test split; `o` is displayed and pickled by the cells that follow.
o = learn.predict(block.test.dset)

  0%|          | 0/196 [00:00<?, ?it/s]

In [None]:
# Show the metrics of the prediction output currently bound to `o`.
display_metric(o.metrics)

Unnamed: 0,P@1,P@3,P@5,P@10,N@1,N@3,N@5,N@10,PSP@1,PSP@3,PSP@5,PSP@10,PSN@1,PSN@3,PSN@5,PSN@10,R@10,R@100,R@200,loss,runtime,samples_per_second,steps_per_second
0,25.3939,16.998,13.0896,8.6169,25.3939,25.6131,26.8311,28.9641,21.3168,22.6493,24.5848,28.9135,21.3168,23.1711,24.7409,26.9363,34.507,53.3457,58.4375,0.0321,112.0889,1583.699,0.99


In [None]:
# Persist the prediction output in `o` under the checkpoint directory as test_predictions.pkl.
# pred_dir already ends with '/', so the path below contains '//' (treated as a single separator on POSIX).
pred_dir = f"{mname}/predictions/"
os.makedirs(pred_dir, exist_ok=True)

with open(f'{pred_dir}/test_predictions.pkl', 'wb') as file: pickle.dump(o, file)

In [None]:
# Predict on the train split; rebinds `o`, replacing the previous prediction output.
o = learn.predict(block.train.dset)

  0%|          | 0/196 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


In [None]:
# Show the metrics of the prediction output currently bound to `o`.
display_metric(o.metrics)

Unnamed: 0,P@1,P@3,P@5,P@10,N@1,N@3,N@5,N@10,PSP@1,PSP@3,PSP@5,PSP@10,PSN@1,PSN@3,PSN@5,PSN@10,R@10,R@100,R@200,loss,runtime,samples_per_second,steps_per_second
0,63.7769,35.9293,25.4845,15.015,63.7769,68.7058,71.5651,74.504,56.0374,61.9516,67.3829,76.2451,56.0374,64.1684,67.8776,71.3049,85.601,93.6158,93.9624,0.0003,487.8279,1420.751,0.89


In [None]:
# Persist the prediction output in `o` under the checkpoint directory as train_predictions.pkl.
# pred_dir already ends with '/', so the path below contains '//' (treated as a single separator on POSIX).
pred_dir = f"{mname}/predictions/"
os.makedirs(pred_dir, exist_ok=True)

with open(f'{pred_dir}/train_predictions.pkl', 'wb') as file: pickle.dump(o, file)

In [None]:
# Compute representations for the test data points.
# presumably to_cpu=False leaves the result on the compute device — confirm in XCLearner.
dataloader = learn.get_test_dataloader(block.test.dset.data_dset)
data_repr = learn.get_representation(dataloader, representation_attribute='data_repr', to_cpu=False)

  0%|          | 0/111 [00:00<?, ?it/s]

In [None]:
# Compute representations for the labels (lbl_dset); rebinds `dataloader`.
# NOTE(review): representation_attribute is 'data_repr' here too — presumably the label inputs
# go through the same model output; confirm this is intended.
dataloader = learn.get_test_dataloader(block.test.dset.lbl_dset)
lbl_repr = learn.get_representation(dataloader, representation_attribute='data_repr', to_cpu=False)

  0%|          | 0/196 [00:00<?, ?it/s]

In [None]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from scipy import sparse
import xclib.evaluation.xc_metrics as xc_metrics
import torch.nn.functional as F

In [None]:
# Iterate over the data representations in their original order, 64 rows per batch.
dl = DataLoader(data_repr, batch_size=64, shuffle=False)

In [None]:
# Score every data representation against all label representations, keep the top-k
# labels per row, and pack the result into a CSR prediction matrix.
topk = min(200, lbl_repr.shape[0])  # torch.topk cannot return more entries than there are labels

score_chunks, index_chunks = [], []
for b in tqdm(dl, total=len(dl)):
    sc, idx = torch.topk(b@lbl_repr.T, k=topk, dim=1)
    score_chunks.append(sc.to('cpu'))
    index_chunks.append(idx.to('cpu'))

# Concatenate once after the loop: stacking onto a growing tensor inside the loop
# re-copies all previously collected rows on every iteration.
data, indices = torch.cat(score_chunks), torch.cat(index_chunks)
# Every row holds exactly `topk` entries, so row i occupies [i*topk, (i+1)*topk).
indptr = torch.arange(0, topk*(data.shape[0]+1), topk)

# Hand SciPy plain NumPy arrays instead of relying on implicit tensor conversion.
preds = sparse.csr_matrix((data.flatten().numpy(), indices.flatten().numpy(), indptr.numpy()),
                          shape=(data_repr.shape[0], block.n_lbl))

  0%|          | 0/2774 [00:00<?, ?it/s]

In [None]:
# Precision of the filtered `preds` against the test labels (the recorded output has five values).
# Author's label for this run — presumably the scoring expression used in the top-k loop
# when `preds` was built (confirm):
# F.normalize(b, dim=1)@F.normalize(lbl_repr, dim=1).T
xc_metrics.precision(Filterer.apply(preds, block.test.data_lbl_filterer), block.test.dset.data.data_lbl)

array([0.25393347, 0.20215193, 0.16999127, 0.14743261, 0.13089373])

In [None]:
# Precision of the filtered `preds` against the test labels (the recorded output has five values).
# Author's label for this run — presumably the scoring expression used in the top-k loop
# when `preds` was built (confirm):
# F.normalize(b.exp(), dim=1)@F.normalize(lbl_repr, dim=1).T
xc_metrics.precision(Filterer.apply(preds, block.test.data_lbl_filterer), block.test.dset.data.data_lbl)

array([0.24737064, 0.19481171, 0.16283507, 0.14079233, 0.12457313])

In [None]:
# Precision of the filtered `preds` against the test labels (the recorded output has five values).
# Author's label for this run — presumably the scoring expression used in the top-k loop
# when `preds` was built (confirm):
# b.exp()@lbl_repr
xc_metrics.precision(Filterer.apply(preds, block.test.data_lbl_filterer), block.test.dset.data.data_lbl)

array([0.24907754, 0.19628482, 0.16382465, 0.14101766, 0.12455511])

In [None]:
# Precision of the filtered `preds` against the test labels (the recorded output has five values).
# Author's label for this run — presumably the scoring expression used in the top-k loop
# when `preds` was built (confirm). The recorded precision for this variant is close to zero.
# b@lbl_repr
xc_metrics.precision(Filterer.apply(preds, block.test.data_lbl_filterer), block.test.dset.data.data_lbl)

array([1.12666535e-05, 5.63332676e-06, 2.21577519e-04, 1.80266456e-04,
       1.45339830e-04])

## Fusion

In [None]:
# Run directory of the entropy-loss experiment (same path as this notebook's output_dir).
dirname = '/home/scai/phd/aiz218323/scratch/outputs/64-ngame-ep-for-wikiseealso-with-entropy-loss-1-0'

In [None]:
# Run directory of experiment 59 (ngame-ep with cls-for-dr).
# NOTE(review): rebinds `dirname`; on a top-to-bottom run only this value reaches the cells below.
dirname = '/home/scai/phd/aiz218323/scratch/outputs/59-ngame-ep-for-wikiseealso-with-cls-for-dr-1-1'

In [None]:
# Resolve the checkpoint directory for the run selected by `dirname`: the basename of
# get_best_model(output_dir) re-rooted under output_dir. Rebinds output_dir and mname.
output_dir = f"/home/scai/phd/aiz218323/scratch/outputs/{os.path.basename(dirname)}"
mname = f'{output_dir}/{os.path.basename(get_best_model(output_dir))}'

In [None]:
# Display the resolved checkpoint path as a sanity check.
mname

'/home/scai/phd/aiz218323/scratch/outputs/59-ngame-ep-for-wikiseealso-with-cls-for-dr-1-1/checkpoint-130200'

In [None]:
# Load the pickled train/test prediction outputs stored under the checkpoint selected by `mname`.
# pickle.load can execute arbitrary code from the file — only load pickles you created yourself.
pred_dir = f'{mname}/predictions/'

with open(f'{pred_dir}/train_predictions.pkl', 'rb') as f: train_pred = pickle.load(f)
    
with open(f'{pred_dir}/test_predictions.pkl', 'rb') as f: test_pred = pickle.load(f)

In [None]:
def get_sparse_matrix(o, n_lbl):
    """Convert a flat prediction output into a SciPy CSR matrix.

    Args:
        o: object exposing `pred_score` (flat scores), `pred_idx` (flat column
            indices) and `pred_ptr` (number of predictions for each row).
        n_lbl: number of columns of the resulting matrix.

    Returns:
        scipy.sparse.csr_matrix of shape (len(o.pred_ptr), n_lbl).
    """
    def _as_cpu(x):
        # Accept tensors on any device; drop autograd history so .numpy() is allowed.
        return torch.as_tensor(x).detach().cpu()

    counts = _as_cpu(o.pred_ptr).to(torch.int64)
    # CSR row pointers are the running total of the per-row counts, prefixed with 0.
    indptr = torch.cat([counts.new_zeros(1), counts.cumsum(dim=0)])
    return sparse.csr_matrix(
        (_as_cpu(o.pred_score).numpy(), _as_cpu(o.pred_idx).numpy(), indptr.numpy()),
        shape=(len(counts), n_lbl),
    )
    

In [None]:
# Per-model (train, test) prediction matrices, keyed by model name. Rebinds `preds`.
preds = {}

In [None]:
# Store the currently loaded predictions as the 'entropy' model.
# NOTE(review): this reads the same train_pred/test_pred variables as the 'ngame' entry, so the
# prediction pickles of the matching run must be loaded immediately before this cell.
preds['entropy'] = (get_sparse_matrix(train_pred, block.n_lbl), get_sparse_matrix(test_pred, block.n_lbl))

In [None]:
# Store the currently loaded predictions as the 'ngame' model.
# NOTE(review): this reads the same train_pred/test_pred variables as the 'entropy' entry, so the
# prediction pickles of the matching run must be loaded immediately before this cell.
preds['ngame'] = (get_sparse_matrix(train_pred, block.n_lbl), get_sparse_matrix(test_pred, block.n_lbl))

In [None]:
# Inverse propensity values computed from the training label matrix with A=0.55, B=1.5,
# then passed to the ScoreFusion constructor.
prop = xc_metrics.compute_inv_propesity(block.train.dset.data.data_lbl, A=0.55, B=1.5)
fuser = ScoreFusion(prop)

In [None]:
# Fit the fuser on both models' train-split predictions against the training labels (n_samples=20_000).
fuser.fit(preds['entropy'][0], preds['ngame'][0], block.train.dset.data.data_lbl, n_samples=20_000)

In [None]:
# Fuse both models' test-split predictions.
# presumably beta weights one score source against the other — confirm in ScoreFusion.
pred = fuser.predict(preds['entropy'][1], preds['ngame'][1], beta=0.1)

In [None]:
# Flatten the ground truth and the fused predictions into index / per-row-count / score
# tensors, the keyword layout passed to the metric as metric(**output).
test_lbl = block.test.dset.data.data_lbl

output = {
    'targ_idx': torch.tensor(test_lbl.indices),
    # Per-row entry counts are the differences of consecutive CSR row pointers.
    'targ_ptr': torch.tensor([stop-start for start,stop in zip(test_lbl.indptr, test_lbl.indptr[1:])]),
    'pred_idx': torch.tensor(pred.indices),
    'pred_ptr': torch.tensor([stop-start for start,stop in zip(pred.indptr, pred.indptr[1:])]),
    'pred_score': torch.tensor(pred.data),
}

In [None]:
# Evaluate the fused predictions with the test-set filterer and display the result.
# Rebinds `metric`.
metric = PrecRecl(block.n_lbl, block.test.data_lbl_filterer, prop=block.train.dset.data.data_lbl,
                  pk=10, rk=200, rep_pk=[1, 3, 5, 10], rep_rk=[10, 100, 200])
m = metric(**output)
display_metric(m, remove_prefix=False)

  self._set_arrayXarray(i, j, x)


Unnamed: 0,P@1,P@3,P@5,P@10,N@1,N@3,N@5,N@10,PSP@1,PSP@3,PSP@5,PSP@10,PSN@1,PSN@3,PSN@5,PSN@10,R@10,R@100,R@200
0,31.5878,21.1413,16.0202,10.1679,31.5878,31.453,32.5368,34.4779,25.0819,27.3309,29.5955,34.0623,25.0819,27.657,29.4271,31.6832,39.482,53.4453,55.33
