## 0. Set envs & Load DB, Model and Tokenizer

### Hyperparameter configuration

In [4]:
import easydict
import subprocess
import json
import os
# ======================= CONFIG ==================== #
## GPU setting
# Expose only device index 7 to CUDA-based libraries used by this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '7'

## TASK & DB
TASK_NAME = 'binary_retrieval'
DB = 'dx,prx'
DB_size = 1000
MODEL_TYPE = 'both'

## Feature switches
Unified = False
Align = False
Relation_Classification = True
Scratch_Downstream = False

## Important Model Config
Dim_Hidden = 128
NUM_Layers = dict(lang=2, kg=2, cross=4)
Dropout = 0.1
Num_Negatives = 1
Margin = 1.0
# ======================= CONFIG ==================== #

# Variables
Var_TASK = {}
# Maps each supported MODEL_TYPE to the tag used at the front of RUN_NAME.
Var_MODEL = {
    'both': 'KGenc_LMinit',
    'lm': 'LMinit',
    'kg': 'KGenc',
    'rand': 'Randinit',
}

# Name fragments: each one is empty unless the matching switch is on.
Var_Unified = Var_Align = Var_RC = ''
if Unified:
    Var_Unified = 'Unified'
if Align:
    Var_Align = 'Align_'
if Relation_Classification:
    Var_RC = 'RC_'

# Fail early on unsupported settings.
assert MODEL_TYPE in Var_MODEL, "Model not supported"
assert DB in ('px', 'dx,prx'), "DB not supported"
assert TASK_NAME in ('pretrain', 'binary_retrieval'), "Task not supported"
if Scratch_Downstream is True:
    assert Align is False and Relation_Classification is False, "Scratch start downstream task must turn off alignment prediction & relation classification"

# Model Name
## <LMinit & KGenc> : both, <LMinit only> : lm, <KGenc only> : kg, <RandomInit> : rand
## Unified(Placeholder) for Abstract Node : True or False
MODEL_NAME = "{}_{}{}KGenc".format(
    DB,
    Var_Unified,
    "Uni" if MODEL_TYPE in ("both", "kg") else "No",
)
# RUN_NAME = <model tag>_H<hidden>_L<lang>,<kg>,<cross>_<switch fragments><DB><DB_size>
RUN_NAME = "_".join([
    Var_MODEL[MODEL_TYPE],
    "H{}".format(Dim_Hidden),
    "L{},{},{}".format(NUM_Layers["lang"], NUM_Layers["kg"], NUM_Layers["cross"]),
    "{}{}{}{}{}".format(Var_Align, Var_RC, Var_Unified, DB, DB_size),
])

# Paths
# The experiment root is the parent directory of the current working directory.
EXP_PATH = os.path.dirname(os.getcwd())

# Essential Hyperparameters
# Attribute-style container for the run arguments. Insertion order is kept
# identical to the original literal because later code iterates over it.
args = easydict.EasyDict(dict(
    seed=1234,
    model_type="lxmert",
    do_train=True,
    evaluate_during_training=True,
    do_eval=True,
    edge_cls=Relation_Classification,
    align=Align,
    n_negatives=Num_Negatives,
    prediction_loss_only=False,
    overwrite_output_dir=False,
    mlm_probability=0.15,
    block_size=512,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=1e-4,
    num_train_epochs=50,
    num_log_per_epoch=20,
    num_save_per_epoch=1,
    num_eval_per_epoch=1,
    task=TASK_NAME,
    # Both splits live under data/<DB>_<DB_size>/<MODEL_NAME>/ inside EXP_PATH.
    train_data_file=os.path.join(EXP_PATH, "data/{}_{}/{}/train".format(DB, DB_size, MODEL_NAME)),
    train_data_files=None,
    eval_data_file=os.path.join(EXP_PATH, "data/{}_{}/{}/valid".format(DB, DB_size, MODEL_NAME)),
    # test_data_file=os.path.join(EXP_PATH, "data/{}/test".format(MODEL_NAME)),  (disabled)
    run_name="_".join((TASK_NAME, RUN_NAME)),
))
if TASK_NAME != 'pretrain' and not Scratch_Downstream:
    # Downstream run on top of a pretrained checkpoint: reuse the checkpoint's
    # own config.json and only patch the margin / dropout values into it.
    args['tokenizer_name'] = "bert-base-uncased"
    SRC_PATH = os.path.join(EXP_PATH, 'src/run_downstream.py')
    args['model_name_or_path'] = os.path.join(EXP_PATH, f'pretrained_models/pretrain_{RUN_NAME}')
    with open(f"{args['model_name_or_path']}/config.json") as config_in:
        Config = json.load(config_in)
    Config.update(
        margin=Margin,
        attention_probs_dropout_prob=Dropout,
        hidden_dropout_prob=Dropout,
    )
    # NOTE: this rewrites the pretrained checkpoint's config.json in place.
    with open(f"{args['model_name_or_path']}/config.json", 'w') as config_out:
        json.dump(Config, config_out)
    args['output_dir'] = os.path.join(EXP_PATH, f"pretrained_models/{TASK_NAME}_{RUN_NAME}")
else:
    # Pretraining, or a downstream task trained from scratch: derive a fresh
    # model config from the base template and write it under config/.
    if Scratch_Downstream:
        script_name, output_prefix = 'src/run_downstream.py', 'scratch_'
    else:
        script_name, output_prefix = 'src/run_pretraining.py', ''
    SRC_PATH = os.path.join(EXP_PATH, script_name)
    args['output_dir'] = os.path.join(EXP_PATH, f"pretrained_models/{output_prefix}{TASK_NAME}_{RUN_NAME}")
    args['tokenizer_name'] = "bert-base-uncased"
    args['config_name'] = os.path.join(EXP_PATH, f"config/config_H{Dim_Hidden}_L{NUM_Layers['lang']},{NUM_Layers['kg']},{NUM_Layers['cross']}_{MODEL_TYPE}_{Var_Unified}{DB}.json")

    # Load the base template for this DB (and the Unified variant, if enabled).
    with open(os.path.join(EXP_PATH, f"config/config_{Var_Unified}{DB}.json")) as template_in:
        Config = json.load(template_in)

    # 'gcn' is on for the model types that use the KG encoder; 'use_weight' is on
    # for the model types that start from pretrained language-model weights.
    Config['gcn'] = MODEL_TYPE in ('both', 'kg')
    lang_model_cfg = Config['pretrained_lang_model']
    lang_model_cfg['use_weight'] = MODEL_TYPE in ('both', 'lm')
    Config.update(hidden_size=Dim_Hidden, intermediate_size=Dim_Hidden * 4)
    # Hidden size 128 selects bert-tiny; any other size selects bert-mini.
    lang_model_cfg['model_name'] = "prajjwal1/bert-" + ('tiny' if Dim_Hidden == 128 else 'mini')
    # NOTE(review): 'l_layers' receives the KG depth and 'r_layers' the language
    # depth -- confirm this matches the convention the model code expects.
    Config.update(
        l_layers=NUM_Layers['kg'],
        r_layers=NUM_Layers['lang'],
        x_layers=NUM_Layers['cross'],
    )
    # One token-type id per comma-separated DB header, numbered from 0.
    db_headers = DB.split(',')
    Config['token_type_vocab'] = dict(zip(db_headers, range(len(db_headers))))
    Config['type_vocab_size']['lang'] = len(Config['token_type_vocab'])
    Config.update(
        margin=Margin,
        attention_probs_dropout_prob=Dropout,
        hidden_dropout_prob=Dropout,
    )
    with open(args['config_name'], 'w') as config_out:
        json.dump(Config, config_out)



# Flatten ``args`` into command-line style tokens:
#   * a boolean that is True becomes a bare flag ``--key``;
#   * a boolean that is False is omitted;
#   * anything else becomes ``--key=value``.
# NOTE(review): a ``None`` value is rendered as the literal text ``--key=None``;
# confirm that the consuming argument parser treats it as intended.
args_LIST = [
    f"--{option}" if isinstance(value, bool) else f"--{option}={value}"
    for option, value in args.items()
    if value or not isinstance(value, bool)
]


### Environment settings

In [5]:
# Base packages
import logging
import math
from dataclasses import dataclass, field
from glob import glob
from typing import Optional
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import ConcatDataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import RandomSampler, SequentialSampler

# Own implementation
from utils.parameters import parser
from utils.dataset import get_dataset
from utils.data_collator import NodeClassification_DataCollator, NegativeSampling_DataCollator
from trainer import Trainer
from model import LxmertForKGTokPredAndMaskedLM, LxmertForRanking

# From Huggingface transformers package
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    LxmertConfig,
    LxmertTokenizer,
    PreTrainedTokenizer,
    # Trainer,
    set_seed,
)

# Set environments
# Call transformers' set_seed with the seed stored in the run arguments.
set_seed(args.seed)

### Load tokenizer

In [6]:
# Instantiate the tokenizer named in the run arguments (no explicit cache dir).
tokenizer = LxmertTokenizer.from_pretrained(
    args.tokenizer_name,
    cache_dir=None,
)
## Sanity check
# Tokenize a probe sentence that contains the special tokens and print its ids.
sanity_tokens = tokenizer.tokenize('Just for sanity check [CLS] [SEP] [MASK] [PAD]')
print(tokenizer.convert_tokens_to_ids(sanity_tokens))

[2074, 2005, 20039, 4638, 101, 102, 103, 0]


### Load pretrained model

In [7]:
# Load configuration
config = LxmertConfig.from_pretrained(args.model_name_or_path)


def _load_ranking_model():
    """Instantiate ``LxmertForRanking`` from ``args.model_name_or_path`` with ``config``."""
    return LxmertForRanking.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )


try:
    model = _load_ranking_model()
except Exception as load_error:
    # ``except Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
    # and SystemExit propagate instead of triggering the checkpoint rewrite below.
    ckpt_path = os.path.join(args.model_name_or_path, 'pytorch_model.bin')
    print(f"Loading LxmertForRanking failed ({load_error!r}); "
          f"removing 'pooler' weights from {ckpt_path} and retrying.")
    # Fallback: drop every entry whose name contains 'pooler' from the saved
    # weights and retry.
    # NOTE: this overwrites pytorch_model.bin in place.
    load_model_dict = torch.load(ckpt_path)
    modified_model_dict = load_model_dict.copy()
    for param in load_model_dict:
        if 'pooler' in param:
            modified_model_dict.pop(param)
    torch.save(modified_model_dict, ckpt_path)

    model = _load_ranking_model()
# Switch the model to evaluation mode.
model.eval()

## Sanity check
print('='*100)
print(config)
print('='*100)
print(model)
print('='*100)

Some weights of the model checkpoint at /workspace/experiments/kg_txt_multimodal/lxmert/pretrained_models/pretrain_KGenc_LMinit_H128_L2,2,4_RC_dx,prx1000 were not used when initializing LxmertForRanking: ['classifier.weight', 'classifier.bias', 'edge_classifier.0.weight', 'edge_classifier.0.bias', 'edge_classifier.2.weight', 'edge_classifier.2.bias', 'lm_head.predictions.bias', 'lm_head.predictions.transform.dense.weight', 'lm_head.predictions.transform.dense.bias', 'lm_head.predictions.transform.LayerNorm.weight', 'lm_head.predictions.transform.LayerNorm.bias', 'lm_head.predictions.decoder.weight']
- This IS expected if you are initializing LxmertForRanking from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LxmertForRanking from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

LxmertConfig {
  "_name_or_path": "/workspace/experiments/kg_txt_multimodal/lxmert/pretrained_models/pretrain_KGenc_LMinit_H128_L2,2,4_RC_dx,prx1000",
  "architectures": [
    "LxmertForKGTokPredAndMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "contrastive_learning": false,
  "gcn": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "kg_special_token_ids": {
    "CLS": 2,
    "MASK": 1,
    "PAD": 0
  },
  "l_layers": 2,
  "layer_norm_eps": 1e-12,
  "margin": 1.0,
  "max_position_embeddings": {
    "kg": 0,
    "lang": 512
  },
  "model_type": "lxmert",
  "negative_samples": 0,
  "num_attention_heads": 4,
  "num_attr_labels": 400,
  "num_hidden_layers": {
    "cross_encoder": 4,
    "language": 2,
    "vision": 2
  },
  "num_kg_labels": 8049,
  "num_object_labels": 1600,
  "num_qa_labels": 9500,
  "num_relations": 7,
  "pretrained_kg_embedding": "",
  "pretrained_lang_model": {
    "mode

## 1. Class-wise accuracy

### Dataloader with masking

In [8]:
# A non-positive block size means "use the tokenizer's maximum length" (our
# input block size will be the max possible for the model); otherwise the
# requested size is capped at that maximum.
args.block_size = (
    tokenizer.max_len
    if args.block_size <= 0
    else min(args.block_size, tokenizer.max_len)
)

# Get datasets
# The second call passes evaluate=True to request the evaluation split.
train_dataset = get_dataset(args, tokenizer=tokenizer)
eval_dataset = get_dataset(args, tokenizer=tokenizer, evaluate=True)

# Keyword arguments shared by the training and evaluation collators.
shared_collator_kwargs = {
    "tokenizer": tokenizer,
    "align": True,
    "edge_cls": False,
    "kg_special_token_ids": config.kg_special_token_ids,
    "kg_size": config.vocab_size['kg'],
}
# Only the training collator is given an explicit number of negatives.
train_data_collator = NodeClassification_DataCollator(
    n_negatives=args.n_negatives,
    **shared_collator_kwargs,
)
eval_data_collator = NodeClassification_DataCollator(**shared_collator_kwargs)
def _get_train_sampler() -> Optional[torch.utils.data.sampler.Sampler]:
    """Return a ``RandomSampler`` over the notebook-level ``train_dataset``."""
    sampler = RandomSampler(train_dataset)
    return sampler

def get_train_dataloader()-> DataLoader:
    """
    Build the training :class:`~torch.utils.data.DataLoader`.

    Reads the notebook-level ``train_dataset`` and ``train_data_collator`` and
    draws examples through the sampler returned by :func:`_get_train_sampler`.
    The loader uses a batch size of 2, drops the last incomplete batch and
    enables pinned memory.

    Returns:
        The configured training ``DataLoader``.

    Raises:
        ValueError: if ``train_dataset`` is ``None``.
    """
    if train_dataset is not None:
        loader_options = dict(
            batch_size=2,
            sampler=_get_train_sampler(),
            collate_fn=train_data_collator,
            drop_last=True,
            pin_memory=True,
        )
        return DataLoader(train_dataset, **loader_options)
    raise ValueError("Trainer: training requires a train_dataset.")

def _get_eval_sampler(eval_dataset) -> Optional[torch.utils.data.sampler.Sampler]:
    return SequentialSampler(eval_dataset)

def get_eval_dataloader(eval_dataset = None) -> DataLoader:
    """
    Build the evaluation :class:`~torch.utils.data.DataLoader`.

    The loader iterates with the sampler returned by :func:`_get_eval_sampler`,
    collates with the notebook-level ``eval_data_collator``, uses a batch size
    of 2, drops the last incomplete batch and enables pinned memory.

    Args:
        eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
            Dataset to evaluate on. When omitted (``None``), the notebook-level
            ``eval_dataset`` variable is used instead, if it has been defined.

    Returns:
        The configured evaluation ``DataLoader``.

    Raises:
        ValueError: if no dataset is passed and no notebook-level
            ``eval_dataset`` is available.
    """
    if eval_dataset is None:
        # The parameter shadows the notebook-level variable of the same name, so
        # the fallback has to be read through the module globals. (The previous
        # ``x if x is not None else x`` fallback was a no-op.)
        eval_dataset = globals().get("eval_dataset")
    if eval_dataset is None:
        raise ValueError("Trainer: evaluation requires an eval_dataset.")
    eval_sampler = _get_eval_sampler(eval_dataset)

    return DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=2,
        collate_fn=eval_data_collator,
        drop_last=True,
        pin_memory=True,
    )

def get_test_dataloader(test_dataset) -> DataLoader:
    """
    Build the test :class:`~torch.utils.data.DataLoader`.

    The loader iterates over ``test_dataset`` with the sampler returned by
    :func:`_get_eval_sampler`, collates with the notebook-level
    ``eval_data_collator``, uses a batch size of 2, drops the last incomplete
    batch and enables pinned memory.

    Args:
        test_dataset (:obj:`torch.utils.data.dataset.Dataset`):
            The test dataset to use.

    Returns:
        The configured test ``DataLoader``.

    Raises:
        ValueError: if ``test_dataset`` is ``None``.
    """
    if test_dataset is None:
        raise ValueError("Trainer: evaluation requires an test_dataset.")
    test_sampler = _get_eval_sampler(test_dataset)

    # Build the loader from the *test* dataset and sampler. The previous code
    # passed ``eval_dataset`` and the undefined name ``eval_sampler`` here.
    return DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=2,
        collate_fn=eval_data_collator,
        drop_last=True,
        pin_memory=True,
    )
# Instantiate the loaders used by the following cells.
train_data_loader = get_train_dataloader()
# The evaluation dataset is passed explicitly.
eval_data_loader = get_eval_dataloader(eval_dataset)
# Sanity check




In [9]:
# Pull the first batch from a fresh iterator over the training loader and print it.
print(next(iter(train_data_loader)))

  ignore_masking = ~torch.tensor(entity_mask, dtype=torch.bool)


{'lang_input_ids': tensor([[ 101, 4003, 3695,  ...,    0,    0,    0],
        [ 101,  103, 1012,  ...,    0,    0,    0],
        [ 101,  103, 1012,  ...,    0,    0,    0],
        [ 101, 4003, 3695,  ...,    0,    0,    0]]), 'kg_input_ids': tensor([[     2,  36009,  36010, 164190, 206041, 343214,  37361, 473384, 185787,
         168771, 606248, 217898, 386745, 593782,    713,    471,     60,  21943,
           3496,   1507,    497,   6228,   2273,   1468,    169,     22, 552906,
              1, 157038, 600807, 592072, 588665, 600411, 328532,      1, 618603,
         493589, 623245,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,  

In [10]:
# Pull the first batch from a fresh iterator over the evaluation loader and print it.
print(next(iter(eval_data_loader)))

{'lang_input_ids': tensor([[  101, 23760, 25808,  ...,     0,     0,     0],
        [  101,  1015,  1012,  ...,     0,     0,     0],
        [  101,  1015,  1012,  ...,     0,     0,     0],
        [  101, 23760, 25808,  ...,     0,     0,     0]]), 'kg_input_ids': tensor([[     2,  22928,  22929,  33268, 161104, 243366, 393860,  55343, 221158,
         541317, 603893, 377981, 300583, 620782, 630173, 357330,  58402, 201614,
           3013,   4828,   2445,   1281, 166579,    130,    439,   1711,  28029,
           3933, 357331, 392415, 447709, 530239, 610485, 622843, 516528, 611962,
         496790, 587127, 203431, 620609, 509149,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
  

In [11]:
# Fetch a single training batch and report the size of every entry in it.
test = next(iter(train_data_loader))
for field_name, field_value in test.items():
    print(field_name, field_value.size())

lang_input_ids torch.Size([4, 512])
kg_input_ids torch.Size([4, 240])
lang_attention_mask torch.Size([4, 512])
kg_attention_mask torch.Size([4, 240, 240])
kg_label_mask torch.Size([4, 240])
kg_label torch.Size([4, 240])
token_type_ids torch.Size([4, 512])
kg_padding_mask torch.Size([4, 240])
lm_label torch.Size([4, 512])
cross_label torch.Size([4])


In [12]:
# Fetch a single evaluation batch and report the size of every entry in it.
test = next(iter(eval_data_loader))
for field_name, field_value in test.items():
    print(field_name, field_value.size())

lang_input_ids torch.Size([4, 512])
kg_input_ids torch.Size([4, 240])
lang_attention_mask torch.Size([4, 512])
kg_attention_mask torch.Size([4, 240, 240])
kg_label_mask torch.Size([4, 240])
kg_label torch.Size([4, 240])
token_type_ids torch.Size([4, 512])
kg_padding_mask torch.Size([4, 240])
lm_label torch.Size([4, 512])
cross_label torch.Size([4])
