**0. Code for Colab Debugging**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My Drive/lxmert/src/
!pip install transformers
import torch
print(torch.cuda.is_available())

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/lxmert/src
True


**1. Import pckgs & Set basic configs**

In [2]:
# Base packages
import logging
import math
import os
from dataclasses import dataclass, field
from glob import glob
from typing import Optional
from torch.utils.data import ConcatDataset

# Own implementation
from utils.parameters import parser
from utils.dataset import get_dataset
from utils.data_collator import NodeMasking_DataCollator, NodeClassification_DataCollator, LiteralRegression_DataCollator
from model import LxmertForPreTraining,LxmertForKGTokPredAndMaskedLM

# From Huggingface transformers package
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    LxmertConfig,
    LxmertTokenizer,
    PreTrainedTokenizer,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    set_seed,
)

train_args = TrainingArguments(output_dir='test',
    do_train=True,
    do_eval=False,
    local_rank=-1,
    per_device_train_batch_size=4,    
    learning_rate=1e-3,
    num_train_epochs=1)
import easydict 
PATH = '/content/gdrive/My Drive/lxmert/'
args = easydict.EasyDict({
    "model_type":"lxmert",
    "model_name_or_path":None,
    "cache_dir":None,
    "config_name":PATH+"config/config.json",
    "tokenizer_name":"bert-base-uncased",
    "train_data_file":PATH+"data/masked_literal_prediction/train",
    "train_data_files":None,
    "eval_data_file":PATH+"data/masked_literal_prediction/valid",
    "output_dir":PATH+"pretrained_models/test",
    "mlm":True,
    "mlm_probability":0.15,
    "block_size":512,
})

logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

#args, args, args = parser.parse_args_into_dataclasses()

if args.eval_data_file is None and args.do_eval:
    raise ValueError(
        "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
        "or remove the --do_eval argument."
    )
if (
    os.path.exists(args.output_dir)
    and os.listdir(args.output_dir)
    and args.do_train
    and not args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
    )

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%m/%d %H:%M",
    level=logging.INFO if train_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    train_args.local_rank,
    train_args.device,
    train_args.n_gpu,
    bool(train_args.local_rank != -1),
    train_args.fp16,
)
logger.info("Training/evaluation parameters %s", args)

# Set seed
set_seed(train_args.seed)

11/02 02:43 - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: False
11/02 02:43 - Training/evaluation parameters {'model_type': 'lxmert', 'model_name_or_path': None, 'cache_dir': None, 'config_name': '/content/gdrive/My Drive/lxmert/config/config.json', 'tokenizer_name': 'bert-base-uncased', 'train_data_file': '/content/gdrive/My Drive/lxmert/data/masked_literal_prediction/train', 'train_data_files': None, 'eval_data_file': '/content/gdrive/My Drive/lxmert/data/masked_literal_prediction/valid', 'output_dir': '/content/gdrive/My Drive/lxmert/pretrained_models/test', 'mlm': True, 'mlm_probability': 0.15, 'block_size': 512}


**2. Load model configuration**

In [3]:
if args.config_name:
    config = LxmertConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
elif args.model_name_or_path:
    config = LxmertConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
else:
    config = CONFIG_MAPPING[args.model_type]()
    logger.warning("You are instantiating a new config instance from scratch.")

**3. Define tokenizer (or load pretrained one)**

In [4]:
if args.tokenizer_name:
    tokenizer = LxmertTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
elif args.model_name_or_path:
    tokenizer = LxmertTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
        "and load it from here, using --tokenizer_name"
    )

**4. Define model (or load pretrained one)**

In [5]:
if args.model_name_or_path:
    model = LxmertForKGTokPredAndMaskedLM.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir,
    )
else:
    logger.info("Training new model from scratch")
    model = LxmertForKGTokPredAndMaskedLM(config)
if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
    raise ValueError(
        "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
        "--mlm flag (masked language modeling)."
    )

11/02 02:43 - Training new model from scratch


**5. Build dataset & data loader**

In [6]:
if args.block_size <= 0:
    args.block_size = tokenizer.max_len
    # Our input block size will be the max possible for the model
else:
    args.block_size = min(args.block_size, tokenizer.max_len)

# Get datasets

train_dataset = (
    get_dataset(args, tokenizer=tokenizer,kg_pad=config.kg_special_token_ids["PAD"]) if train_args.do_train else None
)
eval_dataset = (
    get_dataset(args, tokenizer=tokenizer, kg_pad=config.kg_special_token_ids["PAD"], evaluate=True)
    if train_args.do_eval
    else None
)
data_collator = NodeClassification_DataCollator(tokenizer=tokenizer, kg_special_token_ids=config.kg_special_token_ids, kg_size = config.vocab_size['kg'])

11/02 02:43 - Loading features from dataset file at /content/gdrive/My Drive/lxmert/data/masked_literal_prediction/train


**6. Initialize trainer & Run training**
> Use Huggingface [trainer.py](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py)





In [7]:
# Initialize our Trainer
print(train_args)
print(data_collator)
print(train_dataset)
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    prediction_loss_only=True
)

# Training
if train_args.do_train:
    model_path = (
        args.model_name_or_path
        if args.model_name_or_path is not None and os.path.isdir(args.model_name_or_path)
        else None
    )
    trainer.train(model_path=model_path)
    trainer.save_model()
    # For convenience, we also re-save the tokenizer to the same directory,
    # so that you can share your model easily on huggingface.co/models =)
    if trainer.is_world_master():
        tokenizer.save_pretrained(args.output_dir)

TrainingArguments(output_dir='test', overwrite_output_dir=False, do_train=True, do_eval=False, do_predict=False, evaluate_during_training=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1, max_steps=-1, warmup_steps=0, logging_dir='runs/Nov02_02-43-37_71126e65e1fb', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name='test', disable_tqdm=False, remove_unused_columns=True, label_names=None, l

  probability_matrix.masked_fill_(torch.tensor(entity_mask, dtype=torch.bool), value=0.0)
  label_mask.masked_fill_(torch.tensor(entity_mask, dtype=torch.bool), value=False)


Step,Training Loss


KeyboardInterrupt: ignored