In [1]:
import torch
import hydra
import hydra.experimental
from exbert_model import ExBertForMaskedLM, ExBertModel
from model import PretrainModel

In [2]:
hydra.experimental.initialize(config_dir="config")

In [3]:
config = hydra.experimental.compose(config_file="config.yaml")

In [4]:
print(config.pretty())

logging:
  color: true
  iterations_interval: -1
  level: INFO
  log_dir: logs
  seconds_interval: 2
model:
  attention_probs_dropout_prob: 0.0
  embedding_std: 0.1
  gradient_checkpointing: false
  hidden_act: h_swish
  hidden_dropout_prob: 0.0
  hidden_size: 768
  intermediate_size: 3072
  layer_norm_eps: 1.0e-05
  max_position_embeddings: 515
  num_attention_heads: 12
  num_hidden_layers: 12
  pad_token_id: 1
  persistent_mem_size: 64
  vocab_size: 50265
saving:
  iterations_interval: -1
  keep_checkpoint_every_num_seconds: 3600
  num_checkpoints_to_keep: 2
  seconds_interval: 100
training:
  batch_size: 16
  fp16: true
  fp16_opt_level: O1
  gradient_accumulation_steps: 1
  learning_rate: 2.0e-05
  max_gradient_norm: -1.0
  num_gpus_per_node: 1
  optimizer: AdamW
  random_seed: 123
  resume: true
  scheduler: WarmupConstant
  total_num_epochs: 3
  total_num_iterations: 1000
  validation_iterations_interval: -1
  warmup_steps: 30



In [5]:
config.model.pad_token_id

1

In [6]:
model = PretrainModel(config)

In [7]:
config.model

{'max_position_embeddings': 515, 'vocab_size': 50265, 'persistent_mem_size': 64, 'num_attention_heads': 12, 'num_hidden_layers': 12, 'hidden_size': 768, 'intermediate_size': 3072, 'hidden_dropout_prob': 0.0, 'attention_probs_dropout_prob': 0.0, 'gradient_checkpointing': False, 'hidden_act': 'h_swish', 'layer_norm_eps': 1e-05, 'embedding_std': 0.1, 'pad_token_id': 1}

In [8]:
input_ids = torch.ones(2, 100).long()
mask_lm_labels = torch.ones(2, 100).long()

batch = {
    "input_ids": input_ids,
    "labels": mask_lm_labels
}

In [9]:
model(batch)

{'outputs': tensor([[[ 0.3242,  0.9273, -0.4952,  ..., -0.1322,  0.1280, -0.2398],
          [ 0.3176,  0.9501, -0.4526,  ..., -0.0296,  0.1855, -0.1860],
          [ 0.2493,  0.9993, -0.5639,  ..., -0.3046,  0.2726, -0.2184],
          ...,
          [ 0.3618,  0.9772, -0.4293,  ..., -0.1609,  0.0399, -0.2679],
          [ 0.2701,  0.9869, -0.4367,  ..., -0.2182,  0.1598, -0.1205],
          [ 0.3084,  0.8121, -0.5741,  ..., -0.1791,  0.1653, -0.2618]],
 
         [[ 0.3242,  0.9273, -0.4952,  ..., -0.1322,  0.1280, -0.2398],
          [ 0.3176,  0.9501, -0.4526,  ..., -0.0296,  0.1855, -0.1860],
          [ 0.2493,  0.9993, -0.5639,  ..., -0.3046,  0.2726, -0.2184],
          ...,
          [ 0.3618,  0.9772, -0.4293,  ..., -0.1609,  0.0399, -0.2679],
          [ 0.2701,  0.9869, -0.4367,  ..., -0.2182,  0.1598, -0.1205],
          [ 0.3084,  0.8121, -0.5741,  ..., -0.1791,  0.1653, -0.2618]]],
        grad_fn=<UnsafeViewBackward>),
 'loss': tensor(9.9503, grad_fn=<NllLossBackward>)}

In [10]:
from transformers import AutoTokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [12]:
len(tokenizer)

50265

In [14]:
tokenizer.mask_token_id

50264

In [16]:
tokenizer.get_special_tokens_mask()

TypeError: get_special_tokens_mask() missing 1 required positional argument: 'token_ids_0'

In [17]:
2 / 0.00003

66666.66666666667

In [19]:
66666 / 3600 * 16

296.29333333333335

In [20]:
296 / 24

12.333333333333334

In [21]:
2.71**7

1073.4579485248778