In [24]:
import torch
from utils import RobertaConfig
from model import RobertaForMaskedLM
from safetensors.torch import load_file
from transformers import RobertaTokenizerFast 

### Model Configuration ###
# Default hyperparameters from the project's RobertaConfig; printed so the
# settings used for this run are visible in the notebook output.
config = RobertaConfig()
print(config)

### Tokenizer ###
# Loaded by the Hugging Face model name stored on the config (config.hf_model_name).
tokenizer = RobertaTokenizerFast.from_pretrained(config.hf_model_name)

### Checkpoint Location ###
# safetensors file holding the pretrained weights that are loaded further down.
path_to_weights = "work_dir/RoBERTa_Pretraining/checkpoint_75000/model.safetensors"


RobertaConfig(vocab_size=50265, start_token=0, end_token=2, pad_token=2, mask_token=50264, embedding_dimension=768, num_transformer_blocks=12, num_attention_heads=12, mlp_ratio=4, layer_norm_eps=1e-06, hidden_dropout_p=0.1, attention_dropout_p=0.1, context_length=512, masking_prob=0.15, hf_model_name='FacebookAI/roberta-base', pretrained_backbone='pretrained', path_to_pretrained_weights=None)


### Check Masked Language Model Task ###

Again, this model was only trained for about 75K steps, so it won't be nearly as good as the RoBERTa published by Facebook, but hopefully it should still be able to produce something meaningful!

In [37]:
### Build Model and Restore Checkpoint ###
# Read the saved tensors from the safetensors checkpoint first ...
model_weights = load_file(path_to_weights)

# ... then instantiate the architecture from the config and copy the weights in.
model = RobertaForMaskedLM(config)
model.load_state_dict(model_weights)

# Switch to evaluation mode; as the cell's last expression this also
# displays the module structure.
model.eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=2)
      (position_embeddings): Embedding(512, 768)
      (layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layers): ModuleList(
        (0-11): 12 x RobertaEncoderLayer(
          (attention): RobertaAttention(
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (feed_forward): RobertaFeedForward(
            (intermediate_dense): Linear(in_feat

In [87]:
### Inference Model ###
# Number of candidate tokens to report for the masked position.
top_k = 3

sample_sentence = "Lets play a <mask> of chess"
tokenized = torch.tensor(tokenizer(sample_sentence)["input_ids"])

# Locate the mask token. Exactly one is required, so fail with a clear message
# instead of the opaque error .item() raises on an empty or multi-element tensor.
mask_positions = (tokenized == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
if mask_positions.numel() != 1:
    raise ValueError(
        f"Expected exactly one {tokenizer.mask_token} token in the input, "
        f"found {mask_positions.numel()}"
    )
mask_token_idx = mask_positions.item()

with torch.inference_mode():
    # The model call is unpacked into two values; only the second is used below.
    # Presumably it holds per-position scores over the vocabulary -- confirm in model.py.
    hidden_states, logits = model(tokenized.unsqueeze(0))

# Highest-scoring vocabulary ids at the masked position.
values, idx = logits.squeeze()[mask_token_idx].topk(top_k)

# Decode each id on its own. Decoding all ids into one string and splitting on
# whitespace would fuse a candidate that has no leading space (a subword
# continuation) with its neighbour and silently drop or corrupt predictions.
preds = [tokenizer.decode([token_id]).strip() for token_id in idx.tolist()]

print("Predictions:")
preds


Predictions:


['game', 'world', 'king']