In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
import torch

# Read the model configuration from the current working directory (".").
# trust_remote_code=True allows transformers to import and run the custom
# Python classes shipped alongside the checkpoint, so only use it on
# directories you trust.
cfg = AutoConfig.from_pretrained(
    ".",
    trust_remote_code=True,
)
print("Config:", cfg)

  from .autonotebook import tqdm as notebook_tqdm


Config: ModelConfig {
  "architectures": [
    "BertModel",
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "model_configuration.ModelConfig",
    "AutoModel": "modeling.BertModel",
    "AutoModelForMaskedLM": "modeling.BertForMaskedLM"
  },
  "gradient_checkpointing": false,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "intermediate_size": 1280,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 512,
  "model_type": "my-bert",
  "num_attention_heads": 6,
  "num_hidden_layers": 12,
  "position_bucket_size": 32,
  "transformers_version": "4.50.0",
  "vocab_size": 8192
}



In [3]:
# Load the tokenizer stored next to the model in the current directory.
tok = AutoTokenizer.from_pretrained(
    ".",
    trust_remote_code=True,
)

# Sanity summary: full tokenizer repr, vocabulary size, and the mask token
# together with its ID (the ID is needed later to find the masked position).
print("Tokenizer:", tok)
print("Vocab size:", tok.vocab_size)
print("Mask token:", tok.mask_token, "→ ID", tok.mask_token_id)

Tokenizer: PreTrainedTokenizerFast(name_or_path='.', vocab_size=8192, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<special_0>', '<special_1>', '<special_2>', '<special_3>', '<special_4>', '<special_5>', '<special_6>', '<special_7>', '<special_8>', '<special_9>', '<special_10>', '<special_11>', '<special_12>', '<special_13>', '<special_14>', '<special_15>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, norma

In [19]:
# Instantiate the masked-LM model from the checkpoint in the current directory.
model = AutoModelForMaskedLM.from_pretrained(
    ".",
    trust_remote_code=True,
)
# Switch to evaluation mode (e.g. dropout layers become no-ops). Kept as the
# cell's last expression so the notebook displays the module tree.
model.eval()

BertForMaskedLM(
  (embedding): Embedding(
    (word_embedding): Embedding(8192, 384)
    (word_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=False)
    (dropout): Dropout(p=0.1, inplace=False)
    (relative_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): Encoder(
    (attention_layers): ModuleList(
      (0-11): 12 x Attention(
        (in_proj_qk): Linear(in_features=384, out_features=768, bias=True)
        (in_proj_vg): Linear(in_features=384, out_features=768, bias=True)
        (out_proj): Linear(in_features=384, out_features=384, bias=True)
        (pre_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=False)
        (post_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (mlp_layers): ModuleList(
      (0-11): 12 x FeedForward(
        (mlp): Sequential(
          (0): LayerNorm((384,), eps=1e-05, elementwise_affine=False)
      

In [48]:
# Single-sentence probe containing exactly one <mask> token.
# NOTE(review): the saved output of this cell shows the tokens for "Ontario",
# not "Ottawa" — the cell was edited after its last run; re-run to refresh.
text = "The capital of <mask> is Ottawa."

# Tokenise into PyTorch tensors; the result is unpacked as keyword arguments
# into the model call below.
inputs = tok(text, return_tensors="pt")
print("Input IDs:", inputs.input_ids)
print("Tokenized:", tok.convert_ids_to_tokens(inputs.input_ids[0]))

# Forward pass only: gradient tracking is not needed for inspection.
with torch.no_grad():
    outputs = model(**inputs)

# Per-position vocabulary scores produced by the masked-LM head.
logits = outputs.logits
print("Logits shape:", logits.shape)

Input IDs: tensor([[   1,  437, 5003,  307,  236,    4,  236,  353,  527,  449,  303, 1393,
           29]])
Tokenized: ['<s>', 'ĠThe', 'Ġcapital', 'Ġof', 'Ġ', '<mask>', 'Ġ', 'Ġis', 'ĠO', 'nt', 'ar', 'io', '.']
Logits shape: torch.Size([1, 13, 8192])


In [49]:
# Find where the mask token sits in the first (and only) sequence of the batch.
mask_token_id = tok.mask_token_id
mask_positions = (inputs["input_ids"][0] == mask_token_id).nonzero(as_tuple=True)[0]

# .item() only works on a one-element tensor. Check the count explicitly so an
# input with no mask (or several) fails with an actionable message instead of
# an opaque tensor-to-scalar conversion error.
if mask_positions.numel() != 1:
    raise ValueError(
        f"Expected exactly one {tok.mask_token!r} token in the input, "
        f"found {mask_positions.numel()}."
    )
mask_index = mask_positions.item()

# Vocabulary scores at the masked position, then the five highest-scoring IDs.
mask_logits = outputs.logits[0, mask_index]
topk = torch.topk(mask_logits, k=5)
top_ids = topk.indices.tolist()
top_scores = topk.values.tolist()

top_tokens = tok.convert_ids_to_tokens(top_ids)

# The scores printed here are raw logits, not probabilities.
print("Top 5 predictions for <mask>:")
for token, score in zip(top_tokens, top_scores):
    print(f"  {token:>10s}  (logit={score:.2f})")

Top 5 predictions for <mask>:
           Ġ  (logit=8.14)
     ĠFrance  (logit=4.00)
           .  (logit=3.69)
           Ģ  (logit=3.64)
           ,  (logit=3.53)
