# 以Transformers套件實作填漏字(Masked Language Modeling)功能

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nlp = pipeline("fill-mask")

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
(…)tilroberta-base/resolve/main/config.json: 100%|████████████████████████████████| 480/480 [00:00<00:00, 93.7kB/s]
model.safetensors: 100%|████████████████████████████████████████████████████████| 331M/331M [00:05<00:00, 56.1MB/s]
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (init

In [3]:
from pprint import pprint
pprint(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} " + \
           "that the community uses to solve NLP tasks."))

[{'score': 0.179274782538414,
  'sequence': 'HuggingFace is creating a tool that the community uses to solve '
              'NLP tasks.',
  'token': 3944,
  'token_str': ' tool'},
 {'score': 0.11349327117204666,
  'sequence': 'HuggingFace is creating a framework that the community uses to '
              'solve NLP tasks.',
  'token': 7208,
  'token_str': ' framework'},
 {'score': 0.052435003221035004,
  'sequence': 'HuggingFace is creating a library that the community uses to '
              'solve NLP tasks.',
  'token': 5560,
  'token_str': ' library'},
 {'score': 0.03493543714284897,
  'sequence': 'HuggingFace is creating a database that the community uses to '
              'solve NLP tasks.',
  'token': 8503,
  'token_str': ' database'},
 {'score': 0.028602207079529762,
  'sequence': 'HuggingFace is creating a prototype that the community uses to '
              'solve NLP tasks.',
  'token': 17715,
  'token_str': ' prototype'}]


# 結合Tokenizer

In [4]:
# 載入相關套件
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

# 結合分詞器(Tokenizer)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")

(…)cased/resolve/main/tokenizer_config.json: 100%|██████████████████████████████| 29.0/29.0 [00:00<00:00, 14.5kB/s]
(…)bert-base-cased/resolve/main/config.json: 100%|█████████████████████████████████| 465/465 [00:00<00:00, 233kB/s]
(…)ilbert-base-cased/resolve/main/vocab.txt: 100%|██████████████████████████████| 213k/213k [00:00<00:00, 17.1MB/s]
(…)t-base-cased/resolve/main/tokenizer.json: 100%|██████████████████████████████| 436k/436k [00:00<00:00, 15.3MB/s]
model.safetensors: 100%|████████████████████████████████████████████████████████| 263M/263M [00:04<00:00, 58.9MB/s]


# 推測答案

In [5]:
tokenizer.mask_token

'[MASK]'

In [6]:
sequence = f"Distilled models are smaller than the models they mimic. " + \
    f"Using them instead of the large versions would help {tokenizer.mask_token} " + \
    "our carbon footprint."
inputs = tokenizer(sequence, return_tensors="pt")
print("inputs: ", inputs)

inputs:  {'input_ids': tensor([[  101, 12120,  2050,  8683,  1181,  3584,  1132,  2964,  1190,  1103,
          3584,  1152, 27180,   119,  7993,  1172,  1939,  1104,  1103,  1415,
          3827,  1156,  1494,   103,  1412,  6302,  2555, 10988,   119,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}


In [7]:
tokenizer.mask_token_id

103

In [8]:
torch.where(inputs["input_ids"] == tokenizer.mask_token_id)

(tensor([0]), tensor([23]))

In [10]:
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
token_logits = model(**inputs).logits
token_logits

tensor([[[ -6.6732,  -6.6450,  -6.7923,  ...,  -5.5930,  -5.2783,  -5.6559],
         [ -6.3221,  -5.6379,  -5.8990,  ...,  -4.6864,  -4.1499,  -5.3507],
         [ -5.9863,  -6.0991,  -5.8089,  ...,  -5.2297,  -4.3015,  -6.5971],
         ...,
         [ -7.8892,  -7.6718,  -7.6357,  ...,  -6.9083,  -5.5853,  -6.2459],
         [-14.7710, -14.2714, -14.1642,  ..., -11.4770, -12.1692, -13.1041],
         [-14.3695, -13.9839, -13.6330,  ..., -11.2066, -11.6754, -12.7083]]],
       grad_fn=<ViewBackward0>)

In [13]:
token_logits.shape

torch.Size([1, 30, 28996])

In [12]:
len(token_logits[0])

30

In [14]:
mask_token_logits = token_logits[0, mask_token_index, :]
mask_token_logits

tensor([[-5.5502, -5.6790, -5.3256,  ..., -5.4807, -4.5107, -4.2441]],
       grad_fn=<IndexBackward0>)

In [15]:
mask_token_logits.shape

torch.Size([1, 28996])

In [None]:
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))