In [1]:
#%pip install transformers  # %pip installs into the environment of the running kernel

In [2]:
#!pip install pipeline  # not needed: pipeline is imported from the transformers package

In [4]:
from transformers import pipeline

In [6]:
# Build the fill-mask pipeline. The checkpoint is named explicitly instead of
# relying on the task default, so the notebook keeps using the same model even
# if the library's default for "fill-mask" changes.
nlp = pipeline("fill-mask", model="distilroberta-base")

No model was supplied, defaulted to distilroberta-base (https://huggingface.co/distilroberta-base)
All model checkpoint layers were used when initializing TFRobertaForMaskedLM.

All the layers of TFRobertaForMaskedLM were initialized from the model checkpoint at distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.


In [7]:
# Mask token: the placeholder string used by the pipeline's tokenizer
nlp.tokenizer.mask_token

'<mask>'

In [8]:
# Show the prompt text with the tokenizer's mask token substituted in.
'Pizza is my {}'.format(nlp.tokenizer.mask_token)

'Pizza is my <mask>'

# 1. 파이프라인에 문장을 입력

In [9]:
# Run the pipeline on a prompt whose final word is masked; the cell displays
# the candidate fills returned by the pipeline.
mask = nlp.tokenizer.mask_token
nlp('Pizza is my ' + mask)

[{'score': 0.07068394124507904,
  'token': 2674,
  'token_str': ' favorite',
  'sequence': 'Pizza is my favorite'},
 {'score': 0.03550918772816658,
  'token': 4592,
  'token_str': ' lunch',
  'sequence': 'Pizza is my lunch'},
 {'score': 0.03522925823926926,
  'token': 8492,
  'token_str': ' cake',
  'sequence': 'Pizza is my cake'},
 {'score': 0.02607620321214199,
  'token': 9366,
  'token_str': ' pizza',
  'sequence': 'Pizza is my pizza'},
 {'score': 0.025679104030132294,
  'token': 17927,
  'token_str': ' dessert',
  'sequence': 'Pizza is my dessert'}]

In [21]:
# Run the pipeline on a prompt containing two masked positions.
# NOTE(review): the saved output of this cell shows sequences ending in "food."
# (with a trailing period) that this prompt does not contain, so the output
# looks stale -- re-run the cell to refresh it.
mask = nlp.tokenizer.mask_token
nlp('Pizza is my {0} my {0} food'.format(mask))

[[{'score': 0.148834690451622,
   'token': 2674,
   'token_str': ' favorite',
   'sequence': '<s>Pizza is my favorite my<mask> food.</s>'},
  {'score': 0.06916295737028122,
   'token': 8,
   'token_str': ' and',
   'sequence': '<s>Pizza is my and my<mask> food.</s>'},
  {'score': 0.036127083003520966,
   'token': 1441,
   'token_str': ' friend',
   'sequence': '<s>Pizza is my friend my<mask> food.</s>'},
  {'score': 0.01511524710804224,
   'token': 3795,
   'token_str': ' mom',
   'sequence': '<s>Pizza is my mom my<mask> food.</s>'},
  {'score': 0.014074320904910564,
   'token': 5548,
   'token_str': ' favourite',
   'sequence': '<s>Pizza is my favourite my<mask> food.</s>'}],
 [{'score': 0.7035160660743713,
   'token': 2674,
   'token_str': ' favorite',
   'sequence': '<s>Pizza is my<mask> my favorite food.</s>'},
  {'score': 0.12073897570371628,
   'token': 5548,
   'token_str': ' favourite',
   'sequence': '<s>Pizza is my<mask> my favourite food.</s>'},
  {'score': 0.078287445008754

In [22]:
from transformers import TFAutoModelForMaskedLM, AutoTokenizer

In [23]:
# Load the tokenizer and the TensorFlow masked-LM model from the same checkpoint.
checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(checkpoint)

All model checkpoint layers were used when initializing TFRobertaForMaskedLM.

All the layers of TFRobertaForMaskedLM were initialized from the model checkpoint at distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.


In [24]:
# Invert the tokenizer's token -> id vocabulary so that ids can be mapped back
# to their token strings.
vocab = tokenizer.get_vocab()
id2word = dict(zip(vocab.values(), vocab.keys()))

In [32]:
# Example sentence with one masked word, built from the tokenizer's own mask token.
sequence = "Pizza is my {} food.".format(tokenizer.mask_token)
sequence

'Pizza is my <mask> food.'

In [33]:
# Encode the sentence into token ids; return_tensors="tf" asks for a TensorFlow
# tensor (the displayed result has shape (1, 9): one sequence of nine ids).
input_ids = tokenizer.encode(sequence, return_tensors="tf")
input_ids

<tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[    0,   510, 35280,    16,   127, 50264,   689,     4,     2]])>

In [34]:
# Vocabulary id of the mask token; this is the value looked for inside input_ids.
tokenizer.mask_token_id

50264

In [35]:
# Element-wise comparison over the first (only) sequence: True where the id is the mask token.
input_ids[0] == tokenizer.mask_token_id

<tf.Tensor: shape=(9,), dtype=bool, numpy=array([False, False, False, False, False,  True, False, False, False])>

In [36]:
import tensorflow as tf

In [37]:
# Turn the boolean mask into the coordinates of its True entries.
is_mask = input_ids[0] == tokenizer.mask_token_id
tf.where(is_mask)

<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[5]], dtype=int64)>

In [38]:
# Collect the positions of all mask tokens as a plain Python list.
# tf.where returns one row per match with a single coordinate column, so the
# column is selected with [:, 0]. Indexing with [0] would instead take only the
# first match's row: correct by coincidence for one mask, but it drops further
# masks and raises when there is no mask at all.
mask_token_indices = tf.where(input_ids[0] == tokenizer.mask_token_id)[:, 0].numpy().tolist()
mask_token_indices

[5]

In [39]:
# Run the masked-LM model on the encoded sentence.
result = model(input_ids)

In [40]:
# The first element of the model output holds the prediction scores; the
# displayed tensor has shape (1, 9, 50265), matching the 9 encoded token ids.
logits = result[0]
logits

<tf.Tensor: shape=(1, 9, 50265), dtype=float32, numpy=
array([[[35.35766   , -3.8330357 , 18.556173  , ...,  3.12123   ,
          5.922853  , 12.715132  ],
        [ 1.9485795 , -5.0265136 , 14.836345  , ..., -1.1891124 ,
          0.7311702 ,  0.7832576 ],
        [ 2.0384846 , -4.953379  ,  5.4868183 , ..., -3.4435925 ,
         -0.87940323, -0.41273394],
        ...,
        [-3.3596616 , -6.239405  ,  5.7364025 , ..., -1.9424853 ,
         -1.668887  , -2.7464814 ],
        [18.7403    , -5.234599  , 19.331993  , ...,  0.33183146,
          2.8348486 ,  4.3758855 ],
        [11.215204  , -5.8257084 , 28.299002  , ..., -0.9804349 ,
         -3.4882178 ,  5.4295077 ]]], dtype=float32)>

In [41]:
# Position of the first (here: only) mask token within the encoded sequence.
# The name `i` is read by the cell that slices the logits, so it must still
# hold this position when that cell runs.
i = mask_token_indices[0]
i

5

In [42]:
# Inspect the shape of the scores tensor before slicing it.
logits.shape

TensorShape([1, 9, 50265])

In [43]:
# Pick out the score vector predicted at the mask position of the first sequence.
mask_token_logits = logits[0][i]
mask_token_logits.shape

TensorShape([50265])

In [44]:
# Display the raw scores at the mask position (one value per vocabulary entry).
mask_token_logits

<tf.Tensor: shape=(50265,), dtype=float32, numpy=
array([-3.1226814 , -4.801013  ,  2.9825027 , ..., -4.36097   ,
       -4.8979836 , -0.07158422], dtype=float32)>

In [45]:
# Take the ten largest scores together with the vocabulary ids they belong to.
num_candidates = 10
top = tf.math.top_k(mask_token_logits, k=num_candidates)
top

TopKV2(values=<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([17.85983 , 16.52289 , 15.949996, 14.083212, 13.690206, 13.487871,
       13.002301, 12.865288, 12.575063, 12.507711], dtype=float32)>, indices=<tf.Tensor: shape=(10,), dtype=int32, numpy=
array([ 2674,  5863,  5548,  6543,  3366,  6813,  5440, 17771,  7080,
        7476])>)

In [46]:
# Print the vocabulary entry for each of the top-scoring token ids, best first.
# The loop variable is deliberately not called `i`: `i` already holds the mask
# position used to slice the logits, and a top-level loop variable would
# overwrite it in the notebook namespace (re-running the slicing cell afterwards
# would then index the logits with a token id).
# Entries are printed in raw vocabulary form; the leading "Ġ" appears to stand
# for a leading space (token 2674 is shown as ' favorite' by the pipeline).
for token_id in top.indices.numpy().tolist():
    print(id2word[token_id])

Ġfavorite
Ġcomfort
Ġfavourite
Ġsignature
Ġdream
Ġpreferred
Ġpassion
Ġstaple
Ġbreakfast
Ġeveryday


In [47]:
# Second example: the same sentence with two masked words.
sequence = "Pizza {0} my {0} food.".format(tokenizer.mask_token)
sequence

'Pizza <mask> my <mask> food.'

In [48]:
# Encode the two-mask sentence; this replaces the input_ids of the earlier single-mask example.
input_ids = tokenizer.encode(sequence, return_tensors='tf')

In [56]:
# Locate every mask token in the sequence; the result has one row per mask.
is_mask = input_ids[0] == tokenizer.mask_token_id
mask_token_indices = tf.where(is_mask)
mask_token_indices

<tf.Tensor: shape=(2, 1), dtype=int64, numpy=
array([[3],
       [5]], dtype=int64)>

In [57]:
# Flatten the (num_masks, 1) coordinate tensor into a plain list of positions.
# tf.reshape(..., [-1]) is used instead of tf.squeeze because squeeze without an
# axis would also remove the first dimension when there is exactly one mask,
# giving a bare int that the loop over mask positions could not iterate.
# Reshaping also makes the cell safe to re-run on the already flattened list.
mask_token_indices = tf.reshape(mask_token_indices, [-1]).numpy().tolist()
mask_token_indices

[3, 5]

In [50]:
# Run the model on the two-mask sentence and keep its prediction scores
# (first element of the model output), replacing the earlier result/logits.
result = model(input_ids)
logits = result[0]

In [58]:
# For every masked position, print a header with the position followed by the
# ten highest-scoring vocabulary entries, best first.
# The two loops use distinct variable names: the original reused `i` for both,
# so the inner loop overwrote the mask position with token ids while the outer
# iteration was still in progress.
for mask_index in mask_token_indices:
    print(f"=== {mask_index} ===")
    mask_token_logits = logits[0, mask_index, :]
    top = tf.math.top_k(mask_token_logits, k=10)
    for token_id in top.indices.numpy().tolist():
        print(id2word[token_id])

=== 3 ===
Ġis
Ġwas
Ġas
Ġbecomes
Ġequals
Ġfor
Ġdelivers
Ġmakes
Ġand
Ġwith
=== 5 ===
Ġfavorite
Ġcomfort
Ġfavourite
Ġjunk
Ġbreakfast
Ġlunch
Ġown
Ġsnack
Ġfried
Ġpreferred
