# High-level API using pipeline

Let us first run a fill-mask task using Hugging Face's high-level `pipeline` API:

In [3]:
from transformers import pipeline

# Build a fill-mask pipeline for the given checkpoint and query it with a
# sentence whose last word is replaced by the [MASK] placeholder. The call's
# return value (a list of candidate fillings with scores) is the cell output.
fill_mask = pipeline('fill-mask', model="bert-base-german-dbmdz-cased")
fill_mask("Bestimmt kennen Sie einige Funktionen moderner [MASK]?")

Some weights of the model checkpoint at bert-base-german-dbmdz-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'sequence': 'Bestimmt kennen Sie einige Funktionen moderner Technik?',
  'score': 0.04415518045425415,
  'token': 5808,
  'token_str': 'Technik'},
 {'sequence': 'Bestimmt kennen Sie einige Funktionen moderner Software?',
  'score': 0.041537947952747345,
  'token': 5177,
  'token_str': 'Software'},
 {'sequence': 'Bestimmt kennen Sie einige Funktionen moderner Medien?',
  'score': 0.02656109817326069,
  'token': 3562,
  'token_str': 'Medien'},
 {'sequence': 'Bestimmt kennen Sie einige Funktionen moderner Art?',
  'score': 0.026073254644870758,
  'token': 1622,
  'token_str': 'Art'},
 {'sequence': 'Bestimmt kennen Sie einige Funktionen moderner Geräte?',
  'score': 0.021198052912950516,
  'token': 7612,
  'token_str': 'Geräte'}]

# More details using lower-level API

Let's now dive into more detail on how this is done under the hood. First, some helper code:

In [4]:
import numpy as np
import torch
import html

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM
from IPython.display import display, HTML


class Demo:
    """Shared base for the language-model demos in this notebook.

    Subclasses are expected to store a tokenizer in ``self._tokenizer`` and a
    model in ``self._model`` and to override :meth:`suggest_token_ids`.
    """

    @property
    def model(self):
        """The model object stored in ``self._model``."""
        return self._model

    @property
    def tokenizer(self):
        """The tokenizer object stored in ``self._tokenizer``."""
        return self._tokenizer

    def suggest_token_ids(self, inputs, masked_index, k=10):
        """Return ``k`` candidate token ids for position ``masked_index``.

        This base implementation only documents the contract that
        :meth:`print_variants` relies on; concrete demos must override it.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            "subclasses of Demo must implement suggest_token_ids()")

    def print_variants(self, inputs, masked_index, k=10):
        """Display the input once per suggestion, with one token replaced.

        Args:
            inputs: mapping with an ``"input_ids"`` entry; only its first
                sequence (``inputs["input_ids"][0]``) is rendered.
            masked_index: position of the token to be replaced.
            k: number of suggestions requested from ``suggest_token_ids``.

        Each variant is shown as HTML, with the replacement token in red.
        All decoded text is HTML-escaped before being embedded in the markup.
        """
        token_ids = inputs["input_ids"][0]
        candidates = self.suggest_token_ids(inputs, masked_index, k)
        decode = self._tokenizer.decode

        # The text before and after the replaced token is the same for every
        # candidate, so decode and escape it once instead of once per variant.
        prefix = html.escape(decode(token_ids[:masked_index]))
        suffix = html.escape(decode(token_ids[masked_index + 1:]))

        for candidate in candidates:
            replacement = html.escape(decode([candidate]))
            display(HTML("".join([
                prefix,
                ' <span style="color:red;">',
                replacement,
                '</span> ',
                suffix])))
    
    
class MaskedLMDemo(Demo):
    """Demo backed by a masked language model and its tokenizer."""

    def __init__(self, model_name):
        """Load tokenizer and masked-LM weights for ``model_name``.

        The model is switched to evaluation mode right after loading.
        """
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModelForMaskedLM.from_pretrained(model_name)
        self._model.eval()

    def suggest_token_ids(self, inputs, masked_index, k=10):
        """Return the ids of the ``k`` most probable tokens at ``masked_index``.

        The token at ``masked_index`` is replaced by the tokenizer's mask token
        in a copy of the input ids; ``inputs`` itself is left unmodified. Only
        a batch holding exactly one sequence is accepted.
        """
        original_ids = inputs["input_ids"]
        assert original_ids.dim() == 2
        assert original_ids.size(0) == 1

        # Mask the requested position on a private copy of the ids.
        ids_with_mask = original_ids.detach().clone()
        ids_with_mask[0, masked_index] = self._tokenizer.mask_token_id

        # Same model arguments as the caller supplied, except for the ids.
        model_kwargs = inputs.copy()
        model_kwargs["input_ids"] = ids_with_mask

        with torch.no_grad():
            model_output = self._model(**model_kwargs)

        position_logits = model_output.logits[0][masked_index]
        probabilities = torch.softmax(position_logits, dim=-1)
        return torch.topk(probabilities, k).indices
    
    
class CausalLMDemo(Demo):
    """Demo backed by a causal (left-to-right) language model and its tokenizer."""

    def __init__(self, model_name):
        """Load tokenizer and causal-LM weights for ``model_name``.

        ``is_decoder=True`` is forwarded to ``from_pretrained``; the model is
        switched to evaluation mode right after loading.
        """
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModelForCausalLM.from_pretrained(
            model_name, is_decoder=True)
        self._model.eval()

    def suggest_token_ids(self, inputs, masked_index, k=10):
        """Return the ids of the ``k`` most probable tokens at ``masked_index``.

        The suggestions are conditioned on the tokens to the left of
        ``masked_index`` only.

        Args:
            inputs: tokenizer output holding exactly one sequence.
            masked_index: position of the token to find alternatives for;
                negative values count from the end of the sequence.
            k: number of candidates to return.

        Raises:
            IndexError: if ``masked_index`` lies outside the sequence.
            ValueError: if ``masked_index`` refers to the first position, for
                which there is no left context to predict from.
        """
        token_ids = inputs["input_ids"]
        assert len(token_ids.shape) == 2
        assert token_ids.shape[0] == 1

        seq_len = token_ids.shape[1]
        if not -seq_len <= masked_index < seq_len:
            raise IndexError(
                "masked_index %d is out of range for a sequence of %d tokens"
                % (masked_index, seq_len))
        position = masked_index % seq_len
        if position == 0:
            raise ValueError(
                "a causal LM cannot suggest the first token: "
                "there is no left context")

        # No labels are passed: they would only make the model compute a loss
        # that is never used here and do not influence the logits.
        with torch.no_grad():
            outputs = self._model(**inputs)

        # In a causal LM the logits produced at step t score the token at
        # step t + 1, so the distribution for `position` is found one step
        # earlier. Reading logits[position] would rank candidates for the
        # *following* token instead.
        predictions = outputs.logits[0][position - 1]
        softmax = torch.nn.functional.softmax(predictions, dim=-1)

        return torch.topk(softmax, k).indices

# Masked Language Modelling using BERT

In [5]:
# here we use the German BERT model from the Bavarian State Library, see
# https://huggingface.co/dbmdz/bert-base-german-cased
# NOTE(review): the id loaded below is "bert-base-german-dbmdz-cased", not the
# id in the link above -- presumably an alias of the same model; confirm.

# you might also want to try:
# * dbmdz/bert-base-german-europeana-cased
# * dbmdz/distilbert-base-german-europeana-cased

# see https://huggingface.co/dbmdz/bert-base-german-europeana-cased and
# https://huggingface.co/dbmdz/distilbert-base-german-europeana-cased

# Loads both the tokenizer and the masked-LM weights (see MaskedLMDemo.__init__).
demo_mask = MaskedLMDemo("bert-base-german-dbmdz-cased")

Some weights of the model checkpoint at bert-base-german-dbmdz-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Example text used by all following cells (lines from Goethe's "Faust").
# NOTE(review): Goethe's text reads "Juristerei und Medizin"; the "and" in the
# literal looks like a typo. It is left as is because the recorded outputs in
# this notebook were produced from exactly this string.
# The newline right after the opening quotes and the one before the closing
# quotes are part of the string.
faust_quote = """
Habe nun, ach! Philosophie,
Juristerei and Medizin,
Und leider auch Theologie
Durchaus studiert, mit heißem Bemühn.
Da steh' ich nun, ich armer Tor,
Und bin so klug als wie zuvor!
"""

Let us first tokenize this input using our BERT model:

In [7]:
# Tokenize the quote; return_tensors="pt" requests PyTorch tensors.
inputs_mask = demo_mask.tokenizer(faust_quote, return_tensors="pt")

# Bare expression: displayed as the cell's output.
inputs_mask

{'input_ids': tensor([[  102, 15454,  1269,   818, 23070,  3330,  9186,   818, 19228,  5736,
          1257,  6070,   818,   700,  8433,   313, 12603,  1209,   322, 17722,
           818,   212, 15996, 30895,  5614,  6018, 30882,   566,   867,  4729,
         30889,  2119,   383,  1269,   818,   383, 28983, 30884,  2270,   818,
           700,  1089,   262, 29107,   276,   335,  3489,  3330,   103]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

To actually understand the tokenization, let's convert the token ids into token strings:

In [8]:
# [0] picks the single sequence of the batch; the resulting list is aligned
# position by position with inputs_mask["input_ids"][0].
tokens_mask = demo_mask.tokenizer.convert_ids_to_tokens(
    inputs_mask["input_ids"][0])
tokens_mask

['[CLS]',
 'Habe',
 'nun',
 ',',
 'ach',
 '!',
 'Philosophie',
 ',',
 'Jurist',
 '##erei',
 'and',
 'Medizin',
 ',',
 'Und',
 'leider',
 'auch',
 'Theologie',
 'Durch',
 '##aus',
 'studiert',
 ',',
 'mit',
 'heiße',
 '##m',
 'Bem',
 '##üh',
 '##n',
 '.',
 'Da',
 'ste',
 '##h',
 "'",
 'ich',
 'nun',
 ',',
 'ich',
 'arme',
 '##r',
 'Tor',
 ',',
 'Und',
 'bin',
 'so',
 'klug',
 'als',
 'wie',
 'zuvor',
 '!',
 '[SEP]']

We can convert the sequence of token ids back into a string:

In [9]:
# Decode the full id sequence of the (only) input back into one string.
demo_mask.tokenizer.decode(inputs_mask["input_ids"][0])

"[CLS] Habe nun, ach! Philosophie, Juristerei and Medizin, Und leider auch Theologie Durchaus studiert, mit heißem Bemühn. Da steh'ich nun, ich armer Tor, Und bin so klug als wie zuvor! [SEP]"

Let's now get suggestions for replacing a token. The following call gives us the ids of the tokens that, according to BERT, would make sense at a given position:

In [10]:
# list.index() yields the position of the first "leider" token; k is left at
# its default of 10, so ten candidate ids are returned.
demo_mask.suggest_token_ids(inputs_mask, tokens_mask.index("leider"))

tensor([ 3392,  1269,   704,   742,  2913,  1184,  1202,   494, 30170,  1619])

It is easy to convert these ids into token strings:

In [11]:
# Same query as in the previous cell, with the returned ids mapped to their
# token strings.
demo_mask.tokenizer.convert_ids_to_tokens(
    demo_mask.suggest_token_ids(inputs_mask, tokens_mask.index("leider")))

['natürlich',
 'nun',
 'habe',
 'dann',
 'schließlich',
 'jetzt',
 'eben',
 'aber',
 'nebenbei',
 'ja']

Now let's use a utility function to replace a token with some other candidates the model suggests and print the result as HTML:

In [12]:
# Render the quote five times, each time with "leider" replaced by one of the
# model's five suggestions (replacement shown in red).
demo_mask.print_variants(
    inputs_mask, tokens_mask.index("leider"), k=5)

In [13]:
# Five variants with the token "studiert" replaced.
demo_mask.print_variants(
    inputs_mask, tokens_mask.index("studiert"), k=5)

In [14]:
# Five variants with the token "Tor" replaced.
demo_mask.print_variants(
    inputs_mask, tokens_mask.index("Tor"), k=5)

# Causal LM using GottBERT (a RoBERTa model)

In [15]:
# Loads the tokenizer and the causal-LM variant of the checkpoint
# (see CausalLMDemo.__init__, which passes is_decoder=True).
demo_causal = CausalLMDemo("uklfr/gottbert-base")

In [16]:
# Tokenize the same quote with the second model's own tokenizer, again as
# PyTorch tensors.
inputs_c = demo_causal.tokenizer(faust_quote, return_tensors="pt")

# Bare expression: displayed as the cell's output.
inputs_c

{'input_ids': tensor([[    0, 51963,  8030,   196,     5, 15398,    73,  5846,     5, 51963,
           800,   578,  5433,   671,  1138,  2880,     5, 51963,   904,   747,
            25, 14737, 51963,  1925,   690, 11763,     5,    12, 33640, 14133,
          1376,  3506,     4, 51963,  1255, 28538,   673,    32,   196,     5,
            32,  4875,  1144,  1594,     5, 51963,   904,   209,    55, 24722,
            37,    44,  1756,    73, 51963,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

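Let us note how the tokenization of this model differs from that of the first one we used. GottBERT uses a byte-level BPE vocabulary: in the token strings below, `Ġ` marks a preceding space, `Ċ` stands for a newline, and non-ASCII characters are shown as their UTF-8 bytes (e.g. `ÃŁ` for `ß`, `Ã¼` for `ü`) — this is the expected representation, not an encoding error: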

In [17]:
# Token strings for the single input sequence, aligned position by position
# with inputs_c["input_ids"][0].
tokens_c = demo_causal.tokenizer.convert_ids_to_tokens(
    inputs_c["input_ids"][0])
tokens_c

['<s>',
 'Ċ',
 'Habe',
 'Ġnun',
 ',',
 'Ġach',
 '!',
 'ĠPhilosophie',
 ',',
 'Ċ',
 'J',
 'ur',
 'ister',
 'ei',
 'Ġand',
 'ĠMedizin',
 ',',
 'Ċ',
 'Und',
 'Ġleider',
 'Ġauch',
 'ĠTheologie',
 'Ċ',
 'Durch',
 'aus',
 'Ġstudiert',
 ',',
 'Ġmit',
 'ĠheiÃŁem',
 'ĠBem',
 'Ã¼',
 'hn',
 '.',
 'Ċ',
 'Da',
 'Ġsteh',
 "'",
 'Ġich',
 'Ġnun',
 ',',
 'Ġich',
 'Ġar',
 'mer',
 'ĠTor',
 ',',
 'Ċ',
 'Und',
 'Ġbin',
 'Ġso',
 'Ġklug',
 'Ġals',
 'Ġwie',
 'Ġzuvor',
 '!',
 'Ċ',
 '</s>']

In [18]:
# The lookup string must match the entry in tokens_c exactly, including its
# "Ġ" prefix.
demo_causal.print_variants(inputs_c, tokens_c.index("Ġleider"), k=5)

In [19]:
# Ten variants for the token listed as "ĠTor" in tokens_c.
demo_causal.print_variants(inputs_c, tokens_c.index("ĠTor"), k=10)