In [1]:
! pip install transformers
! pip install torch



In [16]:
from transformers import BertModel, AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.special import softmax
import numpy as np

In [4]:
# Checkpoint name passed to `from_pretrained` for both the model and the tokenizer.
model_name = "bert-base-cased"

In [5]:
# Load the masked-language-modelling model and its tokenizer from the same
# checkpoint so that token ids and vocabulary agree.
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architect

In [7]:
# The tokenizer's mask placeholder string; it is interpolated into the prompts
# to mark the position the model should fill in.
mask = tokenizer.mask_token
mask  # display the placeholder

'[MASK]'

In [14]:
# Build a fill-in-the-blank sentence and inspect how the tokenizer splits it.
sentence = f"I want to {mask} pizza for tonight."
tokens = tokenizer.tokenize(sentence)
tokens

['I', 'want', 'to', '[MASK]', 'pizza', 'for', 'tonight', '.']

In [10]:
# Encode the sentence as PyTorch tensors ("pt") and run one forward pass.
encoded_inputs = tokenizer(text=sentence, return_tensors="pt")
output = model(**encoded_inputs)

In [20]:
# Detach from autograd, convert to NumPy and keep the first entry of the batch.
logits = output.logits.detach().numpy()[0]
# `tokens` (displayed above) contains no special tokens, so the +1 presumably
# skips the special token the tokenizer prepends when encoding ([CLS] for BERT)
# — TODO confirm if the checkpoint is changed.
mask_logits = logits[tokens.index(mask)+1]
mask_logits.size  # number of scores at the masked position (presumably one per vocabulary entry)

28996

In [40]:
# Turn the logits at the masked position into a probability distribution.
confidence_score = softmax(mask_logits)
print(confidence_score.sum())  # sanity check: the probabilities should sum to ~1.0
for i in np.argsort(confidence_score)[::-1][:5]: # each index is the token's id in the vocabulary; [::-1] reverses the array order
    pred_token = tokenizer.decode(i) # look up the word associated with the token id
    score = confidence_score[i]
    
    print(pred_token, score)


1.0
have 0.2572901
get 0.17849621
eat 0.15555479
make 0.11422449
order 0.09823056


In [36]:
# Indices of the five highest-probability entries, best first
# (argsort is ascending, so the result is reversed before slicing).
np.argsort(confidence_score)[::-1][:5]

array([1138, 1243, 3940, 1294, 1546])

In [61]:
def predict(input, top_k=5):
    """Return the most probable fillers for the first mask token in ``input``.

    Parameters
    ----------
    input : str
        Text containing the tokenizer's mask token (``tokenizer.mask_token``).
    top_k : int, default 5
        Number of candidates to return (positive), most probable first.

    Returns
    -------
    list of dict
        One ``{"token": ..., "score": ...}`` entry per candidate, where
        ``token`` is the decoded vocabulary entry and ``score`` is its softmax
        probability at the masked position.

    Raises
    ------
    ValueError
        If ``input`` does not contain the mask token.
    """
    encoded_inputs = tokenizer(text=input, return_tensors="pt")

    # Find the mask in the ids that are actually fed to the model. This avoids
    # tokenizing the text a second time and does not rely on a hard-coded offset
    # for however many special tokens the tokenizer prepends.
    input_ids = encoded_inputs["input_ids"][0].tolist()
    try:
        mask_position = input_ids.index(tokenizer.mask_token_id)
    except ValueError:
        raise ValueError(
            f"input must contain the mask token {tokenizer.mask_token!r}"
        ) from None

    output = model(**encoded_inputs)
    # Detach from autograd, convert to NumPy and keep the first entry of the batch.
    logits = output.logits.detach().numpy()[0]
    confidence_score = softmax(logits[mask_position])

    # argsort is ascending: reverse it, then keep the top_k best token ids.
    best_ids = np.argsort(confidence_score)[::-1][:top_k]
    return [{"token": tokenizer.decode(i), "score": confidence_score[i]} for i in best_ids]

In [62]:
# Probe: a yes/no factual question phrased as a fill-in-the-blank prompt.
predict(f"Is John Kennedy (JFK) dead? Answer (yes/no):{mask}.")

[{'token': 'No', 'score': 0.22621737},
 {'token': 'Yes', 'score': 0.15105757},
 {'token': 'NO', 'score': 0.04536177},
 {'token': 'no', 'score': 0.038355332},
 {'token': 'None', 'score': 0.021622486}]

In [65]:
# Probe: simple arithmetic, with the mask placed where the result of 1+8 belongs.
predict(f"Using my mathematic calculator, 2+3=5, 1+8={mask}, 6+7=13")

[{'token': '5', 'score': 0.17081268},
 {'token': '6', 'score': 0.14476721},
 {'token': '7', 'score': 0.119548105},
 {'token': '8', 'score': 0.10006619},
 {'token': '9', 'score': 0.07793571}]