In [1]:
!pip install transformers==4.11.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.11.3
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

importing the libraries

In [4]:
from transformers import BertTokenizer, BertForMaskedLM
import torch 

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Set the device to CPU
device = torch.device("cpu")

# Define the input text and tokenize it
input_text = "Le chat est [MASK] sur le canapé."
tokenized_text = tokenizer.tokenize(input_text)

# Find the index of the masked token
masked_index = tokenized_text.index('[MASK]')

# Convert the tokenized text to a tensor of token ids
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])

# Move the tokens tensor to the CPU
tokens_tensor = tokens_tensor.to(device)

# Generate predictions for the masked token using the model
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0][0, masked_index].topk(5)

# Convert the predicted token ids to tokens
predicted_token_ids = predictions.indices.tolist()
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Print the predicted tokens
print(predicted_tokens)

['représenté', 'présent', 'situé', 'porté', 'placé']


In [8]:
# Set the device to CPU
device = torch.device("cpu")

# Define the input text
input_text = "Le chat noir est sur le canapé. Le chat blanc est sur le tapis."

# Tokenize the input text
tokenized_text = tokenizer.tokenize(input_text)

# Loop through each token in the input text
for i in range(len(tokenized_text)):
    # If the token is a mask, replace it with the predicted token
    if tokenized_text[i] == '[MASK]':
        # Create a copy of the tokenized text and replace the mask with a placeholder token
        masked_tokenized_text = tokenized_text.copy()
        masked_tokenized_text[i] = '[PREDICT]'
        
        # Convert the masked tokenized text to a tensor of token ids
        masked_index = i
        masked_index_tensor = torch.tensor([masked_index])
        indexed_tokens = tokenizer.convert_tokens_to_ids(masked_tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])

        # Move the tokens tensor to the CPU
        tokens_tensor = tokens_tensor.to(device)

        # Generate predictions for the masked token using the model
        with torch.no_grad():
            outputs = model(tokens_tensor)
            predictions = outputs[0][0, masked_index_tensor].topk(5)

        # Convert the predicted token ids to tokens
        predicted_token_ids = predictions.indices.tolist()
        predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

        # Replace the placeholder token with the predicted token
        tokenized_text[i] = predicted_tokens[0]

# Convert the tokenized text to a string
predicted_text = tokenizer.convert_tokens_to_string(tokenized_text)

# Print the predicted text
print(predicted_text)

Le chat noir est sur le canapé . Le chat blanc est sur le tapis .


In [9]:
# Set up input sentence(s) with masked tokens
sentences = ["I want to [MASK] a new car", "The [MASK] is blue"]
tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in sentences]
masked_index = [sentence.index('[MASK]') for sentence in tokenized_sentences]

# Generate predictions for masked tokens
for i in range(len(sentences)):
    tokenized_text = tokenized_sentences[i]
    mask_pos = masked_index[i]
    
    # Replace masked token with [MASK] token
    tokenized_text[mask_pos] = '[MASK]'
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    
    # Predict probability distribution of next token
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0][0, mask_pos].topk(5).indices.tolist()

    # Print predicted tokens
    predicted_tokens = [tokenizer.convert_ids_to_tokens([prediction])[0] for prediction in predictions]
    print(f"For sentence '{sentences[i]}', the top 5 predicted tokens for the masked token are: {predicted_tokens}")

For sentence 'I want to [MASK] a new car', the top 5 predicted tokens for the masked token are: ['build', 'buy', 'make', 'create', 'construct']
For sentence 'The [MASK] is blue', the top 5 predicted tokens for the masked token are: ['It', 'This', 'it', 'What', 'There']


In [14]:
input_sentence = "I want to build a car. But, I don't know [MASK] [MASK] [MASK] is the right way to do it."
input_sentence += '.' # Add period to end of sentence
first_sentence, second_sentence = input_sentence.split('.')
tokenized_sentences = tokenizer.tokenize(first_sentence + '.' + ' [MASK] [MASK] [MASK]' + '.')
mask_pos = tokenized_sentences.index('[MASK]')
# Replace masked tokens with [MASK] token
tokenized_sentences[mask_pos] = '[MASK]'
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sentences)
tokens_tensor = torch.tensor([indexed_tokens])

# Predict probability distribution of next sentence
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0].squeeze()

# Print predicted sentence
predicted_sentence = " ".join(tokenizer.convert_ids_to_tokens(predictions.tolist()))
print(f"For input sentence '{input_sentence}', the predicted masked sentence is: {predicted_sentence}")

ValueError: ignored