# Installing Libraries
***
```bash
pip install pytorch-transformers
```

# Import Libraries

In [1]:
# Import required libraries
import torch
# The package is installed as `pytorch-transformers` and imported as `pytorch_transformers`
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

# Predicting the next word using GPT-2

In [2]:
# Load the pre-trained GPT-2 tokenizer (its vocabulary) by checkpoint name.
# NOTE: the name `tokenizer` is rebound to a BERT tokenizer further down in this notebook.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

100%|█████████████████████████████████████████████████████████████████████| 1042301/1042301 [00:01<00:00, 672671.84B/s]
100%|███████████████████████████████████████████████████████████████████████| 456318/456318 [00:01<00:00, 358277.79B/s]


In [3]:
# Encode the text input into a list of integer token ids
text = "What is the fastest car in the"
indexed_tokens = tokenizer.encode(text)
# Bare last expression: display the resulting id list
indexed_tokens

[1867, 318, 262, 14162, 1097, 287, 262]

In [4]:
# Convert the indexed tokens into a PyTorch tensor.
# The id list is wrapped in an outer list so the tensor gets a leading dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
tokens_tensor

tensor([[ 1867,   318,   262, 14162,  1097,   287,   262]])

In [5]:
# Load the pre-trained GPT-2 model with its language-modelling head (weights) by checkpoint name
model = GPT2LMHeadModel.from_pretrained('gpt2')

100%|██████████████████████████████████████████████████████████████████████████████| 665/665 [00:00<00:00, 83767.67B/s]
100%|█████████████████████████████████████████████████████████████████| 548118077/548118077 [16:39<00:00, 548230.76B/s]


In [6]:
# Set the model in evaluation mode to deactivate the DropOut modules.
# As the last expression of the cell, the call also echoes the module structure.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [None]:
# Move the inputs and the model to the GPU when one is available.
# Falling back to the CPU keeps this cell runnable on machines without CUDA,
# where an unconditional `.to('cuda')` would raise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
model.to(device)

In [7]:
# Predict all tokens
# no_grad(): inference only, so gradient tracking is switched off for the forward pass
with torch.no_grad():
    outputs = model(tokens_tensor)
    # Keep the first element of the model output; the next cell indexes it as
    # [0, -1, :] to pick the scores at the last position.
    predictions = outputs[0]

In [8]:
# Pick the highest-scoring vocabulary id at the last position of the first
# (only) sequence, then decode the prompt plus that id back into text.
last_position_scores = predictions[0, -1, :]
predicted_index = last_position_scores.argmax().item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

In [9]:
# Print the decoded text: the original prompt followed by the predicted next sub-word
print(predicted_text)

 What is the fastest car in the world


# Training a Masked Language Model for BERT
***
BERT is pre-trained using the following two unsupervised prediction tasks:
1. Masked Language Modeling (MLM)
2. Next Sentence Prediction 

## Problem Definition
***
Given an input sequence, we will randomly mask some words. The model then should predict the original value of the masked words, based on the context provided by the other, non-masked, words in the sequence.

## Solve

In [11]:
# import libraries
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

In [12]:
# Load the pre-trained BERT tokenizer (vocabulary).
# NOTE: this rebinds `tokenizer`, replacing the GPT-2 tokenizer loaded earlier in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|███████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 250318.80B/s]


In [13]:
# Prepare the tokenized input from a text string with BertTokenizer
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
# Bare last expression: display the resulting token list
tokenized_text

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 'henson',
 'was',
 'a',
 'puppet',
 '##eer',
 '[SEP]']

**Descriptions** <br>
[CLS] and [SEP] are special tokens used to mark the start and the end of a sequence.

In [16]:
# Hide one token so that `BertForMaskedLM` can later be asked to recover it
masked_index = 8
tokenized_text[masked_index] = '[MASK]'

# Sanity check: only the token at `masked_index` may differ from the original tokenization
expected_tokens = [
    '[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]',
    'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]',
]
assert tokenized_text == expected_tokens

In [17]:
# Convert the tokenized input into a sequence of integer vocabulary ids
# (the PyTorch tensor itself is built in a later cell)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
indexed_tokens

[101,
 2040,
 2001,
 3958,
 27227,
 1029,
 102,
 3958,
 103,
 2001,
 1037,
 13997,
 11510,
 102]

101: [CLS] <br>
102: [SEP]

In [18]:
# Segment ids mark which sentence each token belongs to (see paper):
# the first 7 tokens ('[CLS]' through the first '[SEP]') are sentence A -> 0,
# the remaining 7 tokens (through the final '[SEP]') are sentence B -> 1.
segments_ids = [0] * 7 + [1] * 7

# Wrap each list in an outer list so both tensors get a leading dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
(tokens_tensor, segments_tensors)

(tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958,   103,  2001,
           1037, 13997, 11510,   102]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]))

Notice that we have set [MASK] at index 8 (0-based) of the token list, replacing the token ‘henson’. This is what our model will try to predict.

In [19]:
# Load the pre-trained BERT masked-language-model (weights)
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Evaluation mode, as for the GPT-2 model above; the call also echoes the module structure
bert_model.eval()

100%|██████████████████████████████████████████████████████████████████████████████| 433/433 [00:00<00:00, 48108.23B/s]
100%|█████████████████████████████████████████████████████████████████| 440473133/440473133 [12:34<00:00, 583556.47B/s]


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
# Predict all tokens with the BERT masked-language model.
# This must be `bert_model`: `model` is the GPT-2 network loaded earlier, which
# would score the BERT token ids against the wrong vocabulary.
# no_grad(): inference only, so gradient tracking is switched off for the forward pass
with torch.no_grad():
    outputs = bert_model(tokens_tensor, token_type_ids=segments_tensors)
    # Keep the first element of the model output; it is indexed below as
    # [0, masked_index] to pick the scores at the masked position.
    predictions = outputs[0]

In [None]:
# NOTE(review): leftover scratch cell — it only echoes the `torch` module object
# and has no effect on the analysis; consider removing it.
torch

In [None]:
# Inspect the raw prediction scores stored in `predictions`
predictions

In [None]:
# Check that the model recovers 'henson' at the masked position:
# take the highest-scoring vocabulary id there and map it back to its token.
masked_position_scores = predictions[0, masked_index]
predicted_index = masked_position_scores.argmax().item()
predicted_tokens = tokenizer.convert_ids_to_tokens([predicted_index])
predicted_token = predicted_tokens[0]
assert predicted_token == 'henson'
print('Predicted token is:', predicted_token)