In [1]:
!pip install pytorch-transformers

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 6.4MB/s 
[?25hCollecting sacremoses (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/27/04/b92425ca552116afdb7698fa3f00ca1c975cfd86a847cf132fd813c5d901/sacremoses-0.0.34.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 46.0MB/s 
Collecting sentencepiece (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 48.2MB/s 
Collecting regex (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/6f/a6/99eeb5904ab763db87af4bd71d9b1dfdd97926812406

### Import the necessary libraries

In [0]:
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

### Load the pre-trained model tokenizer

In [3]:
# Load the tokenizer that belongs to the pre-trained 'gpt2' checkpoint; the
# same checkpoint name is used for the model further down.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

100%|██████████| 1042301/1042301 [00:00<00:00, 2080009.85B/s]
100%|██████████| 456318/456318 [00:00<00:00, 1124592.68B/s]


### Encode text inputs

In [4]:
# Seed text; the model is later asked for the token that should follow it.
text = "What is the fastest car in the"
# encode() turns the string into a list of integer token ids.
indexed_tokens = tokenizer.encode(text)
# Bare last expression so the notebook displays the ids.
indexed_tokens

[1867, 318, 262, 14162, 1097, 287, 262]

### Convert indexed tokens into a PyTorch tensor

In [5]:
# Build a 1-D tensor from the token ids, then add a leading dimension so the
# result is a single-row 2-D tensor holding the whole sequence.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
tokens_tensor

tensor([[ 1867,   318,   262, 14162,  1097,   287,   262]])

### Load pre-trained model (weights)

In [6]:
# Load the pre-trained 'gpt2' checkpoint into a GPT-2 model with a
# language-modelling head.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Bare last expression so the notebook displays the module tree.
model

100%|██████████| 176/176 [00:00<00:00, 79555.72B/s]
100%|██████████| 548118077/548118077 [00:19<00:00, 27894220.16B/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1)
        )
      )
      (1): Block(
        (ln_1): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=Tr

### Set the model in evaluation mode to deactivate the Dropout modules

In [7]:
# Switch to evaluation mode so the Dropout modules are inactive at inference.
# eval() hands back the module itself, which is why the cell echoes its repr.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1)
        )
      )
      (1): Block(
        (ln_1): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=Tr

### GPU usage

In [8]:
# Use the GPU when one is available; otherwise stay on the CPU so this cell
# (and every cell after it) still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The input tensor and the model have to live on the same device.
tokens_tensor = tokens_tensor.to(device)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1)
        )
      )
      (1): Block(
        (ln_1): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1)
          (resid_dropout): Dropout(p=0.1)
        )
        (ln_2): LayerNorm(torch.Size([768]), eps=1e-05, elementwise_affine=Tr

### Predict tokens

In [0]:
# Pure inference: run the forward pass with gradient tracking switched off.
with torch.no_grad():
    outputs = model(tokens_tensor)
# The first element of the returned tuple is the tensor the next cells use to
# pick the next token.
predictions = outputs[0]

In [14]:
# Inspect the full return value of the forward pass; its first element is what
# was stored in `predictions`.
outputs

(tensor([[[ -37.9884,  -37.9580,  -41.4891,  ...,  -44.0718,  -43.7976,
            -37.7048],
          [ -90.5062,  -88.9512,  -95.2817,  ...,  -95.2052,  -95.9588,
            -90.8715],
          [ -96.4428,  -94.5894,  -97.7109,  ...,  -96.9249, -100.0142,
            -94.3135],
          ...,
          [ -94.2190,  -94.6733,  -97.5500,  ..., -104.5247, -103.3913,
            -95.9648],
          [ -66.9001,  -66.0432,  -69.7153,  ...,  -75.6980,  -73.9600,
            -66.7941],
          [ -96.1219,  -94.2472,  -96.9560,  ..., -103.5570, -100.5183,
            -95.6673]]], device='cuda:0'),
 (tensor([[[[[-1.3259e+00,  1.9205e+00,  7.5023e-01,  ..., -1.1690e+00,
              -2.8029e-01,  1.5991e+00],
             [-1.8348e+00,  2.4955e+00,  1.7497e+00,  ..., -1.5397e+00,
              -2.3685e+00,  2.4482e+00],
             [-2.2444e+00,  2.6332e+00,  1.9227e+00,  ..., -6.7221e-01,
              -1.5328e+00,  2.0305e+00],
             ...,
             [-2.1348e+00,  4.0035e+00

### Get the predicted next sub-word

In [0]:
# Take the scores at the last position of the first (only) sequence and keep
# the index of the highest one as a plain Python int.
predicted_index = predictions[0, -1].argmax().item()
# Decode the original ids followed by the predicted id back into text.
predicted_text = tokenizer.decode([*indexed_tokens, predicted_index])

In [11]:
# Show the seed text with the predicted next token appended.
print(predicted_text)

 What is the fastest car in the world


## Generate text from a starting seed text with the pytorch-transformers example script from Hugging Face

In [12]:
# Clone the library's repository to get its example scripts. No branch or tag
# is given, so this checks out the repository's default branch.
!git clone https://github.com/huggingface/pytorch-transformers.git

Cloning into 'pytorch-transformers'...
remote: Enumerating objects: 8346, done.[K
remote: Total 8346 (delta 0), reused 0 (delta 0), pack-reused 8346[K
Receiving objects: 100% (8346/8346), 4.49 MiB | 6.22 MiB/s, done.
Resolving deltas: 100% (6006/6006), done.


In [13]:
!python pytorch-transformers/examples/run_generation.py \
    --model_type=gpt2 \
    --length=100 \
    --model_name_or_path=gpt2 \

09/20/2019 12:20:50 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /root/.cache/torch/pytorch_transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
09/20/2019 12:20:50 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /root/.cache/torch/pytorch_transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
09/20/2019 12:20:51 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /root/.cache/torch/pytorch_transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d8038