In [2]:
!pip install transformers # if doing this from colab or interactive session

from transformers import GPT2Tokenizer, GPT2LMHeadModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [3]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

## A note about GPT2's vocabulary

All modern models do not represent "words" as part of their vocabulary. These models represent "subwords".

A subword vocabulary includes all base characters, many multi-character sequences, and things we would probably call _words_. 

The vocabulary is constructed using some interesting algorithms:

1. Byte Pair Encoding/Wordpiece -- Adding

2. UnigramLM -- Pruning

Each model can have its own idiosyncratic tokenization scheme -- GPT2 tokens and RoBERTa and BERT all differ from each other and differ from what people would probably do.

How do (1) and (2) differ from how words are actually structured? Next week, we will cover morphology and its relationship to text normalization. For now, we'll say that everything GPT-2 predicts is a "word" 

In [4]:
tokenizer("A sentence tokenized by GPT-2")

{'input_ids': [32, 6827, 11241, 1143, 416, 402, 11571, 12, 17], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
tokenizer.convert_ids_to_tokens([32, 6827, 11241, 1143, 416, 402, 11571, 12, 17])

['A', 'Ġsentence', 'Ġtoken', 'ized', 'Ġby', 'ĠG', 'PT', '-', '2']

In [8]:
tokenizer.convert_ids_to_tokens(
    tokenizer("pneumonia")['input_ids']
)

['p', 'neum', 'onia']

In [9]:
tokenizer.convert_ids_to_tokens(
    tokenizer("flabbergasted")['input_ids']
)

['fl', 'ab', 'berg', 'asted']

The result of the tokenizers in the Huggingface API for a neural language model is a set of numbers called `"input_ids"`

* List of "numbers" in the order they appear in the sentence
* Each number refers to the presence of a specific subword at that position
* Basically a dictionary whose keys are indices and whose values are strings
* Imagine if you assigned a random integer to every word in a database of count statistics
* Effectively tell you the coordinates of what word is present

## GPT-2's objective is next-word prediction

* GPT-2 accomplishes neural language modeling by using massive neural network architectures -- tons of matrices and matrix math -- to predict the next word in a sequence, given preceding information
* Conceptually similar to forward transition probabilities (one kind of conditional probability)

## The `GPT2LMHeadModel` is the part of the neural network that predicts

* Model predicts next word
* Update the model if the model is wrong
* More wrong --> More learning

In [10]:
outputs = model(
    **tokenizer("A sentence tokenized by GPT-2", return_tensors='pt')
    )

In [11]:
outputs.keys()

odict_keys(['logits', 'past_key_values'])

In [13]:
outputs.logits

tensor([[[ -33.5706,  -32.7689,  -35.4509,  ...,  -40.9807,  -40.1867,
           -33.2152],
         [-114.9282, -114.5170, -122.9434,  ..., -124.6224, -124.5662,
          -116.6161],
         [-100.3657, -100.7182, -106.2230,  ..., -109.8270, -110.5367,
          -102.0635],
         ...,
         [-104.3089, -104.2943, -104.1973,  ..., -111.9490, -112.4775,
          -104.3001],
         [ -85.2682,  -85.0643,  -84.1610,  ...,  -92.0905,  -93.8388,
           -84.3858],
         [ -94.7473,  -94.6730,  -95.4357,  ..., -100.6837, -102.2796,
           -94.0545]]], grad_fn=<UnsafeViewBackward0>)

In [14]:
outputs.logits.shape # one 50257-dimensional vector for each of the 9 subwords

torch.Size([1, 9, 50257])

In [15]:
# turn the logits into probabilities

from torch.nn import Softmax

sm = Softmax(dim=2)

In [16]:
sm(outputs.logits)  # probabilities from logits

tensor([[[1.2017e-03, 2.6790e-03, 1.8330e-04,  ..., 7.2715e-07,
          1.6087e-06, 1.7145e-03],
         [2.9332e-04, 4.4253e-04, 9.6913e-08,  ..., 1.8081e-08,
          1.9125e-08, 5.4237e-05],
         [7.1168e-04, 5.0025e-04, 2.0346e-06,  ..., 5.5372e-08,
          2.7232e-08, 1.3029e-04],
         ...,
         [5.4421e-04, 5.5221e-04, 6.0849e-04,  ..., 2.6165e-07,
          1.5424e-07, 5.4902e-04],
         [2.1447e-05, 2.6297e-05, 6.4894e-05,  ..., 2.3359e-08,
          4.0663e-09, 5.1832e-05],
         [3.0926e-04, 3.3314e-04, 1.5537e-04,  ..., 8.1695e-07,
          1.6562e-07, 6.1832e-04]]], grad_fn=<SoftmaxBackward0>)

In [17]:
sm(outputs.logits).shape

torch.Size([1, 9, 50257])

In [18]:
sm(outputs.logits).sum(axis=2) # show that the outputs sum to 1 this way

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]],
       grad_fn=<SumBackward1>)

# How to get probability of each specific token?

Recall that each dimension in this 50257-dimensional vector is an id associated with a specific vocabulary term! This means we just need to index into *that* position for each of our subwords

So, look back at our `"input_ids"`!

In [19]:
input_ids = tokenizer("A sentence tokenized by GPT-2")['input_ids']
input_ids

[32, 6827, 11241, 1143, 416, 402, 11571, 12, 17]

In [20]:
probabilities_of_each_subword = sm(outputs.logits)

probabilities_of_each_subword[:, :, input_ids]  # probability of each input ID from full input at each time step

tensor([[[1.6781e-03, 3.3081e-05, 2.3910e-05, 4.7826e-05, 1.9807e-03,
          6.6236e-04, 1.4991e-05, 1.6517e-02, 1.7089e-03],
         [1.1369e-06, 1.2789e-03, 1.0897e-06, 1.9789e-06, 3.2322e-03,
          1.5868e-05, 1.9008e-10, 9.4030e-04, 2.5563e-06],
         [1.0467e-05, 7.0829e-05, 3.3353e-04, 1.9483e-02, 9.9775e-04,
          2.7961e-05, 1.6485e-08, 1.6244e-03, 2.3933e-05],
         [3.1342e-06, 6.1089e-04, 5.1104e-04, 6.0330e-06, 2.8835e-01,
          2.4070e-05, 3.8749e-09, 5.7590e-04, 9.3669e-06],
         [3.6744e-06, 4.7002e-04, 3.9246e-03, 1.8097e-06, 1.2439e-04,
          5.2205e-04, 2.0951e-08, 1.5346e-04, 3.8084e-06],
         [3.9218e-04, 1.2035e-06, 2.5081e-06, 1.6580e-05, 3.5139e-05,
          1.1709e-04, 4.6554e-04, 1.5026e-02, 2.2184e-03],
         [1.2305e-03, 3.4958e-05, 2.5759e-03, 1.3882e-05, 2.6604e-03,
          3.2405e-04, 7.1076e-05, 2.8870e-02, 2.0030e-03],
         [5.4327e-03, 3.1777e-07, 4.0347e-05, 4.2731e-05, 1.4796e-05,
          1.1126e-05, 3.566

In [21]:
probabilities_of_each_subword[:, :, input_ids].shape

torch.Size([1, 9, 9])

In [25]:
import torch

In [28]:
token_probs = []
for i, input_id in enumerate(input_ids):
  token_prob = probabilities_of_each_subword[:, i, input_id]
  token_probs.append(token_prob)
probs_tensor = torch.Tensor(token_probs)

probs_tensor.log().sum()

tensor(-78.6279)