## Introduction to the GPT family

In [1]:
from transformers import pipeline, set_seed, AutoTokenizer, AutoModelForCausalLM, AutoModel
from torch import tensor, numel
from bertviz import model_view, head_view
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Fix the random seed so the sampled generations further down can be repeated from run to run.
set_seed(42)

In [2]:
# pipeline object in transformers provides easy access to transformer usage
MODEL = 'gpt2'  # checkpoint name, reused by the from_pretrained calls later in the notebook

# Build a text-generation pipeline around that checkpoint.
generator = pipeline('text-generation', model=MODEL)

# finish the sentence: request 3 alternative continuations, capped with max_length=30
generator("Hello, I'm a language model and I", max_length=30, num_return_sequences=3)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model and I want to be able to understand these things. It's not just a game, it's a process."},
 {'generated_text': "Hello, I'm a language model and I'm not just talking about semantics and syntax. One of the things I find interesting about this is that you"},
 {'generated_text': "Hello, I'm a language model and I've come up with something that doesn't suck – what you do when you try and figure out what you"}]

In [3]:
# load up a tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Is the name stored in the vocabulary as one single token? (It is not: the result is False.)
'Sinan' in tokenizer.get_vocab()

False

In [4]:
# Turn the sentence into a plain list of vocabulary ids (6 ids for this sentence).
tokenizer.encode('Sinan loves a beautiful day')

[46200, 272, 10408, 257, 4950, 1110]

In [5]:
# encode a string and then convert the ids back into tokens. Note the Ġ character denoting a space before the token
# 'Sinan' is not a vocabulary entry, so it comes back as the two pieces 'Sin' and 'an'.
tokenizer.convert_ids_to_tokens(tokenizer.encode('Sinan loves a beautiful day'))


['Sin', 'an', 'Ġloves', 'Ġa', 'Ġbeautiful', 'Ġday']

In [6]:
# The token ids again, as a plain Python list (compare with the tensor form in the next cell).
tokenizer.encode('Sinan loves a beautiful day')  # ids

[46200, 272, 10408, 257, 4950, 1110]

In [7]:
# return_tensors='pt' wraps the same ids in a 2-D PyTorch tensor: one row (the batch) of 6 ids.
encoded = tokenizer.encode('Sinan loves a beautiful day', return_tensors='pt')  # as a pytorch tensor

encoded

tensor([[46200,   272, 10408,   257,  4950,  1110]])

In [8]:
# load up a tokenizer and model
# NOTE(review): `tokenizer` was already created from the same MODEL in an earlier cell; this reload looks redundant.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)  # the GPT-2 network together with its language-modelling head

In [9]:
# Display the module tree: token and position embeddings, 12 decoder blocks, a final layer norm and the LM head.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
# Reminder of the input: the token ids produced by the tokenizer (a batch of 1 with 6 ids).
encoded

tensor([[46200,   272, 10408,   257,  4950,  1110]])

In [11]:
# Look up the token embedding (wte) vector for every input id.
model.transformer.wte(encoded)

tensor([[[ 0.0295, -0.0479,  0.1235,  ..., -0.1539, -0.0276,  0.0994],
         [-0.1253, -0.0356,  0.0570,  ..., -0.0876, -0.0628,  0.0048],
         [-0.0758, -0.0843, -0.0159,  ..., -0.2232, -0.0390, -0.0899],
         [-0.0506,  0.0056,  0.0471,  ...,  0.0425,  0.0735, -0.0384],
         [ 0.0535,  0.1676,  0.0133,  ...,  0.0880,  0.0411,  0.1544],
         [-0.1379, -0.0294, -0.0026,  ..., -0.0966, -0.0726,  0.1160]]],
       grad_fn=<EmbeddingBackward0>)

In [12]:
# Shape of the token embeddings; the last dimension (768) is the width of the Embedding(50257, 768) table.
model.transformer.wte(encoded).shape  # 1 item in batch x 6 tokens x token dimension

torch.Size([1, 6, 768])

In [13]:
# manually create position vectors: one position id (0, 1, 2, ...) per token of `encoded`.
# The ids are derived from the sequence length instead of being hard-coded, so the cell
# keeps working if the example sentence (and therefore the number of tokens) changes.
model.transformer.wpe(torch.arange(encoded.shape[1]).reshape(1, -1)).shape

torch.Size([1, 6, 768])

In [14]:
# create GPT input: token embeddings plus position embeddings, added element-wise.
# Position ids 0..seq_len-1 are derived from `encoded` rather than hard-coded, so the sum
# still lines up if the example sentence tokenizes to a different number of tokens.
position_ids = torch.arange(encoded.shape[1]).reshape(1, -1)  # shape (1, seq_len), like `encoded`
initial_input = model.transformer.wte(encoded) + model.transformer.wpe(position_ids)

initial_input.shape


torch.Size([1, 6, 768])

In [15]:
# NOTE(review): presumably a no-op when the model is in eval mode — confirm. The result overwrites `initial_input`.
initial_input = model.transformer.drop(initial_input)  # run our input through the model's initial dropout layer
initial_input

tensor([[[ 0.0107, -0.2453,  0.1275,  ..., -0.1969,  0.0006,  0.1539],
         [-0.1013, -0.0894, -0.0378,  ..., -0.0534, -0.0527,  0.0046],
         [-0.0716, -0.1690,  0.0386,  ..., -0.2034, -0.0197, -0.1113],
         [-0.0509, -0.0682,  0.1526,  ...,  0.0527,  0.0912, -0.0455],
         [ 0.0612,  0.1425,  0.1402,  ...,  0.0964,  0.0510,  0.1474],
         [-0.1283, -0.0632,  0.1287,  ..., -0.0907, -0.0655,  0.1085]]],
       grad_fn=<AddBackward0>)

In [16]:
# The output projection: a bias-free linear layer from the 768-wide hidden state to 50257 vocabulary scores.
model.lm_head

Linear(in_features=768, out_features=50257, bias=False)

In [17]:
# NOTE(review): this cell overwrites `initial_input` with the output of the stack, so it is not idempotent;
# re-running it without re-running the cells above would feed the final hidden states through the stack again.
for module in model.transformer.h:  # run the initial_input through every decoder in the stack
    initial_input = module(initial_input)[0]  # element [0] of the block's output is carried on to the next block
    
initial_input = model.transformer.ln_f(initial_input)  # and then the final layer norm

In [18]:
# Despite the name, this now holds the output of the decoder stack followed by the final layer norm.
initial_input

tensor([[[ 0.0542, -0.0179, -0.3388,  ..., -0.0948, -0.1067,  0.0129],
         [-0.4805,  0.1008, -0.7313,  ...,  0.0471, -0.4113,  0.0902],
         [ 0.0344, -0.2259, -0.5293,  ..., -0.1202,  0.1355,  0.2287],
         [-0.2374,  0.1787,  0.1845,  ..., -0.4057, -0.3617, -0.1861],
         [ 0.0235,  0.1212, -1.0182,  ..., -0.0597,  0.0020, -0.2220],
         [ 0.1206, -0.5034, -1.5260,  ..., -0.3367, -0.2821, -0.0410]]],
       grad_fn=<NativeLayerNormBackward0>)

In [19]:
# same as just running through the model: the manually propagated hidden states should match
# the last entry of `hidden_states` returned by a regular forward pass.
# The comparison uses a floating-point tolerance instead of `==`, so harmless last-bit
# differences between the two code paths cannot flip the check to False.
reference_hidden = model(encoded, output_hidden_states=True).hidden_states[-1]
torch.isclose(initial_input, reference_hidden).all()

tensor(True)

In [20]:
# One vector of 50257 vocabulary scores per input position: batch x tokens x vocabulary size.
model(encoded).logits.shape

torch.Size([1, 6, 50257])

In [21]:
# Add up the number of scalar entries over every parameter tensor of the model.
total_params = sum(numel(weights) for weights in model.parameters())

print(f'Number of params: {total_params:,}')

Number of params: 124,439,808


## Masked multi-headed attention

In [22]:
import torch
import pandas as pd


In [23]:
phrase = 'My friend was right about this class. It is so fun!'
# Calling the tokenizer directly returns the input ids together with an attention mask.
encoded_phrase = tokenizer(phrase, return_tensors='pt')

# Ask the forward pass to also return the attention weights and the hidden states.
response = model(**encoded_phrase, output_attentions=True, output_hidden_states=True)

len(response.attentions)  # 12 entries, matching the 12 decoder blocks of the model

12

In [24]:
# The tokenizer output: `input_ids` plus an all-ones `attention_mask` of the same shape.
encoded_phrase

{'input_ids': tensor([[3666, 1545,  373,  826,  546,  428, 1398,   13,  632,  318,  523, 1257,
            0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [25]:
response.attentions[-1].shape  # attention weights from the final decoder: batch x heads x tokens x tokens

torch.Size([1, 12, 13, 13])

In [26]:
# A batch of 1 sequence holding 13 token ids.
encoded_phrase['input_ids'].shape

torch.Size([1, 13])

In [27]:
# Map the ids of the single sequence in the batch back to token strings; used below as row/column labels.
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])

tokens

['My',
 'Ġfriend',
 'Ġwas',
 'Ġright',
 'Ġabout',
 'Ġthis',
 'Ġclass',
 '.',
 'ĠIt',
 'Ġis',
 'Ġso',
 'Ġfun',
 '!']

In [28]:
# Layer index 9, head 0, for the single sequence in the batch.
# Each row is the token doing the attending and each column a token being attended to;
# check out the almost 60% attention the token `ĠIt` is giving to the token `Ġclass`.
arr = response.attentions[9][0][0]

n_digits = 3

# Round inside torch, then hand pandas a plain float64 NumPy array in one step.
# This replaces the per-cell `applymap(float)` conversion (deprecated in recent pandas)
# and avoids building the frame straight from a torch tensor.
rounded = (torch.round(arr * 10**n_digits) / (10**n_digits)).detach()

attention_df = pd.DataFrame(
    rounded.cpu().numpy().astype(float),
    index=tokens,
    columns=tokens,
)

attention_df


Unnamed: 0,My,Ġfriend,Ġwas,Ġright,Ġabout,Ġthis,Ġclass,.,ĠIt,Ġis,Ġso,Ġfun,!
My,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġfriend,0.968,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġwas,0.824,0.145,0.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġright,0.979,0.008,0.007,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġabout,0.979,0.008,0.004,0.005,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġthis,0.924,0.031,0.007,0.006,0.016,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġclass,0.946,0.005,0.001,0.001,0.001,0.002,0.044,0.0,0.0,0.0,0.0,0.0,0.0
.,0.691,0.013,0.003,0.003,0.002,0.006,0.269,0.013,0.0,0.0,0.0,0.0,0.0
ĠIt,0.318,0.003,0.003,0.003,0.006,0.018,0.599,0.018,0.032,0.0,0.0,0.0,0.0
Ġis,0.331,0.006,0.002,0.002,0.003,0.018,0.533,0.013,0.062,0.03,0.0,0.0,0.0


In [29]:
# Render bertviz's interactive head view of the attention weights, labelled with the token strings.
head_view(response.attentions, tokens)

<IPython.core.display.Javascript object>

In [30]:
# NOTE(review): `tokens` is recomputed here exactly as in the earlier cell; the reassignment looks redundant.
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0]) 
# Render bertviz's interactive model view for the same attention weights and token labels.
model_view(response.attentions, tokens)

<IPython.core.display.Javascript object>

In [31]:
# Last entry of the returned hidden states: batch x tokens x hidden size (1 x 13 x 768).
response.hidden_states[-1].shape

torch.Size([1, 13, 768])

In [32]:
# Raw vocabulary scores produced by the LM head for every position of the phrase.
response.logits

tensor([[[ -33.0736,  -32.3349,  -35.2380,  ...,  -38.3577,  -38.4758,
           -33.0943],
         [ -98.8075,  -98.9729, -105.2267,  ..., -109.0421, -104.7381,
          -101.0802],
         [-131.2764, -130.8178, -135.2054,  ..., -138.3585, -134.9846,
          -134.1658],
         ...,
         [-120.6479, -122.7568, -127.1870,  ..., -130.4343, -129.5815,
          -125.1274],
         [ -73.8157,  -78.6168,  -85.3994,  ...,  -91.8287,  -91.7845,
           -81.4569],
         [-134.7545, -134.9494, -136.2912,  ..., -147.1822, -148.1732,
          -130.7045]]], grad_fn=<UnsafeViewBackward0>)

In [33]:
# batch x tokens x vocabulary size (1 x 13 x 50257).
response.logits.shape

torch.Size([1, 13, 50257])

In [34]:
# look at the top next token in the auto-regressive language modelling task:
# for every position, take the vocabulary entry with the highest score and show it
# next to the token that ends the sequence at that position.
top_next_ids = response.logits.argmax(dim=-1)[0]
top_next_tokens = tokenizer.convert_ids_to_tokens(top_next_ids)

pd.DataFrame({
    'Sequence up until': tokens,
    'Next token with highest probability': top_next_tokens,
})

Unnamed: 0,Sequence up until,Next token with highest probability
0,My,Ċ
1,Ġfriend,","
2,Ġwas,Ġa
3,Ġright,.
4,Ġabout,Ġthat
5,Ġthis,.
6,Ġclass,.
7,.,ĠI
8,ĠIt,'s
9,Ġis,Ġa


In [35]:
# Ask for 5 alternative continuations of a short prompt; each returned sequence differs.
generator('My friend was right', max_length=12, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right," she said in a phone interview from'},
 {'generated_text': 'My friend was right." It\'s really unfortunate now. I'},
 {'generated_text': 'My friend was right; I understand the rules. All these'},
 {'generated_text': 'My friend was right - I was wrong," the man said'},
 {'generated_text': "My friend was right. But I don't know how to"}]

In [36]:
# do_sample=False: no random sampling, the highest-scoring next token is taken at every step.
generator(phrase, max_length=20, num_return_sequences=1, do_sample=False)  # greedy search

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this class. It is so fun! I love it! I love the'}]

In [37]:
generator(phrase, max_length=20, num_return_sequences=1, do_sample=True)  # sampling instead of greedy search: next tokens are drawn at random

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "My friend was right about this class. It is so fun! And it's not that hard,"}]