## 7.1 Introduction to the GPT family

In [1]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel
from bertviz import model_view

set_seed(42)

In [2]:
generator = pipeline('text-generation', model='gpt2')

generator("Hello, I'm a language model and I", max_length=30, num_return_sequences=3)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model and I can use it to get from one place to another (as a model to explain the code of Java)."},
 {'generated_text': "Hello, I'm a language model and I want to explore it when it gets released. So, today, I'm going to create a web version"},
 {'generated_text': "Hello, I'm a language model and I can program with no other tool than Javascript! It's great to run many software programs but it's also"}]

In [3]:
generator = pipeline('text-generation', model='gpt2')

generator("Hello, I'm a student of language model from Bhopal and I", max_length=30, num_return_sequences=3)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a student of language model from Bhopal and I was in a class. I'd like to tell you about the language I"},
 {'generated_text': "Hello, I'm a student of language model from Bhopal and I was the only one who did not get my course to the level you say"},
 {'generated_text': "Hello, I'm a student of language model from Bhopal and I'd like to share with you my favorite text-to-speech-to"}]

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

'Gourav' in tokenizer.get_vocab()

False

In [5]:
tokenizer.convert_ids_to_tokens(tokenizer.encode('Gourav loves a beautiful day'))

['G', 'ou', 'rav', 'Ġloves', 'Ġa', 'Ġbeautiful', 'Ġday']

In [6]:
tokenizer.encode('Gourav loves a beautiful day')

[38, 280, 4108, 10408, 257, 4950, 1110]

In [7]:
encoded = tokenizer.encode('Gourav loves a beautiful day', return_tensors='pt')

encoded

tensor([[   38,   280,  4108, 10408,   257,  4950,  1110]])

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [9]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
model.transformer.wte(encoded).shape

torch.Size([1, 7, 768])

In [12]:
model.transformer.wpe(tensor([0, 1, 2, 3, 4, 5, 6]).reshape(1, 7)).shape

torch.Size([1, 7, 768])

In [11]:
initial_input = model.transformer.wte(encoded) + model.transformer.wpe(tensor([0, 1, 2, 3, 4, 5, 6]).reshape(1, 7))

initial_input.shape

torch.Size([1, 7, 768])

In [12]:
initial_input = model.transformer.drop(initial_input)
initial_input

tensor([[[-0.0341, -0.4186,  0.0979,  ..., -0.2589,  0.1409, -0.0730],
         [ 0.0632, -0.2021,  0.0855,  ..., -0.1184, -0.0353,  0.1251],
         [-0.0975, -0.2552,  0.2139,  ...,  0.1733, -0.0740, -0.0096],
         ...,
         [-0.0430, -0.0195,  0.1740,  ...,  0.0510,  0.0834, -0.0454],
         [ 0.0631,  0.1337,  0.1445,  ...,  0.0938,  0.0482,  0.1469],
         [-0.1352, -0.0499,  0.1171,  ..., -0.0941, -0.0689,  0.1134]]],
       grad_fn=<AddBackward0>)

In [9]:
model.lm_head

Linear(in_features=768, out_features=50257, bias=False)

In [13]:
for module in model.transformer.h:
    initial_input = module(initial_input)[0]
    
initial_input = model.transformer.ln_f(initial_input)

In [14]:
(initial_input == model(encoded, output_hidden_states=True).hidden_states[-1]).all()

tensor(True)

In [15]:
total_params = 0
for param in model.parameters():
    total_params += numel(param)
    
print(f'Number of params: {total_params:,}')

Number of params: 124,439,808


## 7.2 Masked multi-headed attention

In [22]:
import torch
import pandas as pd

In [16]:
phrase = 'My friend was right about this class. It is so fun!'
encoded_phrase = tokenizer(phrase, return_tensors='pt')

response = model(**encoded_phrase, output_attentions=True, output_hidden_states=True)

len(response.attentions)

12

In [17]:
encoded_phrase

{'input_ids': tensor([[3666, 1545,  373,  826,  546,  428, 1398,   13,  632,  318,  523, 1257,
            0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
response.attentions[-1].shape  # From the final decoder

torch.Size([1, 12, 13, 13])

In [19]:
encoded_phrase['input_ids'].shape

torch.Size([1, 13])

In [20]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])

tokens

['My',
 'Ġfriend',
 'Ġwas',
 'Ġright',
 'Ġabout',
 'Ġthis',
 'Ġclass',
 '.',
 'ĠIt',
 'Ġis',
 'Ġso',
 'Ġfun',
 '!']

In [23]:
# Layer index 9, head 0. Check out the almost 60% attention the token it is giving to the token class
arr = response.attentions[9][0][0]

n_digits = 3

attention_df = pd.DataFrame((torch.round(arr * 10**n_digits) / (10**n_digits)).detach()).applymap(float)

attention_df.columns = tokens
attention_df.index = tokens

attention_df


Unnamed: 0,My,Ġfriend,Ġwas,Ġright,Ġabout,Ġthis,Ġclass,.,ĠIt,Ġis,Ġso,Ġfun,!
My,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġfriend,0.968,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġwas,0.824,0.145,0.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġright,0.979,0.008,0.007,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġabout,0.979,0.008,0.004,0.005,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġthis,0.924,0.031,0.007,0.006,0.016,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġclass,0.946,0.005,0.001,0.001,0.001,0.002,0.044,0.0,0.0,0.0,0.0,0.0,0.0
.,0.691,0.013,0.003,0.003,0.002,0.006,0.269,0.013,0.0,0.0,0.0,0.0,0.0
ĠIt,0.318,0.003,0.003,0.003,0.006,0.018,0.599,0.018,0.032,0.0,0.0,0.0,0.0
Ġis,0.331,0.006,0.002,0.002,0.003,0.018,0.533,0.013,0.062,0.03,0.0,0.0,0.0


In [25]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0]) 
model_view(response.attentions, tokens)

<IPython.core.display.Javascript object>

In [26]:
response.hidden_states[-1].shape

torch.Size([1, 13, 768])

In [27]:
response.logits.shape

torch.Size([1, 13, 50257])

In [28]:
pd.DataFrame(
    zip(tokens, tokenizer.convert_ids_to_tokens(response.logits.argmax(2)[0])), 
    columns=['Sequence up until', 'Next token with highest probability']
)

Unnamed: 0,Sequence up until,Next token with highest probability
0,My,Ċ
1,Ġfriend,","
2,Ġwas,Ġa
3,Ġright,.
4,Ġabout,Ġthat
5,Ġthis,.
6,Ġclass,.
7,.,ĠI
8,ĠIt,'s
9,Ġis,Ġa


In [29]:
generator(phrase, max_length=20, num_return_sequences=1, do_sample=False)  # greedy search

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this class. It is so fun! I love it! I love the'}]

In [30]:
generator(phrase, max_length=20, num_return_sequences=1, do_sample=True)  # greedy search with sampling

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this class. It is so fun! And then they ask me my opinion'}]

----------------------