# GPT

In [1]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel
from bertviz import model_view

set_seed(31)

In [2]:
generator = pipeline('text-generation', model='gpt2')

Device set to use mps:0


In [28]:
generator('If you think I will agree to your proposal, you are wrong and this is why.', max_length=30, num_return_sequences=3, truncation=True, temperature=0.5, top_k=6)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': "If you think I will agree to your proposal, you are wrong and this is why.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n\nIf you don't agree, you will be asked to leave.\n"},
 {'generated_text': "If you thin

In [4]:
# "gpt2" is by default "cased" (opposite of uncased)
#
# Whether something is "cased" or "uncased" makes 
# more sense to BERT for a Natural Language understanding task
# than to GPT that really needs the "cased" version to understand
# cases and accents to generate text.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

print('Simone' in tokenizer.get_vocab())
print('table' in tokenizer.get_vocab())

False
True


In [5]:
encoding = tokenizer.encode('Simone likes a good pizza')
encoding

[8890, 505, 7832, 257, 922, 14256]

In [6]:
tokenizer.convert_ids_to_tokens(encoding)

['Sim', 'one', 'Ġlikes', 'Ġa', 'Ġgood', 'Ġpizza']

In [7]:
encoding = tokenizer.encode('Simone likes a good pizza', return_tensors='pt') 
encoding

tensor([[ 8890,   505,  7832,   257,   922, 14256]])

## Build the model

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [9]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
model.transformer.wte(encoding).shape

torch.Size([1, 6, 768])

# Masked multi-headed attention

In [22]:
import torch
import pandas as pd

In [23]:
phrase = 'My friend was right about this class. It is so fun!'
encoded_phrase = tokenizer(phrase, return_tensors='pt')

In [24]:
encoded_phrase

{'input_ids': tensor([[3666, 1545,  373,  826,  546,  428, 1398,   13,  632,  318,  523, 1257,
            0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [25]:
response = model(**encoded_phrase, output_attentions=True, output_hidden_states=True)

In [26]:
len(response.attentions)

12

In [16]:
# Access the attention tensor of the last decoder
# [1, 12, 13, 13]
# - 1. first dimension represents the batch size of 1 element
# - 2. second dimension indicates that there are 12 heads in that last decoder
# - 3. the 13x13 (third and fourth) dimensions are our tokens.
# We have 13 tokens and they are placed in a square matrix to calculate the attention scores.
response.attentions[-1].shape

torch.Size([1, 12, 13, 13])

In [17]:
encoded_phrase['input_ids'].shape

torch.Size([1, 13])

In [18]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])
tokens

['My',
 'Ġfriend',
 'Ġwas',
 'Ġright',
 'Ġabout',
 'Ġthis',
 'Ġclass',
 '.',
 'ĠIt',
 'Ġis',
 'Ġso',
 'Ġfun',
 '!']

In [19]:
# Layer index 9, head 0.
arr = response.attentions[9][0][0]

n_digits = 3

attention_df = pd.DataFrame((torch.round(arr * 10**n_digits) / (10**n_digits)).detach()).map(float)
attention_df.columns = tokens
attention_df.index = tokens

In [20]:
attention_df

Unnamed: 0,My,Ġfriend,Ġwas,Ġright,Ġabout,Ġthis,Ġclass,.,ĠIt,Ġis,Ġso,Ġfun,!
My,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġfriend,0.968,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġwas,0.824,0.145,0.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġright,0.979,0.008,0.007,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġabout,0.979,0.008,0.004,0.005,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġthis,0.924,0.031,0.007,0.006,0.016,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġclass,0.946,0.005,0.001,0.001,0.001,0.002,0.044,0.0,0.0,0.0,0.0,0.0,0.0
.,0.691,0.013,0.003,0.003,0.002,0.006,0.269,0.013,0.0,0.0,0.0,0.0,0.0
ĠIt,0.318,0.003,0.003,0.003,0.006,0.018,0.599,0.018,0.032,0.0,0.0,0.0,0.0
Ġis,0.331,0.006,0.002,0.002,0.003,0.018,0.533,0.013,0.062,0.03,0.0,0.0,0.0


In [21]:
model_view(response.attentions, tokens)

<IPython.core.display.Javascript object>