In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Loading weights: 100%|â–ˆ| 148/148 [00:00<00:00, 1886.36it/s, Materializing param=
[1mGPT2LMHeadModel LOAD REPORT[0m from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Pad token

In [18]:
# GPT2 doesnot have pad token
print('Current pad token:', tokenizer.pad_token)
print('Current eos token:', tokenizer.eos_token)

# common (thought not necessarly ideal) to set pad_token = eos_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

print('Current pad token:', tokenizer.pad_token)

Current pad token: None
Current eos token: <|endoftext|>
Current pad token: <|endoftext|>


Encoding

In [21]:
sentences = [
    'Every happy family is the same',
    'every unhappy family is unhappy in its own way.',
    'Why did the chicken cross the road?']
# these sentences have diff no of tokens

# tokenize the padding and attention mask
toks = tokenizer(sentences,
                 return_tensors='pt',
                 padding=True) #this is important
toks
#50256 padding tokens (only because we tied it to EOS token 50256 is EOS token actually

# also look into the attention masks for 1st and 2nd sentence where it was padded

{'input_ids': tensor([[ 6109,  3772,  1641,   318,   262,   976, 50256, 50256, 50256, 50256],
        [16833, 19283,  1641,   318, 19283,   287,   663,   898,   835,    13],
        [ 5195,   750,   262,  9015,  3272,   262,  2975,    30, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}

In [22]:
tokenizer.decode([50256])

'<|endoftext|>'

In [23]:
for i in range(len(sentences)):
    inputs = toks['input_ids'][i]
    mask = toks['attention_mask'][i]
    print(f'"{sentences[i]}":')
    print(inputs,'\n',mask,'\n')

"Every happy family is the same":
tensor([ 6109,  3772,  1641,   318,   262,   976, 50256, 50256, 50256, 50256]) 
 tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0]) 

"every unhappy family is unhappy in its own way.":
tensor([16833, 19283,  1641,   318, 19283,   287,   663,   898,   835,    13]) 
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) 

"Why did the chicken cross the road?":
tensor([ 5195,   750,   262,  9015,  3272,   262,  2975,    30, 50256, 50256]) 
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0]) 



generate some text

In [51]:
outputs = model.generate(
    input_ids = toks['input_ids'],
    attention_mask = toks['attention_mask'],
    pad_token_id = tokenizer.pad_token_id,
    max_length = 66,
    num_return_sequences=1,
    do_sample =True,
    top_k = 50,
    top_p = .95)
outputs

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


tensor([[ 6109,  3772,  1641,   318,   262,   976, 50256, 50256, 50256, 50256,
           198,   198,    40,   561,   910,   345,   821,   826,   986,   314,
          1101,  7926,   986,   475,   340,   338,   691,   262,   835,   356,
          2107,    13,   314,  1101,  7926,    13,   314,  1101,  7926,    13,
           314,  1053,   587,  7926,   329,  1115,   812,    13,   314,  1053,
           587,  7926,   329,  1115,  7028,    13,   843,   314,  1101,   655,
          7926,   546,   340,    11,   290,   314],
        [16833, 19283,  1641,   318, 19283,   287,   663,   898,   835,    13,
           632,   318,   477,   262,   976,   284,   606,    13,  1081,   890,
           355,   262,  1641,   389, 19283,    11,   262, 19283,  1641,   318,
           691, 19283,   611,   340,   318, 19283,   287,   663,   898,   835,
            13,  1649,   484,   765,   617,   584,  1611,   286, 12157,    11,
           484,   910,    11,   366,  2949,    11,   286,  1781,   407,   526,


In [44]:
tokenizer.pad_token_id

50256

In [45]:
outputs.shape

torch.Size([3, 66])

In [37]:
s = 'Every happy family is the same'
t = tokenizer(s,return_tensors='pt')
print(t)
o = model.generate(
    input_ids = t['input_ids'],
    attention_mask = t['attention_mask'],
    pad_token_id = tokenizer.pad_token_id,
    max_length = 7,
    num_return_sequences=2,
    do_sample =True,
    top_k = 50,
    top_p = .95)
print('Out:',o)

{'input_ids': tensor([[6109, 3772, 1641,  318,  262,  976]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
Out: tensor([[6109, 3772, 1641,  318,  262,  976,   13],
        [6109, 3772, 1641,  318,  262,  976,  355]])


decode

In [52]:
decoded_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for i , text in enumerate(decoded_texts):
    print(f'** Text {i+1}: \n{text}\n')

** Text 1: 
Every happy family is the same

I would say you're right... I'm sorry... but it's only the way we live. I'm sorry. I'm sorry. I've been sorry for three years. I've been sorry for three seasons. And I'm just sorry about it, and I

** Text 2: 
every unhappy family is unhappy in its own way. It is all the same to them. As long as the family are unhappy, the unhappy family is only unhappy if it is unhappy in its own way. When they want some other kind of happiness, they say, "No, of course not." The happiest family is happiest if

** Text 3: 
Why did the chicken cross the road?There is an explanation: a large number of dogs cross the highway. It is the most common cross-breed in southern Russia. In fact, in February of this year, a video appeared showing the dog crossing a bridge from where a bridge was supposed to be built, and



In [55]:
# or simplerL
os = model.generate(toks['input_ids'])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [56]:
os.shape

torch.Size([3, 30])

In [57]:
os

tensor([[ 6109,  3772,  1641,   318,   262,   976, 50256, 50256, 50256, 50256,
           464,   471,    13,    50,    13,  2732,   286,  4796,   468,  4884,
           257, 33262,   284,   262,  1664,   326, 12216,   262,  2351,  4765],
        [16833, 19283,  1641,   318, 19283,   287,   663,   898,   835,    13,
           198,   198,     1,    40,   892,   340,   338,   257,   845,  1593,
          2071,   329,   262,  1499,    13,   632,   338,   257,   845,  1593],
        [ 5195,   750,   262,  9015,  3272,   262,  2975,    30, 50256, 50256,
           464,   471,  1546,    47, 32603,   784,  3406,  2723,   329,   383,
         15624, 28859,  1201,  8735,   198,   198,  1212,  2708,   318,   546]])