In [1]:
# we take a pretrainied GPT2 model and fine tune on Gullivers travels text

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

from transformers import AutoModelForCausalLM, GPT2Tokenizer
import requests



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
gpt2  = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Loading weights: 100%|█| 148/148 [00:00<00:00, 1786.50it/s, Materializing param=
[1mGPT2LMHeadModel LOAD REPORT[0m from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [3]:
seq_len = 256
batch_size = 16

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [4]:
# tokenize the text
text = requests.get('https://www.gutenberg.org/cache/epub/829/pg289.txt').text

# the old way
gtTokens = torch.tensor(tokenizer.encode(text), dtype = torch.long)  # the output of .encode is Py list(), we then covnert to tensor
print(gtTokens.shape)

# a better way
gtTokens = tokenizer.encode(text, return_tensors='pt') # Now this outputs a Pytorch tensor ('pt')
# but this has a shape (singleton dimension)
print(gtTokens.shape)

# but rest of code ois setup for dimensionless tensors
gtTokens =gtTokens[0]
print(gtTokens.shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (2809 > 1024). Running this sequence through the model will result in indexing errors


torch.Size([2809])
torch.Size([1, 2809])
torch.Size([2809])


In [15]:
# most freq 100 tokens
uniq, counts = np.unique(gtTokens, return_counts=True)
freqidx = np.argsort(counts)[::-1]
top100 = uniq[freqidx[:100]]

for t in top100:
    print(f'Token {t:5} appears {torch.sum(gtTokens==t)} times and is "{tokenizer.decode(t)}"')

Token   220 appears 730 times and is " "
Token   198 appears 156 times and is "
"
Token  2625 appears 124 times and is "=""
Token     1 appears 99 times and is """
Token    12 appears 78 times and is "-"
Token  1279 appears 75 times and is " <"
Token    13 appears 67 times and is "."
Token    29 appears 63 times and is ">"
Token    64 appears 49 times and is "a"
Token  5320 appears 39 times and is "">"
Token    14 appears 35 times and is "/"
Token  1875 appears 32 times and is " >"
Token  1398 appears 31 times and is " class"
Token  3556 appears 27 times and is "</"
Token 35922 appears 27 times and is "="/"
Token  7359 appears 26 times and is " </"
Token 13291 appears 26 times and is " href"
Token  7146 appears 22 times and is "div"
Token    62 appears 18 times and is "_"
Token    20 appears 16 times and is "5"
Token    70 appears 14 times and is "g"
Token 28961 appears 13 times and is "meta"
Token    11 appears 13 times and is ","
Token  2695 appears 13 times and is " content"
Token 1

In [16]:
gpt2 = gpt2.to(device)

In [20]:
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [22]:
prompt = 'I cannot believe that'
in2gpt = tokenizer.encode(prompt, return_tensors = 'pt').to(device)

output = gpt2.generate(in2gpt, max_length=100, pad_token_id=50256,do_sample=True).cpu()
print(tokenizer.decode(output[0]))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


I cannot believe that the State Department should feel threatened with these claims. They have made so many promises. I have not been able to find any records and I have nothing to prove to them of any such information. My concern is that they're going to force me to make public these conversations to the point where they can get a fair trial, I have no money for lawyers to help me and I don't want to see me killed. The last thing I want is if the President is going to


Quantify the freq of GT freq tokens in GPT's output

In [27]:
numreps = 10 # num of random repetitions
numtoks = 100 # oputput length

# we need model to generate 1000 toks, but the output should be meaningul
# if we ask to generate 1000 toks in a single seq, the model goes awkard
# so idea is to have more short seq than fewer long seq

# random starting tokens
randstarts = torch.randint(tokenizer.vocab_size, (numreps,1)).to(device) # this creates a 10x1 matrix [10 btaches of one single starting token]

out = gpt2.generate(
    randstarts,
    max_length  = numtoks+1, #the first token is the row start in randstarts, so you need 100+1 total toks in output of generate()
    min_length = numtoks+1, # guarantee that model should generarte exact;y 100 toks
    do_sample = True,
    bad_words_ids = [tokenizer.encode(tokenizer.eos_token)],
    pad_token_id = tokenizer.encode(tokenizer.eos_token)[0]).cpu()

print(out,'\n')

for o in out:
    print('\n*** Next batch ofg outoput')
    print(tokenizer.decode(o))

tensor([[18987,   508,   389,  ...,   383,  1812,  2732],
        [43375,   262,  1438,  ...,   418,   198,   198],
        [22983,   345,  1183,  ...,   422,   326,  1295],
        ...,
        [ 8515,   284,   383,  ..., 18708,  1201,   262],
        [27956,    13,   405,  ...,    13,  9746,    15],
        [17620,   287,  3012,  ...,  1720,  4433, 21771]]) 


*** Next batch ofg outoput
 Users who are willing to provide additional information on the issue: We are confident we'll find out right away.

The Department of Homeland Security, Office of the Inspector General for Terrorist Threats, and the Inspector General for Immigration, Refugees, and Citizenship, have already done their best to investigate these types of incidents. They're well-versed in the law, understand how incidents can rise quickly and are dedicated to responding to the highest standards of safety, security, and security. The State Department

*** Next batch ofg outoput
unsigned the name of the server. For example,

In [28]:
randstarts

tensor([[18987],
        [43375],
        [22983],
        [21365],
        [33019],
        [32545],
        [46765],
        [ 8515],
        [27956],
        [17620]], device='mps:0')

In [29]:
# calcualte and report the percentage
percentFreqTokens_pre = np.mean(100*np.isin(out[:,1:],top100).flatten())
print(f"Gullivers travels common tokens appeared in {percentFreqTokens_pre}% of new tokens")

Gullivers travels common tokens appeared in 26.3% of new tokens


Fine tune the model (to imporve the above percent, i.e to include more GT text in model outout')

In [30]:
optimizer = torch.optim.AdamW(gpt2.parameters(), lr=5e-5, weight_decay=.01) # here the learning rate is really small

#NOTE: IMP dont need loss func here, bcoz HF models calcualte loss fun internally 


In [40]:
num_samples = 1234

#init the loss
train_loss = np.zeros(num_samples)


for sampli in range(num_samples):
    ix = torch.randint(len(gtTokens)-seq_len,size=(batch_size,))
    X = gtTokens[ix[:,None]+ torch.arange(seq_len)]
    X = X.to(device)
    gpt2.zero_grad()
    # fwd pass (HF shifts X internally to get y)
    # all of X is shifted by 1 and used as labels for loss calcualtion
    # also inside model, H.F makes sure it uses NLLLoss as loss func
    # fwd pass
    outputs= gpt2(X,labels=X)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    
    # sum the batch loss
    train_loss[sampli] = loss.item()
    if sampli%77==0:
        print(f'Sample {sampli}/{num_samples}, train loss: {train_loss[sampli]}')

Sample 0/1234, train loss: 2.242738962173462
Sample 77/1234, train loss: 0.09334482252597809
Sample 154/1234, train loss: 0.04268364980816841
Sample 231/1234, train loss: 0.03527035936713219
Sample 308/1234, train loss: 0.030546387657523155
Sample 385/1234, train loss: 0.03709830343723297
Sample 462/1234, train loss: 0.028301220387220383
Sample 539/1234, train loss: 0.024902261793613434
Sample 616/1234, train loss: 0.029406629502773285
Sample 693/1234, train loss: 0.027280081063508987
Sample 770/1234, train loss: 0.0290165226906538
Sample 847/1234, train loss: 0.02049478515982628
Sample 924/1234, train loss: 0.022254755720496178
Sample 1001/1234, train loss: 0.021956220269203186
Sample 1078/1234, train loss: 0.0224310215562582
Sample 1155/1234, train loss: 0.026207711547613144
Sample 1232/1234, train loss: 0.016983982175588608


In [43]:
prompt = 'I cannot believe that'
in2gpt = tokenizer.encode(prompt, return_tensors = 'pt').to(device)

output = gpt2.generate(in2gpt, max_length=100, pad_token_id=50256,do_sample=True).cpu()
print(tokenizer.decode(output[0]))

I cannot believe that a website would think this."

"Maybe you think so. Don't you think? Don't you think that the website you are trying to reach has some kind of broken link you think may have occurred to you. Go to <a href="/">www.gutenberg.org</a> to see whether the error persists.</p>

<p>If you think something is broken, follow the âContact Informationâ�


In [42]:
# calcualte and report the percentage
percentFreqTokens_pst = np.mean(100*np.isin(out[:,1:],top100).flatten())
print(f"Common GT tokens usage went from {percentFreqTokens_pre:.2f}% to {percentFreqTokens_pst:.2f}% after fine tuning")

Common GT tokens usage went from 26.30% to 26.30% after fine tuning
