In [1]:
# this uses GPT-neo and is finetuned on 2 different styles
# alice in wonderland and edgar alan poe style


In [2]:
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.nn.functional as F

import textwrap

from transformers import AutoModelForCausalLM, AutoTokenizer
from torchinfo import summary
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Eletuther's tokenizer
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer.pad_token_id = tokenizer.encode(' ')[0]

# load in 2 GPTneos and push to GPU
modelAlice = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')
modelEdgar = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
modelAlice = modelAlice.to(device)
modelEdgar = modelEdgar.to(device)

Loading weights: 100%|█| 160/160 [00:00<00:00, 1711.82it/s, Materializing param=
[1mGPTNeoForCausalLM LOAD REPORT[0m from: EleutherAI/gpt-neo-125m
Key                                                   | Status     |  | 
------------------------------------------------------+------------+--+-
transformer.h.{0, 2, 4, 6, 8, 10}.attn.attention.bias | UNEXPECTED |  | 
transformer.h.{0...11}.attn.attention.masked_bias     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Loading weights: 100%|█| 160/160 [00:00<00:00, 1159.89it/s, Materializing param=
[1mGPTNeoForCausalLM LOAD REPORT[0m from: EleutherAI/gpt-neo-125m
Key                                                   | Status     |  | 
------------------------------------------------------+------------+--+-
transformer.h.{0, 2, 4, 6, 8, 10}.attn.attention.bias | UNEXPECTED |  | 
transformer.h.{0...11}.attn.attention.masked_bias     | UN

inspect the model

In [6]:
modelAlice
# here the q,k,v matrix is seperate in attenion unlike OpenAI GPT2 ewhere it was all 1 big matrix


GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [7]:
# accessign a particular weights matrix
modelAlice.transformer.h[3].attn.attention.k_proj.weight.shape

torch.Size([768, 768])

In [10]:
modelAlice.lm_head.weight.shape

torch.Size([50257, 768])

In [11]:
# model summar y
x = tokenizer.encode('What did the Red Queen say to Alice?', return_tensors='pt').to(device)
summary(modelAlice, input_data=x, col_names=['input_size', 'output_size', 'num_params'])

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #
GPTNeoForCausalLM                                       [1, 9]                    --                        --
├─GPTNeoModel: 1-1                                      [1, 9]                    --                        --
│    └─Embedding: 2-1                                   [1, 9]                    [1, 9, 768]               38,597,376
│    └─Embedding: 2-2                                   [1, 9]                    [1, 9, 768]               1,572,864
│    └─Dropout: 2-3                                     [1, 9, 768]               [1, 9, 768]               --
│    └─ModuleList: 2-4                                  --                        --                        --
│    │    └─GPTNeoBlock: 3-1                            [1, 9, 768]               [1, 9, 768]               7,085,568
│    │    └─GPTNeoBlock: 3-2                            [1, 9, 768]               [1,

In [12]:
# are the embedds and unembedds tied?
print('** Embedding:\n', modelAlice.transformer.wte.weight.detach())
print('\n\n ** Unmbedding:\n', modelAlice.lm_head.weight.detach())

** Embedding:
 tensor([[ 0.1709, -0.7383,  0.4277,  ...,  0.0840,  0.5820, -0.3457],
        [ 0.2070, -0.6055,  0.4590,  ...,  0.1562,  0.4883, -0.2363],
        [ 0.2324, -0.6367,  0.3262,  ...,  0.2236,  0.7500, -0.2354],
        ...,
        [ 0.7734, -1.1406,  0.6523,  ...,  0.2832,  0.9258, -0.5547],
        [ 0.3906, -0.8438,  0.5117,  ...,  0.0148,  0.6992, -0.2383],
        [ 0.2734, -0.7148,  0.2949,  ...,  0.1748,  0.4043, -0.3105]],
       device='mps:0')


 ** Unmbedding:
 tensor([[ 0.1709, -0.7383,  0.4277,  ...,  0.0840,  0.5820, -0.3457],
        [ 0.2070, -0.6055,  0.4590,  ...,  0.1562,  0.4883, -0.2363],
        [ 0.2324, -0.6367,  0.3262,  ...,  0.2236,  0.7500, -0.2354],
        ...,
        [ 0.7734, -1.1406,  0.6523,  ...,  0.2832,  0.9258, -0.5547],
        [ 0.3906, -0.8438,  0.5117,  ...,  0.0148,  0.6992, -0.2383],
        [ 0.2734, -0.7148,  0.2949,  ...,  0.1748,  0.4043, -0.3105]],
       device='mps:0')


Explore the tokenizer

In [13]:
# a bit about their tokenizer
print(f'Tokenizer has {tokenizer.vocab_size:,} tokens. \nA few radnom tokens:\n')
for i in range(30):
    randtok = torch.randint(tokenizer.vocab_size,(1,))
    print(f'Token {randtok[0]:5} is "{tokenizer.decode(randtok)}"')

Tokenizer has 50,257 tokens. 
A few radnom tokens:

Token 31680 is " caster"
Token  8715 is "dated"
Token 16206 is "oven"
Token  5710 is " dropped"
Token 42713 is " Wonderland"
Token 41404 is " veterinarian"
Token 40373 is "ISSION"
Token 38888 is "voice"
Token 34313 is " politely"
Token  6091 is " Haw"
Token 33011 is "olla"
Token  7304 is "pload"
Token  5766 is " factor"
Token 18636 is "ipes"
Token 23916 is " cans"
Token 13623 is " rats"
Token 13054 is " barrier"
Token 14461 is " marginal"
Token 34357 is " craving"
Token 13689 is "Earlier"
Token 30471 is " Subaru"
Token 46668 is " ti"
Token  9391 is " ingredients"
Token   404 is "op"
Token 30660 is " Ogre"
Token  9332 is " efficiency"
Token 15381 is " shifts"
Token 22501 is "strous"
Token 24146 is " Sniper"
Token 39550 is " attaching"


In [14]:
# this tokenizer is smae as GPT2 tokenizer

import and process texts

In [16]:
# throught the looking glass (aka alice in wonderland)
text = requests.get('https://www.gutenberg.org/cache/epub/11/pg11.txt').text
aliceTokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)

# edgar allan Poe
text = requests.get('https://www.gutenberg.org/cache/epub/2148/pg2148.txt').text
edgarTokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)

Token indices sequence length is longer than the specified maximum sequence length for this model (52954 > 2048). Running this sequence through the model will result in indexing errors


In [19]:
print(f'Allice in wonderland has {len(aliceTokens):,} tokens.')
print(f'Edgar Alan Poe has {len(edgarTokens):,} tokens.')

Allice in wonderland has 52,954 tokens.
Edgar Alan Poe has 197,306 tokens.


Prepare for fine tuning

In [21]:
# ALICE optmizer
optimizerAlice = torch.optim.AdamW(modelAlice.parameters(), lr=5e-5, weight_decay=.01)
optimizerEdgar = torch.optim.AdamW(modelEdgar.parameters(), lr=5e-5, weight_decay=.01)

In [22]:
seq_len = 256
batch_size = 16
num_samples = 476


fine tune model

In [26]:
tokenProbs = np.zeros((num_samples,3))

lossAlice = np.zeros(num_samples)
lossEdgar = np.zeros(num_samples)

for sampli in range(num_samples):
    # init batch losses to accumulate

    # ALICE fine tuning
    # get a batch of data
    ix = torch.randint(len(aliceTokens)-seq_len, size = (batch_size,))
    X = aliceTokens[ix[:,None] + torch.arange(seq_len)].to(device)

    #fwd pass and get loss
    modelAlice.zero_grad()
    outputs = modelAlice(X, labels=X)

    # backprop and store loss
    outputs.loss.backward()
    optimizerAlice.step()
    lossAlice[sampli] = outputs.loss.item()


    #EDGAR fine tuning
    ix = torch.randint(len(edgarTokens)-seq_len, size = (batch_size,))
    X = edgarTokens[ix[:,None] + torch.arange(seq_len)].to(device)

    #fwd pass and get loss
    modelEdgar.zero_grad()
    outputs = modelEdgar(X, labels=X)

    # backprop and store loss
    outputs.loss.backward()
    optimizerAlice.step()
    lossEdgar[sampli] = outputs.loss.item()

    if sampli%25==0:
        print(f'Sample: {sampli}/{num_samples}, losses (Alice/eDgar): {lossAlice[sampli]} / {lossEdgar[sampli]}')

Sample: 0/476, losses (Alice/eDgar): 2.42812442779541 / 2.6880459785461426
Sample: 25/476, losses (Alice/eDgar): 1.8133280277252197 / 2.61869740486145
Sample: 50/476, losses (Alice/eDgar): 1.6113585233688354 / 2.7559566497802734
Sample: 75/476, losses (Alice/eDgar): 1.3369135856628418 / 2.6015217304229736
Sample: 100/476, losses (Alice/eDgar): 1.0890058279037476 / 2.6897029876708984
Sample: 125/476, losses (Alice/eDgar): 0.7399564385414124 / 2.6269607543945312
Sample: 150/476, losses (Alice/eDgar): 0.6405403017997742 / 2.6760103702545166
Sample: 175/476, losses (Alice/eDgar): 0.5025162100791931 / 2.5033085346221924
Sample: 200/476, losses (Alice/eDgar): 0.34113582968711853 / 2.642308473587036
Sample: 225/476, losses (Alice/eDgar): 0.22491006553173065 / 2.7783093452453613
Sample: 250/476, losses (Alice/eDgar): 0.24024367332458496 / 2.6661019325256348
Sample: 275/476, losses (Alice/eDgar): 0.19655752182006836 / 2.6313257217407227
Sample: 300/476, losses (Alice/eDgar): 0.17590542137622833

In [27]:
# alice book had better learning than Edgar
# bcoz Alice is from 1 single book, edgar is from colelction of his poems
# alice book is more homogenous in terms of writing style and content


Qualitative assessment

In [28]:
# model summar y
x = tokenizer.encode('What did the Red Queen say to Alice?', return_tensors='pt').to(device)

outAlice = modelAlice.generate(x,max_new_tokens=120, do_sample=True, pad_token_id=50257)
outEdgar = modelEdgar.generate(x,max_new_tokens=120, do_sample=True, pad_token_id=50257)


#print both models outputs
print('** Alice model says:')
print(tokenizer.decode(outAlice[0].cpu()))

print('\n\n** Edgar model says:')
print(tokenizer.decode(outEdgar[0].cpu()))

** Alice model says:
What did the Red Queen say to Alice?”

“Yes, she said ‘Will you be _very_ pleased? She is such a dear
quiet thing,’” said Alice, rubbing her eyes, and asking herself how
anybody would have understood the young lady thought she was
replicating her thoughts.

“She is an only child,” the Duchess explained.

“And because of her,” said the Duchess, “it _must_ have been for
her having been denied the opportunity of


** Edgar model says:
What did the Red Queen say to Alice?

The Red Queen’s letter begins by asking Alice to read, and she asks Alice, in her most direct and obvious way to have “this great day”, to read it; and it ends by asking Alice to get it out of there while Alice continues.
HENRY, WENDY, BLESSING, and BEER. The two men meet in a field, each wearing something blue around his head.

A. The two men first meet Alice’s brother, who tells them they can’t read anything they’re
