In [1]:
# here we have 2 neo models...one we freeze and one we dont

In [3]:
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.nn.functional as F

import textwrap

from transformers import AutoModelForCausalLM, AutoTokenizer
from torchinfo import summary
import requests
import time

In [4]:
# Eletuther's tokenizer
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer.pad_token_id = tokenizer.encode(' ')[0]

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

modelFreeze = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)
modelTrain = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading weights: 100%|█| 160/160 [00:00<00:00, 1238.05it/s, Materializing param=
[1mGPTNeoForCausalLM LOAD REPORT[0m from: EleutherAI/gpt-neo-125m
Key                                                   | Status     |  | 
------------------------------------------------------+------------+--+-
transformer.h.{0...11}.attn.attention.masked_bias     | UNEXPECTED |  | 
transformer.h.{0, 2, 4, 6, 8, 10}.attn.attention.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Loading weights: 100%|█| 160/160 [00:00<00:00, 1560.12it/s, Materializing param=
[1mGPTNeoForCausalLM LOAD REPORT[0m from: EleutherAI/gpt-neo

Import moby dick book and find the most 100 common tokens

In [5]:
text = requests.get('https://www.gutenberg.org/cache/epub/2701/pg2701.txt').text
tokens = tokenizer.encode(text, return_tensors='pt')[0]
print(f'Moby dick has {len(tokens)} tokens, of which {len(torch.unique(tokens)):,} are unique')

Token indices sequence length is longer than the specified maximum sequence length for this model (354293 > 2048). Running this sequence through the model will result in indexing errors


Moby dick has 354293 tokens, of which 17,259 are unique


In [6]:
# most freq 100 tokens
uniq, counts = np.unique(tokens, return_counts=True)
freqidx = np.argsort(counts)[::-1]
top100 = uniq[freqidx[:100]]

for t in top100:
    print(f'Token {t:5} appears {torch.sum(tokens==t)} times and is "{tokenizer.decode(t)}"')

"oken   201 appears 22310 times and is "
Token   198 appears 22310 times and is "
"
Token    11 appears 19216 times and is ","
Token   262 appears 13157 times and is " the"
Token    13 appears 7901 times and is "."
Token   286 appears 6402 times and is " of"
Token   290 appears 5707 times and is " and"
Token   447 appears 5359 times and is "�"
Token   257 appears 4533 times and is " a"
Token   284 appears 4437 times and is " to"
Token    26 appears 4167 times and is ";"
Token   287 appears 3853 times and is " in"
Token   247 appears 2796 times and is "�"
Token   326 appears 2749 times and is " that"
Token    12 appears 2584 times and is "-"
Token    82 appears 2393 times and is "s"
Token   465 appears 2351 times and is " his"
Token   340 appears 2108 times and is " it"
Token   314 appears 1869 times and is " I"
Token     0 appears 1763 times and is "!"
Token   318 appears 1657 times and is " is"
Token   250 appears 1628 times and is "�"
Token   351 appears 1600 times and is " with"
Tok

In [9]:
numreps = 10 # num of random repetitions
numtoks = 100 # oputput length

tokenUsage = np.zeros((2,2)) # [pre/post, freeze/train]
# random starting tokens
randstarts = torch.randint(tokenizer.vocab_size, (numreps,1)).to(device) # this creates a 10x1 matrix [10 btaches of one single starting token]

# FREEZE: generate and store tokens
outFreeze = modelFreeze.generate(
    randstarts,
    max_length  = numtoks+1, #the first token is the row start in randstarts, so you need 100+1 total toks in output of generate()
    min_length = numtoks+1, # guarantee that model should generarte exact;y 100 toks
    do_sample = True,
    pad_token_id = tokenizer.encode(tokenizer.eos_token)[0]).cpu()
genTokensFreeze = outFreeze[:,1:].reshape(-1)


# TRAIN: same as above
outTrain = modelTrain.generate(
    randstarts,
    max_length  = numtoks+1, #the first token is the row start in randstarts, so you need 100+1 total toks in output of generate()
    min_length = numtoks+1, # guarantee that model should generarte exact;y 100 toks
    do_sample = True,
    pad_token_id = tokenizer.encode(tokenizer.eos_token)[0]).cpu()
genTokensTrain = outTrain[:,1:].reshape(-1)


tokenUsage[0,0] = np.mean(100*np.isin(genTokensFreeze,top100))
tokenUsage[0,1] = np.mean(100*np.isin(genTokensTrain,top100))


In [10]:
tokenUsage

array([[45. , 44.8],
       [ 0. ,  0. ]])

Now do targeted precision freezing

For eg: we do freezing for attention weights(QKV) in layers upto 6 and from 6+ train them

This means we train on only the later layers
|

In [11]:
for name, param in modelFreeze.named_parameters():
    splitstr = name.split('.')
    print(splitstr)

['transformer', 'wte', 'weight']
['transformer', 'wpe', 'weight']
['transformer', 'h', '0', 'ln_1', 'weight']
['transformer', 'h', '0', 'ln_1', 'bias']
['transformer', 'h', '0', 'attn', 'attention', 'k_proj', 'weight']
['transformer', 'h', '0', 'attn', 'attention', 'v_proj', 'weight']
['transformer', 'h', '0', 'attn', 'attention', 'q_proj', 'weight']
['transformer', 'h', '0', 'attn', 'attention', 'out_proj', 'weight']
['transformer', 'h', '0', 'attn', 'attention', 'out_proj', 'bias']
['transformer', 'h', '0', 'ln_2', 'weight']
['transformer', 'h', '0', 'ln_2', 'bias']
['transformer', 'h', '0', 'mlp', 'c_fc', 'weight']
['transformer', 'h', '0', 'mlp', 'c_fc', 'bias']
['transformer', 'h', '0', 'mlp', 'c_proj', 'weight']
['transformer', 'h', '0', 'mlp', 'c_proj', 'bias']
['transformer', 'h', '1', 'ln_1', 'weight']
['transformer', 'h', '1', 'ln_1', 'bias']
['transformer', 'h', '1', 'attn', 'attention', 'k_proj', 'weight']
['transformer', 'h', '1', 'attn', 'attention', 'v_proj', 'weight']
[

In [15]:
# test: idfy QKV weights of layer >5

for name, param in modelFreeze.named_parameters():
    splitstr = name.split('.')
    if (len(splitstr)>5) and (splitstr[3]=='attn'):
        if(int(splitstr[2])>5) and (splitstr[5][0] in 'qvk'):
            print(name)


transformer.h.6.attn.attention.k_proj.weight
transformer.h.6.attn.attention.v_proj.weight
transformer.h.6.attn.attention.q_proj.weight
transformer.h.7.attn.attention.k_proj.weight
transformer.h.7.attn.attention.v_proj.weight
transformer.h.7.attn.attention.q_proj.weight
transformer.h.8.attn.attention.k_proj.weight
transformer.h.8.attn.attention.v_proj.weight
transformer.h.8.attn.attention.q_proj.weight
transformer.h.9.attn.attention.k_proj.weight
transformer.h.9.attn.attention.v_proj.weight
transformer.h.9.attn.attention.q_proj.weight
transformer.h.10.attn.attention.k_proj.weight
transformer.h.10.attn.attention.v_proj.weight
transformer.h.10.attn.attention.q_proj.weight
transformer.h.11.attn.attention.k_proj.weight
transformer.h.11.attn.attention.v_proj.weight
transformer.h.11.attn.attention.q_proj.weight


In [14]:
print('s' in 'qvk')
print('q' in 'qvk')

False
True


In [17]:
for name,param in modelFreeze.named_parameters():
    splitstr = name.split('.')
    if (len(splitstr)>5) and (splitstr[3]=='attn'):
        if(int(splitstr[2])>5) and (splitstr[5][0] in 'qvk'):
            param.requires_grad = True
            print(f'+++ Layer {name} is trainable (.requires_grad = {param.requires_grad}')
    else:
        param.requires_grad = False
        print(f'--- Layer {name} is frozen (.requires_grad = {param.requires_grad}')

--- Layer transformer.wte.weight is frozen (.requires_grad = False
--- Layer transformer.wpe.weight is frozen (.requires_grad = False
--- Layer transformer.h.0.ln_1.weight is frozen (.requires_grad = False
--- Layer transformer.h.0.ln_1.bias is frozen (.requires_grad = False
--- Layer transformer.h.0.ln_2.weight is frozen (.requires_grad = False
--- Layer transformer.h.0.ln_2.bias is frozen (.requires_grad = False
--- Layer transformer.h.0.mlp.c_fc.weight is frozen (.requires_grad = False
--- Layer transformer.h.0.mlp.c_fc.bias is frozen (.requires_grad = False
--- Layer transformer.h.0.mlp.c_proj.weight is frozen (.requires_grad = False
--- Layer transformer.h.0.mlp.c_proj.bias is frozen (.requires_grad = False
--- Layer transformer.h.1.ln_1.weight is frozen (.requires_grad = False
--- Layer transformer.h.1.ln_1.bias is frozen (.requires_grad = False
--- Layer transformer.h.1.ln_2.weight is frozen (.requires_grad = False
--- Layer transformer.h.1.ln_2.bias is frozen (.requires_grad = 

Now train the models

In [22]:
# here we also track learning related changes in a random weight (h[6].attn.k) and also time the training [using delta norm]
# for both frozen and non frozen models

In [20]:

optimizerFreeze = torch.optim.AdamW(modelFreeze.parameters(), lr=.0005)
optimizerTrain = torch.optim.AdamW(modelTrain.parameters(), lr=.0005)

In [21]:
seq_len = 256
batch_size = 16
num_samples = 100

In [23]:
losses = np.zeros((num_samples,2))
delta_norm_em = np.zeros((num_samples,2))

timeTrain = 0
timeFreeze = 0

# grab the initial MLP weights for comparison
prev_emFreeze = modelFreeze.transformer.h[6].attn.attention.k_proj.weight.detach() + 0
prev_emTrain = modelTrain.transformer.h[6].attn.attention.k_proj.weight.detach() + 0


for sampli in range(num_samples):
    ix = torch.randint(len(tokens)-seq_len,size=(batch_size,))
    X = tokens[ix[:,None]+ torch.arange(seq_len)].to(device)

    # FREEZE fine tuining 
    # fwd pass and get loss
    start_time = time.time()
    modelFreeze.zero_grad()
    outputs= modelFreeze(X,labels=X)

    #backrpop and store loss
    outputs.loss.backward()
    optimizerFreeze.step()
    losses[sampli, 0] = outputs.loss.item()
    timeFreeze += time.time() - start_time
    ###c------------


    # TRAIN fine tuining 
    # fwd pass and get loss
    start_time = time.time()
    modelTrain.zero_grad()
    outputs= modelTrain(X,labels=X)

    #backrpop and store loss
    outputs.loss.backward()
    optimizerTrain.step()
    losses[sampli, 1] = outputs.loss.item()
    timeTrain += time.time() - start_time
    ###c------------

    #matrix norm to asses change in MLP layer
    delta_norm_em[sampli,0] = torch.norm(modelFreeze.transformer.h[6].attn.attention.k_proj.weight.detach() - prev_emFreeze)
    prev_emFreeze = modelFreeze.transformer.h[6].attn.attention.k_proj.weight.detach()

    delta_norm_em[sampli,1] = torch.norm(modelTrain.transformer.h[6].attn.attention.k_proj.weight.detach() - prev_emTrain)
    prev_emTrain = modelTrain.transformer.h[6].attn.attention.k_proj.weight.detach() + 0
    
    # sum the batch loss

    if sampli%25==0:
        print(f'Sample {sampli}/{num_samples}, losses (Freeze/Train):: {losses[sampli,0]:.2f} / {losses[sampli,1]:.2f}')

Sample 0/100, losses (Freeze/Train):: 3.77 / 3.77
Sample 25/100, losses (Freeze/Train):: 3.30 / 3.37
Sample 50/100, losses (Freeze/Train):: 3.42 / 3.46
Sample 75/100, losses (Freeze/Train):: 3.27 / 3.00


In [24]:
# 
#plot of losses


![title](../images/losses_freeze_train.png)

In [26]:
#plotting the percent of common100 tokens of moby dick

# this is not expected ---ideally fully trained model would be picking most100

![title](../images/common100Mobydick.png)

In [29]:
# hwo the attention weights changed over training (norm thing)

# plot shows the weights change a lot initially in the training and becomes stable iduringg later parts of training
# graph also shows most of chnages happen in the layers trainnabel

![title](../images/normChange.png)

In [31]:
# plot to show the trainnig times

![title](../images/trainTimes.png)