<a href="https://colab.research.google.com/github/soumyadip1995/BabyGPT/blob/main/Notebook/Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In the BabyGPTmodel, the parametres has been changed to make use of quantization as given by the pytorch model. For this notebook , we currently have about 3.22M parametres.

In [39]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from dataclasses import dataclass
from typing import Optional

# hyperparameters

@dataclass
class GPTConfig:
    # these are default GPT-2 hyperparameters
    block_size: int = 1024
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    bias :bool = False
    device: bool =  0
    


### other hyperparametres
batch_size = 16 
max_iters = 5000
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_interval = 500
eval_iters = 200
dropout = 0.2


torch.manual_seed(1337)


words = open(r"/content/ALL_eminem.txt", 'r', encoding='utf-8').read()


chars = sorted(list(set(words)))
vocab_size = len(chars)


string2integer = {ch: i for i, ch in enumerate(chars)}
integer2string = {i:ch for ch,i in string2integer.items()}
encode = lambda s: [string2integer[c] for c in s]
decode = lambda l: ''.join([integer2string[i] for i in l])
data = torch.tensor(encode(words), dtype = torch.long)


## train and split the data
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - config.block_size, (batch_size,))
    x = torch.stack([data[i:i+ config.block_size] for i in ix])
    y = torch.stack([data[i+1:i+ config.block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


### from pytorch GPT tutorial
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


torch.manual_seed(1337)
class Attention(nn.Module):
  def __init__(self, config):
    super(Attention, self).__init__()

    assert config.n_embd % config.n_head == 0

    self.atten = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
    self.projection = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    self.register_buffer('tril', torch.tril(torch.ones(config.block_size, config.block_size)))

  def forward(self, x):
    B,T,C = x.size()
    q, k ,v  = self.atten(x).split(self.n_embd, dim=2)
    q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
    k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
    v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)


    # manual implementation of attention
    # from karpathy
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    att = F.softmax(att, dim=-1)
    y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    # output projection
    y = self.projection(y)
    return y

dropout = 0.2
class FeedForward(nn.Module):
  def __init__(self,config):
    super(FeedForward, self).__init__()
    self.net = nn.Sequential(nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias),
    nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias),
    nn.GELU(),
    nn.Dropout(dropout))

  def forward(self, x):
    return self.net(x)

### A simple Transformer Block    
class Transformer(nn.Module):
  def __init__(self,config):
    super(Transformer, self).__init__()
    self.attention = Attention(config)
    self.feed_forward = FeedForward(config)
    self.layer_norm_1 = nn.LayerNorm(config.n_embd)
    self.layer_norm_2 = nn.LayerNorm(config.n_embd)

  def forward(self, x):
    
    x = x + self.attention(self.layer_norm_1(x))
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x


class BabyGPTmodel(nn.Module):

    def __init__(self, config):
        super(BabyGPTmodel, self).__init__()

        assert config.vocab_size is not None
        assert config.block_size is not None

        self.config = config
        self.token = nn.Embedding(config.vocab_size, config.n_embd)
        self.positional_embeddings = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(*[Transformer(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps = 1e-12) # final layer norm
        self.lnum_heads = nn.Linear(config.n_embd, config.vocab_size)

        ## init all weights
        ## from karpathy
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
          if pn.endswith('projection.weight'):
            torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))

    def forward(self, idx, targets=None):
        device = idx.device
        B, T = idx.shape
        tok_emb = self.token(idx)
        position_ids = torch.arange(0, T, dtype = torch.long, device = device).unsqueeze(0)
        pos_emb =  self.positional_embeddings(position_ids)
        x = tok_emb + pos_emb
        for block in self.blocks:
          x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lnum_heads(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

        
    ## from karpathy's youtube videos.
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -config.block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


config = GPTConfig(
    block_size = 64,
    vocab_size = len(chars),
    n_head = 4,
    n_layer = 4,
    n_embd = 256)

model = BabyGPTmodel(config)

m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])



for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

torch.save(model.state_dict(), '/content/model.pth')


model.load_state_dict(torch.load('/content/model.pth'))
print(model.eval())

number of parameters: 3222637
Model's state_dict:
token.weight 	 torch.Size([109, 256])
positional_embeddings.weight 	 torch.Size([64, 256])
blocks.0.attention.tril 	 torch.Size([64, 64])
blocks.0.attention.atten.weight 	 torch.Size([768, 256])
blocks.0.attention.projection.weight 	 torch.Size([256, 256])
blocks.0.feed_forward.net.0.weight 	 torch.Size([1024, 256])
blocks.0.feed_forward.net.1.weight 	 torch.Size([256, 1024])
blocks.0.layer_norm_1.weight 	 torch.Size([256])
blocks.0.layer_norm_1.bias 	 torch.Size([256])
blocks.0.layer_norm_2.weight 	 torch.Size([256])
blocks.0.layer_norm_2.bias 	 torch.Size([256])
blocks.1.attention.tril 	 torch.Size([64, 64])
blocks.1.attention.atten.weight 	 torch.Size([768, 256])
blocks.1.attention.projection.weight 	 torch.Size([256, 256])
blocks.1.feed_forward.net.0.weight 	 torch.Size([1024, 256])
blocks.1.feed_forward.net.1.weight 	 torch.Size([256, 1024])
blocks.1.layer_norm_1.weight 	 torch.Size([256])
blocks.1.layer_norm_1.bias 	 torch.Size([2

In [None]:
!pip install transformers

In [40]:
import torch.quantization
import torch

import pandas 

quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
quantized_model.eval()



BabyGPTmodel(
  (token): Embedding(109, 256)
  (positional_embeddings): Embedding(64, 256)
  (blocks): Sequential(
    (0): Transformer(
      (attention): Attention(
        (atten): DynamicQuantizedLinear(in_features=256, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
        (projection): DynamicQuantizedLinear(in_features=256, out_features=256, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
      )
      (feed_forward): FeedForward(
        (net): Sequential(
          (0): DynamicQuantizedLinear(in_features=256, out_features=1024, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
          (1): DynamicQuantizedLinear(in_features=1024, out_features=256, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
          (2): GELU(approximate='none')
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (layer_norm_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (layer_norm_2): LayerNorm((256,), eps=1e-05, elementwise_affine=Tru

In [None]:


PATH = '/content/model.pth'
state = {'model': model.state_dict()}
torch.save(state, PATH)
model.load_state_dict(torch.load(PATH)['model'])
# print weights
for k, v in model.named_parameters():

  weights = k, v
  print(weights)

torch.save(model.state_dict(), '/content/model.weight')


In [41]:
import os
import torch

def print_model_size(model):
    torch.save(model.state_dict(), "/content/model.pth")
    print("%.4f MB" %(os.path.getsize("/content/model.pth")/1e6))
    

print_model_size(model)
print_model_size(quantized_model)

12.9688 MB
3.4608 MB


In [42]:
model_fp32 = BabyGPTmodel(config)
model_fp32.eval()
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,  # the original model
    {torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.qint8)  

def print_model_size(model):
    torch.save(model.state_dict(), "/content/model.pth")
    print("%.4f MB" %(os.path.getsize("/content/model.pth")/1e6))
    

print_model_size(model)
print_model_size(model_int8)

number of parameters: 3222637
12.9688 MB
3.4603 MB


In [43]:
## A difference of a factor of 3.75 post quantization.

c = 13/3.46
c

3.7572254335260116