<a href="https://colab.research.google.com/github/soumyadip1995/BabyGPT/blob/main/Notebook/lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
words = open('/content/ALL_eminem.txt', 'r').read().split()
import torch
from torch import nn

chars = sorted(list(set(words)))
string2integer = {ch: i for i, ch in enumerate(chars)}
print(string2integer)

integer2string = {i:ch for ch,i in string2integer.items()}
encode = lambda s: [string2integer[c] for c in s]
print(encode)

decode = lambda l: ''.join([integer2string[i] for i in l])
print(decode)

data = torch.tensor(encode(words), dtype = torch.long)
print(data)
data.size()

vocab_size = len(chars)

<function <lambda> at 0x7d4b1ba20670>
<function <lambda> at 0x7d4b1ca163b0>
tensor([ 3865,  3286, 21378,  ..., 20148, 15394, 15426])


In [16]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from math import sqrt



torch.manual_seed(1337)
class Attention(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super(Attention, self).__init__()

    self.atten = nn.Linear(embedded_dim, 3 * embedded_dim)
    self.projection = nn.Linear(embedded_dim, embedded_dim)
    self.num_heads = num_heads
    self.embedded_dim = embedded_dim
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))





  def forward(self, x):
    B,T,C = x.size()
    q, k ,v  = self.atten(x).split(self.embedded_dim, dim=2)
    q = q.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
    k = k.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
    v = v.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)



    # manual implementation of attention
    # from karpathy
    att = (q @ k.transpose(-2, -1)) * ((1.0 / math.sqrt(k.size(-1))))
    att = att.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    att = F.softmax(att, dim=-1)
    y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    # output projection
    y = self.projection(y)
    return y

In [17]:
dropout = 0.2
class FeedForward(nn.Module):
  def __init__(self, embedded_dim):
    super(FeedForward, self).__init__()
    self.net = nn.Sequential(nn.Linear(embedded_dim, 4 * embedded_dim),
    nn.Linear(4 * embedded_dim, embedded_dim),
    nn.GELU(),
    nn.Dropout(dropout))

  def forward(self, x):
    return self.net(x)

In [18]:
### A simple Transformer Block
class Transformer(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super(Transformer, self).__init__()
    self.attention = Attention(embedded_dim, num_heads)
    self.feed_forward = FeedForward(embedded_dim)
    self.layer_norm_1 = nn.LayerNorm(embedded_dim)
    self.layer_norm_2 = nn.LayerNorm(embedded_dim)

  def forward(self, x):

    x = x + self.attention(self.layer_norm_1(x))
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x

In [19]:
class BabyGPTmodel(nn.Module):
  def __init__(self, vocab_size, block_size, num_layers, embedded_dim, num_heads):
    super(BabyGPTmodel, self).__init__()
    self.token = nn.Embedding(vocab_size, embedded_dim)
    self.positional_embeddings = nn.Embedding(block_size, embedded_dim)
    self.layers1 = nn.ModuleList([Transformer(embedded_dim, num_heads) for _ in range(num_layers)])
    self.ln_f = nn.LayerNorm(embedded_dim, eps = 1e-12) # final layer
    self.ln_head = nn.Linear(embedded_dim, vocab_size)


    # init all weights
    ## from karpathy
    self.apply(self._init_weights)
    # apply special scaled init to the residual projections, per GPT-2 paper
    for pn, p in self.named_parameters():
      if pn.endswith('projection.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layers))

        # report number of parameters
        print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

  def _init_weights(self, module):
      if isinstance(module, nn.Linear):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
          if module.bias is not None:
              torch.nn.init.zeros_(module.bias)
      elif isinstance(module, nn.Embedding):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx):
    device = idx.device
    b, t = idx.size()
    tok_emb = self.token(idx)
    position_ids = torch.arange(0, t, dtype = torch.long).unsqueeze(0)
    pos_emb = self.positional_embeddings(position_ids)
    x = tok_emb + pos_emb
    for layers1 in self.layers1:
      x = layers1(x)
    x = self.ln_f(x)
    logits = self.ln_head(x[:, -1, :])
    return logits


In [20]:
vocab_size =   len(chars)
block_size = 4
embedded_dim = 16
num_heads = 4
num_layers = 4

gpt = BabyGPTmodel(vocab_size, block_size, num_layers, embedded_dim, num_heads)


number of parameters: 743407
number of parameters: 743407
number of parameters: 743407
number of parameters: 743407


A comparison between BabyGPT and Low rank adaptation. BabyGPT is b/w  743k parametres as it stands . Low rank adaptation improves the parametre efficiency around 361k parametres. (Almost half)

Note: Parametre size is also directly linked to context length.

In [6]:
input_dim = 16
rank = 4
W_A = nn.Parameter(torch.empty(input_dim, rank))
W_A.shape

torch.Size([16, 4])

In [21]:
import torch
import torch.nn as nn

class LowRankAttention(nn.Module):
    def __init__(self, dim, rank):
        super(LowRankAttention, self).__init__()
        self.rank = rank
        self.Wq = nn.Linear(dim, rank, bias=False)
        self.Wk = nn.Linear(dim, rank, bias=False)
        self.Wv = nn.Linear(dim, rank, bias=False)
        self.Wo = nn.Linear(rank, dim, bias=False)

    def forward(self, q, k, v):
        Q = self.Wq(q)
        K = self.Wk(k)
        V = self.Wv(v)

        # Compute the attention scores using low-rank approximation
        A = torch.bmm(Q, K.transpose(-2, -1)) / (self.rank ** 0.5)

        # Softmax along the key dimension
        A = torch.softmax(A, dim=-1)

        # Compute the attention-weighted values using low-rank approximation
        AV = torch.bmm(A, V)

        # Apply the output layer to the attention-weighted values
        out = self.Wo(AV)

        return out

class LowRankTransformerLayer(nn.Module):
    def __init__(self, dim, rank, dropout=0.2):
        super(LowRankTransformerLayer, self).__init__()
        self.attention = LowRankAttention(dim, rank)
        self.norm1 = nn.LayerNorm(dim)
        self.dropout1 = nn.Dropout(dropout)
        self.feedforward = nn.Sequential(
            nn.Linear(dim, dim * 3),
            nn.GELU(),
            nn.Linear(dim * 3, dim)
        )
        self.norm2 = nn.LayerNorm(dim)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # Compute the self-attention layer
        attention_out = self.attention(x, x, x)

        # Add residual connection and normalize
        x = self.norm1(x + self.dropout1(attention_out))

        # Feed-forward layer
        ff_out = self.feedforward(x)

        # Add residual connection and normalize
        x = self.norm2(x + self.dropout2(ff_out))

        return x

class LowRankTransformer(nn.Module):
    def __init__(self, vocab_size, num_layers, dim, rank, num_heads, dropout= 0.2):
        super(LowRankTransformer, self).__init__()
        self.layers = nn.ModuleList([LowRankTransformerLayer(dim, rank, dropout) for _ in range(num_layers)])
        self.num_layers = num_layers
        self.dim = dim
        self.rank = rank
        self.num_heads = num_heads
        self.pos_embedding = nn.Embedding(vocab_size, dim)
        self.dropout = nn.Dropout(dropout)




        # init all weights
        ## from karpathy
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
          if pn.endswith('Wo.weight'):
            torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layers))

            # report number of parameters
            print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

    def _init_weights(self, module):
      if isinstance(module, nn.Linear):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if module.bias is not None:
          torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x):
        # Add positional embeddings
        x = x + self.pos_embedding[:, :x.size(1)]

        # Apply dropout
        x = self.dropout(x)

        # Apply the transformer layers
        for layer in self.layers:
            x = layer(x)

        return x


In [24]:
# words = open(r"/content/ALL_eminem.txt", 'r', encoding='utf-8').read().split()

# chars = sorted(list(set(words)))
# vocab_size = len(chars)
lrt = LowRankTransformer(vocab_size, 4, 16, 4, 4)

number of parameters: 361712
number of parameters: 361712
number of parameters: 361712
number of parameters: 361712


### scaling laws

We check for the FLOPs for BabyGPT model

In [10]:
words = open('/content/ALL_eminem.txt', 'r').read().split()
import torch
from torch import nn

chars = sorted(list(set(words)))
string2integer = {ch: i for i, ch in enumerate(chars)}
print(string2integer)

integer2string = {i:ch for ch,i in string2integer.items()}
encode = lambda s: [string2integer[c] for c in s]
print(encode)

decode = lambda l: ''.join([integer2string[i] for i in l])
print(decode)

data = torch.tensor(encode(words), dtype = torch.long)
print(data)
data.size()

<function <lambda> at 0x7d4b1bb10280>
<function <lambda> at 0x7d4b1bb10430>
tensor([ 3865,  3286, 21378,  ..., 20148, 15394, 15426])


torch.Size([180194])

In [23]:
# block_size = 32
# batch_size = 128
# ix = torch.randint(len(data) - block_size, (batch_size,))
# x = torch.stack([data[i:i + block_size] for i in ix])
# x.shape

data.size()

torch.Size([180194])

In [None]:
# def gpt_params(seq_len, vocab_size, embedded_dim, num_heads, num_layers):
#     """ Given GPT config calculate total number of parameters """
#     ffw_size = 4*embedded_dim # in GPT the number of intermediate features is always 4*embedded_dim
#     # token and position embeddings
#     embeddings = embedded_dim * vocab_size + embedded_dim * seq_len
#     # transformer blocks
#     attention = 3*embedded_dim**2 + 3*embedded_dim # weights and biases
#     attproj = embedded_dim**2 + embedded_dim
#     ffw = embedded_dim*(ffw_size) + ffw_size
#     ffwproj = ffw_size*embedded_dim + embedded_dim
#     layernorms = 2*2*embedded_dim
#     # dense
#     ln_f = 2*embedded_dim
#     dense = embedded_dim*vocab_size # note: no bias here
#     # note: embeddings are not included in the param count!
#     total_params = num_layers*(attention + attproj + ffw + ffwproj + layernorms) + ln_f + dense
#     return total_params

# gpt2 = dict(seq_len = 1346, vocab_size = 478, embedded_dim = 16, num_heads = 4, num_layers = 4)
# gpt_params(**gpt2)

20800

In [11]:
def count_flops(seq_len, vocab_size, embedded_dim, num_heads, num_layers, ffw_size):
  key_size = embedded_dim // num_heads
  embeddings = 2 * seq_len * vocab_size * embedded_dim
  # attention

  attention = 2 * 3 * seq_len * embedded_dim * (key_size * num_heads)
  # key @ query logits
  attlogits = 2 * seq_len * seq_len * (key_size * num_heads)
  # softmax
  attsoftmax = 3 * num_heads * seq_len * seq_len # 3* is for subtract (max), exp, divide (?)
  # softmax @ value reductions
  attvalue = 2 * seq_len * seq_len * (key_size * num_heads)
  # final linear
  attlinear = 2 * seq_len * (key_size * num_heads) * embedded_dim
  att = attention + attlogits + attsoftmax + attvalue + attlinear
  # feed forward
  dense = 2 * seq_len * (embedded_dim * ffw_size + embedded_dim * ffw_size)

  # logits
  logits = 2 * seq_len * embedded_dim * vocab_size

  # this is what you'd expect:
  # forward_flops = embeddings + num_layers * (att + dense) + logits
  # but:
  # per author correspondence apparently there is typo in the paper,
  # they do not count embeddings and logits to repro table 4. So instead:
  forward_flops = num_layers * (att + dense)
  backward_flops = 2 * forward_flops # as in Kaplan et al. 2020
  total_flops = forward_flops + backward_flops

  return total_flops


class BabyGPTmodel(nn.Module):
  def __init__(self, vocab_size, block_size, num_layers, embedded_dim, num_heads):
    super(BabyGPTmodel, self).__init__()
    self.token = nn.Embedding(vocab_size, embedded_dim)
    self.positional_embeddings = nn.Embedding(block_size, embedded_dim)
    self.layers1 = nn.ModuleList([Transformer(embedded_dim, num_heads) for _ in range(num_layers)])
    self.ln_f = nn.LayerNorm(embedded_dim, eps = 1e-12) # final layer
    self.ln_head = nn.Linear(embedded_dim, vocab_size)


    # init all weights
    ## from karpathy
    self.apply(self._init_weights)
    # apply special scaled init to the residual projections, per GPT-2 paper
    for pn, p in self.named_parameters():
      if pn.endswith('projection.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layers))

        # report number of parameters
        print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

  def _init_weights(self, module):
      if isinstance(module, nn.Linear):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
          if module.bias is not None:
              torch.nn.init.zeros_(module.bias)
      elif isinstance(module, nn.Embedding):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx):
    device = idx.device
    b, t = idx.size()
    tok_emb = self.token(idx)
    position_ids = torch.arange(0, t, dtype = torch.long).unsqueeze(0)
    pos_emb = self.positional_embeddings(position_ids)
    x = tok_emb + pos_emb
    for layers1 in self.layers1:
      x = layers1(x)
    x = self.ln_f(x)
    logits = self.ln_head(x[:, -1, :])
    return logits

In [12]:
vocab_size =  len(chars)
block_size = 4
embedded_dim = 16
num_heads = 8
num_layers = 4

gpt = BabyGPTmodel(vocab_size, block_size, num_layers, embedded_dim, num_heads)



number of parameters: 743407
number of parameters: 743407
number of parameters: 743407
number of parameters: 743407


In [27]:
seq_len = data.size()
seq_len

torch.Size([180194])

In [32]:
vocab_size

22127

In [44]:

vocab_size = len(chars)
ffw_size = embedded_dim * 4
flops = count_flops(180194, 22127, 256, 4, 4, 64)
print( flops/1e15, "PFLOPS")


0.404940911721024 PFLOPS
