<a href="https://colab.research.google.com/github/soumyadip1995/language-models/blob/main/Notebook/BabyGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import torch
from torch import nn
import math
import numpy as np 
import torch.nn.functional  as F
from math import sqrt
import torch.nn as nn 



words = open(r"/content/text.txt", 'r' , encoding='utf-8').read().split()
# words[:20]


chars = sorted(list(set(words)))
string2integer = {ch: i for i, ch in enumerate(chars)}
# print(string2integer)

integer2string = {i:ch for ch,i in string2integer.items()}
encode = lambda s: [string2integer[c] for c in s]
# print(encode)

decode = lambda l: ''.join([integer2string[i] for i in l])
# print(decode)

data = torch.tensor(encode(words), dtype = torch.long)
# print(data)
# data.size()

## block_size and batch size has been changed from 64 and 512 to 32 and 128
block_size = 16
batch_size = 32
ix = torch.randint(len(data) - block_size, (batch_size,))

## hidden dimensionality has been changed from 512 to 128.

vocab_size = len(chars)
d_k = 32
token_emb = nn.Embedding(vocab_size, d_k)


x = torch.stack([data[i:i + block_size] for i in ix])
input_embeds = token_emb(x)
# input_embeds.size()


def scaled_dot_product(query, key, value):
  dim_k = query.size(-1)
  scores = torch.bmm(query, key.transpose(-2, -1)) / sqrt(dim_k)
  weights = F.softmax(scores, dim = -1)
  return torch.bmm(weights, value)

key = input_embeds
query = input_embeds
value = input_embeds

# sdp = scaled_dot_product(query, key, value)
# print(sdp.size())

### Multi headed attention

"""Having many heads allows the model to focus on different parts of the sentences. 
The softmax on one head tends to focus on one aspect of similarity. For example subject verb interaction."""
## A single attention head

class AttentionHead(nn.Module):
  def __init__(self, embedded_dim, head_dim):
    super().__init__()
    self.q = nn.Linear(embedded_dim, head_dim)
    self.k = nn.Linear(embedded_dim,  head_dim)
    self.v = nn.Linear(embedded_dim,  head_dim)

  def forward(self, x):
    attention_outputs = scaled_dot_product(self.q(x), self.k(x), self.v(x))
    

    return attention_outputs

# embedding_dim = embedding dimensions
# num_heads  = number of heads 


class MultiHeadAttention(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super().__init__()
    self.embedded_dim = embedded_dim
    self.num_heads = num_heads
    head_dim = embedded_dim // num_heads 

    self.heads = nn.ModuleList([AttentionHead(embedded_dim, head_dim) for _ in range(num_heads)])
    self.output_linear = nn.Linear(embedded_dim, embedded_dim)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim = -1)
    
    out = self.output_linear(out)

    return out

# multihead_attention = MultiHeadAttention(128, 8)
# # multihead_attention

# attention_outputs =  multihead_attention(input_embeds)
# # print(attention_outputs.size())


# from karpathy , partially
dropout = 0.2

class FeedForward(nn.Module):
  def __init__(self, embedded_dim):
    super(FeedForward, self).__init__()
    self.net = nn.Sequential(nn.Linear(embedded_dim, 4 * embedded_dim),
    nn.Linear(4 * embedded_dim, embedded_dim),
    nn.GELU(),
    nn.Dropout(dropout))

  def forward(self, x):
    return self.net(x)


### A simple Transformer Block    
class TransformerBlock(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super(TransformerBlock, self).__init__()
    self.attention = MultiHeadAttention(embedded_dim,  num_heads)
    self.feed_forward = FeedForward(embedded_dim)
    self.layer_norm_1 = nn.LayerNorm(embedded_dim)
    self.layer_norm_2 = nn.LayerNorm(embedded_dim)

  def forward(self, x):
    
    x = x + self.attention(self.layer_norm_1(x))
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x




In [10]:

class BabyGPTmodel(nn.Module):
  def __init__(self, vocab_size, block_size, num_layers, embedded_dim, num_heads, d_k):
    super(BabyGPTmodel, self).__init__()
    self.token = nn.Embedding(vocab_size, d_k)
    self.positional_embeddings = nn.Embedding(block_size, d_k)
    self.layers1 = nn.ModuleList([TransformerBlock(embedded_dim, num_heads) for _ in range(num_heads)])
    self.ln_f = nn.LayerNorm(embedded_dim, eps = 1e-12) # final layer 
    self.ln_head = nn.Linear(embedded_dim, vocab_size)


    # init all weights
    ## from karpathy
    self.apply(self._init_weights)
    # apply special scaled init to the residual projections, per GPT-2 paper
    for pn, p in self.named_parameters():
      if pn.endswith('q.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layers))

        # report number of parameters
        print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

  def _init_weights(self, module):
      if isinstance(module, nn.Linear):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
          if module.bias is not None:
              torch.nn.init.zeros_(module.bias)
      elif isinstance(module, nn.Embedding):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx):
    device = idx.device
    b, t = idx.size()
    tok_emb = self.token(x)
    position_ids = torch.arange(x.size(-1), dtype = torch.long).unsqueeze(0)
    pos_emb = self.positional_embeddings(position_ids)
    x = tok_emb + pos_emb
    for layers1 in self.layers1:
      x = layers1(x)
      x = self.ln_f(x)
    logits = self.ln_head(x)
    return logits

In [None]:
## number of parameters: 117,187
num_layers = 3
gpt = BabyGPTmodel(num_layers, vocab_size, block_size, 32, 8, 32)
# d = gpt(x)
# d.size()

In [1]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F 
from math import sqrt



torch.manual_seed(1337)
class Attention(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super(Attention, self).__init__()
    self.atten = nn.Linear(embedded_dim, 3 * embedded_dim)
    self.projection = nn.Linear(embedded_dim, embedded_dim)
    self.num_heads = num_heads
    self.embedded_dim = embedded_dim
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.size()
    q, k ,v  = self.atten(x).split(self.embedded_dim, dim=2)
    q = q.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
    k = k.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
    v = v.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)


    # manual implementation of attention
    # from karpathy
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    att = F.softmax(att, dim=-1)
    y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    # output projection
    y = self.projection(y)
    return y

In [2]:
# x = torch.randn(3, 3, 16)

# att = Attention(16, 8)
# tt = att(x)
# tt.size()

In [2]:
dropout = 0.2
class FeedForward(nn.Module):
  def __init__(self, embedded_dim):
    super(FeedForward, self).__init__()
    self.net = nn.Sequential(nn.Linear(embedded_dim, 4 * embedded_dim),
    nn.Linear(4 * embedded_dim, embedded_dim),
    nn.GELU(),
    nn.Dropout(dropout))

  def forward(self, x):
    return self.net(x)


In [4]:
# feed_forward = FeedForward(embedded_dim = 16)
# # feed_forward
# ff_outputs = feed_forward(tt)
# ff_outputs.size()

torch.Size([3, 3, 16])

In [3]:
### A simple Transformer Block    
class Transformer(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super(Transformer, self).__init__()
    self.attention = Attention(embedded_dim,  num_heads)
    self.feed_forward = FeedForward(embedded_dim)
    self.layer_norm_1 = nn.LayerNorm(embedded_dim)
    self.layer_norm_2 = nn.LayerNorm(embedded_dim)

  def forward(self, x):
    
    x = x + self.attention(self.layer_norm_1(x))
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x

In [6]:
# btt = Transformer(16, 8)
# to = btt(x)
# print(to.size())


torch.Size([3, 3, 16])


In [4]:

class BabyGPTmodel(nn.Module):
  def __init__(self, vocab_size, block_size, num_layers, embedded_dim, num_heads):
    super(BabyGPTmodel, self).__init__()
    self.token = nn.Embedding(vocab_size, embedded_dim)
    self.positional_embeddings = nn.Embedding(block_size, embedded_dim)
    self.layers1 = nn.ModuleList([Transformer(embedded_dim, num_heads) for _ in range(num_heads)])
    self.ln_f = nn.LayerNorm(embedded_dim, eps = 1e-12) # final layer 
    self.ln_head = nn.Linear(embedded_dim, vocab_size)


    # init all weights
    ## from karpathy
    self.apply(self._init_weights)
    # apply special scaled init to the residual projections, per GPT-2 paper
    for pn, p in self.named_parameters():
      if pn.endswith('projection.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layers))

        # report number of parameters
        print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

  def _init_weights(self, module):
      if isinstance(module, nn.Linear):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
          if module.bias is not None:
              torch.nn.init.zeros_(module.bias)
      elif isinstance(module, nn.Embedding):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx):
    device = idx.device
    b, t = idx.size()
    tok_emb = self.token(idx)
    position_ids = torch.arange(0, t, dtype = torch.long).unsqueeze(0)
    pos_emb = self.positional_embeddings(position_ids)
    x = tok_emb + pos_emb
    for layers1 in self.layers1:
      x = layers1(x)
    x = self.ln_f(x)
    logits = self.ln_head(x[:, -1, :])
    return logits

In [27]:
# btt = Transformer(16, 8)
# to = btt(x)
# print(to.size())



torch.Size([3, 3, 16])


In [6]:
# vocab_size =  4
# block_size = 4
# embedded_dim = 16
# num_heads = 4
# num_layers = 4
# gpt = BabyGPTmodel(vocab_size, block_size, num_layers, embedded_dim, num_heads)
# # number of parameters: 13,315

number of parameters: 13315
number of parameters: 13315
number of parameters: 13315
number of parameters: 13315


In [None]:
# seq = list(map(int, "1111011110111101101"))
# seq

In [None]:
## from karpathy
# # convert the sequence to a tensor holding all the individual examples in that sequence
# X, Y = [], []
# # iterate over the sequence and grab every consecutive 3 bits
# # the correct label for what's next is the next bit at each position
# for i in range(len(seq) - block_size):
#     X.append(seq[i:i+block_size])
#     Y.append(seq[i+block_size])
#     print(f"example {i+1:2d}: {X[-1]} --> {Y[-1]}")
# X = torch.tensor(X,  dtype=torch.long)
# Y = torch.tensor(Y,  dtype=torch.long)
# # print(X.shape, Y.shape)
# # print(X.size(), Y.size())

# print(X, Y)

In [9]:
# optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3, weight_decay=1e-1)

In [None]:
# for i in range(100):
#     logits = gpt(X)
#     loss = F.cross_entropy(logits, Y)
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()
#     print(i, loss.item())

In [5]:
words = open(r"/content/text.txt", 'r' , encoding='utf-8').read().split()

chars = sorted(list(set(words)))
string2integer = {ch: i for i, ch in enumerate(chars)}


integer2string = {i:ch for ch,i in string2integer.items()}
encode = lambda s: [string2integer[c] for c in s]

decode = lambda l: ''.join([integer2string[i] for i in l])
data = torch.tensor(encode(words), dtype = torch.long)


In [6]:
batch_size = 16
block_size = 4
embedded_dim = 16
num_heads = 4
num_layers = 4

# generate a small batch of data of inputs x and targets y

ix = torch.randint(len(data) - block_size, (batch_size,))
x = torch.stack([data[i:i+block_size] for i in ix])
y = torch.stack([data[i+block_size] for i in ix])
print((x, y))



(tensor([[ 44, 373,  29, 194],
        [253, 228, 321, 387],
        [ 98, 193, 475, 118],
        [451, 285, 238,  20],
        [ 52, 185, 470, 358],
        [318, 308, 105,  97],
        [281, 101,  37, 418],
        [ 29, 467, 244, 312],
        [470,   4, 470, 403],
        [ 75, 256, 407, 389],
        [445, 226,  29, 256],
        [339, 136, 436, 295],
        [295, 131, 295, 323],
        [308, 105,  97,  84],
        [106, 382, 288,  91],
        [206, 249,   4,  29]]), tensor([234, 110,  29, 470, 241,  84, 133,  76, 244, 122, 470, 459,  91,  65,
        393, 399]))


In [7]:
vocab_size = len(chars)
block_size = 4
embedded_dim = 16
num_heads = 4
num_layers = 4

gpt = BabyGPTmodel(vocab_size, block_size, num_layers, embedded_dim, num_heads)
## number of parameters: 28,990

number of parameters: 28990
number of parameters: 28990
number of parameters: 28990
number of parameters: 28990


In [18]:
# n = int(0.9*len(data)) # first 90% will be train, rest val
# train_data = data[:n]
# val_data = data[n:]

In [27]:
# d = torch.tensor(train_data.shape, dtype = torch.long)
# p = torch.tensor(val_data.shape, dtype = torch.long)
# p

tensor([135])

In [None]:
# # batch_size = 32
# input = torch.randn(d.shape, embedded_dim)
# target = torch.tensor(p.shape)
# print(input.shape, target.shape)

In [8]:
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3, weight_decay=1e-1)

In [9]:
for i in range(1000):
    logits = gpt(x)
    loss = F.cross_entropy(logits, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(i, loss.item())

0 6.1647467613220215
1 6.0855512619018555
2 6.0388503074646
3 6.0027546882629395
4 5.960081577301025
5 5.9288225173950195
6 5.896416187286377
7 5.863935470581055
8 5.842287540435791
9 5.814393520355225
10 5.78857421875
11 5.760965347290039
12 5.739848613739014
13 5.711612701416016
14 5.6829304695129395
15 5.663541793823242
16 5.634166717529297
17 5.610829830169678
18 5.582950592041016
19 5.55332088470459
20 5.525705337524414
21 5.5001301765441895
22 5.474945545196533
23 5.444350719451904
24 5.414790153503418
25 5.38487434387207
26 5.357189655303955
27 5.328928470611572
28 5.299066066741943
29 5.267841815948486
30 5.234304428100586
31 5.203612327575684
32 5.1714630126953125
33 5.140346527099609
34 5.107305526733398
35 5.0787858963012695
36 5.044053554534912
37 5.011567115783691
38 4.977425575256348
39 4.942967891693115
40 4.910092830657959
41 4.8771162033081055
42 4.8441596031188965
43 4.807589054107666
44 4.772795677185059
45 4.738954544067383
46 4.704535961151123
47 4.668580055236816
