<a href="https://colab.research.google.com/github/pr1729p/deep_learning/blob/main/gpt_architecture_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

In [2]:
import torch.nn as nn

In [3]:
from torch.nn import functional as F

In [4]:
#hyperparameters

batch_size = 8
block_size = 64
max_iters = 4000
#eval_iters = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout_prob = 0.1

eval_interval = 100

In [5]:
torch.manual_seed(1729)

<torch._C.Generator at 0x7863c83856f0>

In [6]:
english_text = open('input.txt', 'r', encoding = 'utf-8').read()

In [7]:
print(english_text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [8]:
print(len(english_text))

1115394


In [9]:
vocab = sorted(list(set(english_text)))
vocab_size  = len(vocab)
all_vocab = ''.join(vocab)
print(all_vocab)
print('vocab_size:', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab_size: 65


In [10]:
char_to_int = {c:i for i,c in enumerate(vocab)}
int_to_char = {i:c for c,i in char_to_int.items()}
print('char_to_int:',char_to_int)
print('int_to_char:', int_to_char)

char_to_int: {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
int_to_char: {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 

In [11]:
encode = lambda s: [char_to_int[c] for c in s]
decode = lambda l: ''.join(int_to_char[i] for i in l)
print(encode("This is my code.\
It is for practice"))

[32, 46, 47, 57, 1, 47, 57, 1, 51, 63, 1, 41, 53, 42, 43, 8, 21, 58, 1, 47, 57, 1, 44, 53, 56, 1, 54, 56, 39, 41, 58, 47, 41, 43]


In [12]:
data = torch.tensor(encode(english_text), dtype = torch.long)

In [13]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [14]:
def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data)- block_size,(batch_size,))
  x = torch.stack([data[i: i+ block_size] for i in ix])
  y = torch.stack([data[i+1: i+ block_size +1] for i in ix])
  x,y = x.to(device), y.to(device)
  return x,y


In [15]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train','val']:
    losses = torch.zeros(eval_iters)
    for x in range(eval_iters):
      X,Y = get_batch(split)
      logits, loss = model(X,Y)
      losses[x] = loss.item()
    out[split] = losses.mean()

  model.train()
  return out

In [16]:
#self attention with single head
class Head(nn.Module):
  def __init__(self,head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size,bias = False)
    self.query = nn.Linear(n_embd, head_size,bias = False)
    self.value = nn.Linear(n_embd, head_size,bias = False)

    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout_prob)


  def forward(self,x):
    B,T,C = x.shape
    k = self.key(x)     #B,T,C
    q = self.query(x)
    wgt = q @ k.transpose(-2,-1) * C**-0.5 #B,T,head_size  @ B,head_size,T ---> B, T, T
    wgt = wgt.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wgt = F.softmax(wgt, dim =-1)

    wgt = self.dropout(wgt)
    v = self.value(x)
    out = wgt @ v  #B,T,T @ B,T,C--> B,T,C
    return out

In [17]:
class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self,x):
    out = torch.cat([p(x) for p in self.heads], dim = -1)
    out = self.dropout(self.proj(out))
    return out



In [18]:
class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4*n_embd),
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),
        nn.Dropout(dropout_prob),
    )

  def forward(self,x):
    return self.net(x)


In [19]:
class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)


  def forward(self,x):
    x = x+ self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x


In [20]:
class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size,n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)



  def forward(self, idx, targets = None):
    B,T = idx.shape
    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device = device))

    x= tok_emb +pos_emb
    x = self.blocks(x)
    x = self.ln_f(x)

    logits = self.lm_head(x)
    if targets is None:
      loss = None
    else:
      B,T,C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)


    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]

      logits, loss = self(idx_cond)
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim = -1)
      idx_next = torch.multinomial(probs, num_samples = 1)
      idx = torch.cat((idx, idx_next), dim = 1)

    return idx


In [21]:
model = BigramLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')


optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)


for iter in range(max_iters):

  if iter % eval_interval == 0 or iter == max_iters -1:
    losses = estimate_loss()
    print(f"step {iter} : train loss {losses['train']: .4f}, val loss {losses['val']:.4f}")

  xb,yb = get_batch('train')
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none = True)
  loss.backward()
  optimizer.step()

context = torch.zeros((1,1), dtype = torch.long, device = device)


0.211777 M parameters
step 0 : train loss  4.3013, val loss 4.2990
step 100 : train loss  2.7109, val loss 2.7189
step 200 : train loss  2.5635, val loss 2.5770
step 300 : train loss  2.5129, val loss 2.5126
step 400 : train loss  2.4700, val loss 2.4769
step 500 : train loss  2.4413, val loss 2.4353
step 600 : train loss  2.3928, val loss 2.4121
step 700 : train loss  2.3691, val loss 2.3769
step 800 : train loss  2.3329, val loss 2.3405
step 900 : train loss  2.3021, val loss 2.3247
step 1000 : train loss  2.2665, val loss 2.2802
step 1100 : train loss  2.2427, val loss 2.2621
step 1200 : train loss  2.2208, val loss 2.2379
step 1300 : train loss  2.1943, val loss 2.2093
step 1400 : train loss  2.1675, val loss 2.1900
step 1500 : train loss  2.1504, val loss 2.1702
step 1600 : train loss  2.1197, val loss 2.1471
step 1700 : train loss  2.1104, val loss 2.1411
step 1800 : train loss  2.0853, val loss 2.1202
step 1900 : train loss  2.0708, val loss 2.1103
step 2000 : train loss  2.0459

In [22]:
print(decode(m.generate(context, max_new_tokens = 2000)[0].tolist()))



As do? last I mosmbood erroges, at in houraln.

WESSCEOF VINE:
And word well you trost not.

GLord RICHARDS:
Your Bownam, unt my const all thestere lork.

LLIZUCHENR:
No!

GoLienn beath sord then; the if prustarn,
You Mat, tha not your stelsan: tay be fortilly:
It the my so:
Will
Dull I theye batarry iff, I freselage,
Tise to for your aind as the
tome of to a beald good hampreas take ther waves,
To suck of to and lofteas to lettrave.

BRICHEM:
All vile but thonguly, gome.

BREWINCE:
Ay resos! Ralewear, im.

Googe:

The che cofondilt:
Well sho fose cuttar on our dle;
Go to then delf hand hoar consat you nos.

MENRY OF IAHGin VI:
Thle love, sie card! We a seire to to ure!
The not kinay, that'y how! this ongake youm lord.

Hy Loo, sorr:
Jull then that not sicherx.

KING' RICHARK:
Go, lett s, some theret say:
Kad pore sundeed: to Risent, mad, I thoult art,
Wand your whiph go?

DUTET:
What there, its suck, not.

CLAUMEREA:
But you go, ithy beee, froce stry of yoarr to bestateur, by,
Thy, g