<a href="https://colab.research.google.com/github/pr1729p/deep_learning/blob/main/gpt_architecture_book_w_n_p.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

In [2]:
import torch.nn as nn

In [3]:
from torch.nn import functional as F

In [4]:
#loading the text
book_text = open('book-war-and-peace.txt', 'r', encoding = 'utf-8').read()

In [5]:
#few lines from the text
print(book_text[:500])

CHAPTER I

"Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by that
Antichrist--I really believe he is Antichrist--I will have nothing more
to do with you and you are no longer my friend, no longer my 'faithful
slave,' as you call yourself! But how do you do? I see I have frightened
you--sit down and tell me all the news."

It was in July, 180


In [6]:
#total number of characters in the text
print(len(book_text))

3202303


In [7]:
#number of distinct characters in the text
vocab = sorted(list(set(book_text)))
vocab_size  = len(vocab)
all_vocab = ''.join(vocab)
print(all_vocab)
print('vocab_size:', vocab_size)


 !"'()*,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzàäéê
vocab_size: 82


In [8]:
#dictionary to map char to int
char_to_int = {c:i for i,c in enumerate(vocab)}
int_to_char = {i:c for c,i in char_to_int.items()}
print('char_to_int:',char_to_int)
print('int_to_char:', int_to_char)

char_to_int: {'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '=': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, 'a': 52, 'b': 53, 'c': 54, 'd': 55, 'e': 56, 'f': 57, 'g': 58, 'h': 59, 'i': 60, 'j': 61, 'k': 62, 'l': 63, 'm': 64, 'n': 65, 'o': 66, 'p': 67, 'q': 68, 'r': 69, 's': 70, 't': 71, 'u': 72, 'v': 73, 'w': 74, 'x': 75, 'y': 76, 'z': 77, 'à': 78, 'ä': 79, 'é': 80, 'ê': 81}
int_to_char: {0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: '*', 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: ';', 24: '=', 25: '?', 26: 'A', 27: 'B', 

In [9]:
#defining an encode and decode lambda functions
encode = lambda s: [char_to_int[c] for c in s]
decode = lambda l: ''.join(int_to_char[i] for i in l)
print(encode("This is my code.\
It is for practice"))

[45, 59, 60, 70, 1, 60, 70, 1, 64, 76, 1, 54, 66, 55, 56, 10, 34, 71, 1, 60, 70, 1, 57, 66, 69, 1, 67, 69, 52, 54, 71, 60, 54, 56]


In [10]:
torch.manual_seed(1729)

<torch._C.Generator at 0x79ffb5998830>

In [11]:
#converting the entire text into encoded tensor
data = torch.tensor(encode(book_text), dtype = torch.long)

In [12]:
#dividing the text into training and validation dataset
n = int(0.85*len(data))
train_data = data[:n]
val_data = data[n:]

In [13]:
#hyperparameters -- all here

batch_size = 16     #--defines the number of batches which run parallely and independently
block_size = 64    # -- also known as max context length which will be fed into the transformer

learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # if gpu is available

max_iters = 3000
eval_iters = 200


n_embd = 64    # dimension of embeddings
n_head = 4     # number of heads in multihead attention
n_layer = 4
dropout_prob = 0

eval_interval = 100

In [14]:
#get_batch function-- randomly selects the batch_size of index and from each of these indices x contains the context and y contains their respectives targets

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data)- block_size,(batch_size,))
  x = torch.stack([data[i: i+ block_size] for i in ix])
  y = torch.stack([data[i+1: i+ block_size +1] for i in ix])
  x,y = x.to(device), y.to(device)
  return x,y


In [15]:
#estimating loss
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train','val']:
    losses = torch.zeros(eval_iters)
    for x in range(eval_iters):
      X,Y = get_batch(split)
      logits, loss = model(X,Y)
      losses[x] = loss.item()
    out[split] = losses.mean()

  model.train()
  return out

In [16]:
#self attention with single head
class Head(nn.Module):
  def __init__(self,head_size):
    super().__init__()
    # key, value, query
    self.key = nn.Linear(n_embd, head_size,bias = False)
    self.query = nn.Linear(n_embd, head_size,bias = False)
    self.value = nn.Linear(n_embd, head_size,bias = False)

    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout_prob)


  def forward(self,x):
    B,T,C = x.shape
    k = self.key(x)     #B,T,C-- head_size
    q = self.query(x)    #B,T,C-- head_size
    wgt = q @ k.transpose(-2,-1) * C**-0.5     #B,T,head_size  @ B,head_size,T ---> B, T, T
    wgt = wgt.masked_fill(self.tril[:T, :T] == 0, float('-inf'))    # masking the future tokens -- auto regressive
    wgt = F.softmax(wgt, dim =-1)

    wgt = self.dropout(wgt)
    v = self.value(x)
    out = wgt @ v  #B,T,T @ B,T,C--> B,T,C
    return out

In [17]:
#Multihead attention

class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self,x):
    out = torch.cat([p(x) for p in self.heads], dim = -1)
    out = self.dropout(self.proj(out))
    return out

In [18]:
class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4*n_embd),
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),
        nn.Dropout(dropout_prob),
    )

  def forward(self,x):
    return self.net(x)


In [19]:
#defining block of a transformer
class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)  #Layer Normalization
    self.ln2 = nn.LayerNorm(n_embd)


  def forward(self,x):
    x = x+ self.sa(self.ln1(x))   # residual layer
    x = x + self.ffwd(self.ln2(x))
    return x


In [20]:

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size,n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)



  def forward(self, idx, targets = None):
    B,T = idx.shape
    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device = device))

    x= tok_emb +pos_emb
    x = self.blocks(x)
    x = self.ln_f(x)

    logits = self.lm_head(x)
    if targets is None:
      loss = None
    else:
      B,T,C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)


    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]

      logits, loss = self(idx_cond)
      logits = logits[:,-1,:]
      probs = F.softmax(logits, dim = -1)
      idx_next = torch.multinomial(probs, num_samples = 1)
      idx = torch.cat((idx, idx_next), dim = 1)

    return idx


In [21]:
model = BigramLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')


optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)


for iter in range(max_iters):

  if iter % eval_interval == 0 or iter == max_iters -1:
    losses = estimate_loss()
    print(f"step {iter} : train loss {losses['train']: .4f}, val loss {losses['val']:.4f}")

  xb,yb = get_batch('train')
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none = True)
  loss.backward()
  optimizer.step()

context = torch.zeros((1,1), dtype = torch.long, device = device)


0.21397 M parameters
step 0 : train loss  4.5884, val loss 4.5849
step 100 : train loss  2.6250, val loss 2.6099
step 200 : train loss  2.5131, val loss 2.5121
step 300 : train loss  2.4500, val loss 2.4497
step 400 : train loss  2.4015, val loss 2.4053
step 500 : train loss  2.3504, val loss 2.3485
step 600 : train loss  2.2842, val loss 2.2882
step 700 : train loss  2.2320, val loss 2.2295
step 800 : train loss  2.1649, val loss 2.1756
step 900 : train loss  2.1167, val loss 2.1293
step 1000 : train loss  2.0640, val loss 2.0757
step 1100 : train loss  2.0281, val loss 2.0364
step 1200 : train loss  1.9923, val loss 2.0066
step 1300 : train loss  1.9581, val loss 1.9723
step 1400 : train loss  1.9428, val loss 1.9453
step 1500 : train loss  1.9057, val loss 1.9184
step 1600 : train loss  1.8923, val loss 1.9048
step 1700 : train loss  1.8708, val loss 1.8729
step 1800 : train loss  1.8412, val loss 1.8610
step 1900 : train loss  1.8302, val loss 1.8440
step 2000 : train loss  1.8114,

In [22]:
print(decode(m.generate(context, max_new_tokens = 2000)[0].tolist()))



Bmody unpaity:
"That yesit juests. She vidorly down thefe been
mastily you, to offiired that he prove to joung with the lat fined serl
teneration, why I that etyening, over why kremoy is the stubed so fix with came
will a take a not unnd, preased) the gots.


The warcives estainabby the Emperor bowing-losed."

"That opaiblity, the glesst. Prince Andrew avanyk imabout the most
colyorser' leaing trettler of but the Emperousa husboken, ageaut having
finfingshts will feen of the I penouse to she immant unearms to
lifted in his dotomember's after. Vere up bevilly with a underroward
lesterer began to peace, but Nare at aligholed nove infeersmiong
to peopliby in that the ongates.

"Don'ter his exchider.

"Then, pattened Bogral shat the eman dry suriausk the Frenchina,
Prince Verited hearfuard. Lelay, but assualtion's in smile heads
in the heasts his shorcound regurg at reard cacchs, and twall all impartor and horen
talkenlyt of came esto wark. Gening on it in whut lakeing of the
Blanzagress 