<a href="https://colab.research.google.com/github/samitha278/gpt-from-scratch/blob/main/GPT_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import gdown
import random

In [2]:
# Google Drive id of the text file used as the corpus, and the local name it is saved under.
file_id = "1ia6z4itw7WJWpnoTohURX6Lm-AnZmVZz"
output = "input.txt"
url = f"https://drive.google.com/uc?id={file_id}"

# The call is the cell's last expression, so its return value is shown as the cell output.
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1ia6z4itw7WJWpnoTohURX6Lm-AnZmVZz
To: /content/input.txt
100%|██████████| 1.12M/1.12M [00:00<00:00, 121MB/s]


'input.txt'

In [3]:
# Load the entire corpus into memory as one string.
with open('input.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Number of characters in the corpus.
len(text)

1115394

In [4]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
# Lookup tables between characters and integer ids (an id is the character's position in `chars`).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
  """Return the list of integer ids for the characters of string `s`."""
  return [stoi[ch] for ch in s]


def decode(l):
  """Return the string spelled by the iterable of integer ids `l`."""
  return ''.join(itos[i] for i in l)


# Round-trip check: encoding then decoding should reproduce the input.
sample = "Hello world"
print(encode(sample))
print(decode(encode(sample)))

[20, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
Hello world


In [6]:
# Encode the whole corpus and hold it as a 1-D tensor of token ids.
encoded_text = encode(text)
data = torch.tensor(encoded_text)

# Peek at the first 100 token ids.
data[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [7]:
# The first 90% of the tokens go to `train`; the remaining tail goes to `val`.
n = int(len(data) * 0.9)
train, val = data[:n], data[n:]

In [8]:
block_size = 8

# Inputs and targets are the same window of tokens, offset by one position.
x, y = train[:block_size], train[1:block_size + 1]

# Each prefix of x is one context; its target is the token that follows that prefix.
for i, target in enumerate(y):
  context = x[:i + 1]
  print(f'{context} target:{target}')

tensor([18]) target:47
tensor([18, 47]) target:56
tensor([18, 47, 56]) target:57
tensor([18, 47, 56, 57]) target:58
tensor([18, 47, 56, 57, 58]) target:1
tensor([18, 47, 56, 57, 58,  1]) target:15
tensor([18, 47, 56, 57, 58,  1, 15]) target:47
tensor([18, 47, 56, 57, 58,  1, 15, 47]) target:58


In [18]:
# Example: sample 4 random start offsets in the range [0, 100 - block_size).
ix = torch.randint(high=100 - block_size, size=(4,))
ix

tensor([14, 41, 41, 14])

In [21]:
# Seed PyTorch's global RNG so the random draws that follow are reproducible.
torch.manual_seed(278)

batch_size, block_size = 4, 8  # sequences per batch, tokens per sequence


def get_batch(split):
  """Sample a random batch of input/target windows.

  Args:
    split: 'train' selects the `train` tensor; any other value selects `val`.

  Returns:
    (x, y): two stacked tensors of `batch_size` windows, each `block_size`
    tokens long. Each row of `y` is the matching row of `x` shifted one
    token to the right.
  """
  source = val if split != 'train' else train

  # Upper bound leaves room for a full window plus the one-token shift used by y.
  starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()

  x = torch.stack([source[s:s + block_size] for s in starts])
  y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
  return x, y


# Draw one batch from the training split and display its inputs.
xb, yb = get_batch(split='train')

xb

tensor([[47, 57,  0, 42, 39, 59, 45, 46],
        [40, 59, 58,  1, 40, 39, 57, 58],
        [58, 53,  1, 46, 43, 56,  1, 57],
        [50, 63,  0, 58, 39, 56, 56, 63]])

## Bigram Language Model

In [29]:
torch.manual_seed(278)

class BigramLM(nn.Module):
  """Bigram language model: the next token is predicted from the current token only.

  The model is a single (vocab_size x vocab_size) embedding table; the row
  looked up for a token id is used directly as the logits over the next token.

  Subclassing nn.Module registers the table as a parameter, so
  `model.parameters()` can be handed to an optimizer.
  """

  def __init__(self, vocab_size):
    """Create the embedding table for a vocabulary of `vocab_size` tokens."""
    super().__init__()
    self.token_emb_table = nn.Embedding(vocab_size, vocab_size)


  def forward(self, idx, targets=None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      idx: integer tensor of token ids.
      targets: optional integer tensor of next-token ids with the same
        shape as `idx`.

    Returns:
      (logits, loss): `logits` has the shape of `idx` plus a trailing
      vocabulary dimension; `loss` is a scalar tensor, or None when
      `targets` is None.
    """
    logits = self.token_emb_table(idx)    #shape: (b,t,c)
    if targets is None:
      return logits, None

    # Flatten to (b*t, c) against (b*t,). The class count is read from the
    # logits themselves, so the model does not rely on a global `vocab_size`.
    num_classes = logits.size(-1)
    loss = F.cross_entropy(logits.reshape(-1, num_classes), targets.reshape(-1))
    return logits, loss


  @torch.no_grad()   # sampling needs no gradients; avoids building an autograd graph
  def generate(self, idx, max_new_tokens):
    """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

    Args:
      idx: (b, t) integer tensor holding the starting context(s).
      max_new_tokens: number of tokens to append to every sequence.

    Returns:
      (b, t + max_new_tokens) integer tensor: the input followed by the samples.
    """
    for _ in range(max_new_tokens):
      # A bigram model only needs the last token of each sequence: logits are (b, c).
      logits, _unused_loss = self(idx[:, -1])
      probs = F.softmax(logits, dim=-1)

      next_token = torch.multinomial(probs, num_samples=1)   # (b, 1)
      idx = torch.cat((idx, next_token), dim=1)

    return idx



# Build the model and run one forward pass on the sample batch as a sanity check.
bigram = BigramLM(vocab_size)
logits, loss = bigram(xb, targets=yb)
print(logits.shape, loss)

# Generate 100 tokens from an untrained model, starting from a single token with id 0.
idx = torch.zeros(1, 1, dtype=torch.long)
generated = bigram.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))







torch.Size([4, 8, 65]) tensor(4.5468, grad_fn=<NllLossBackward0>)

hDkMQcyOQpP-rU-,VfVk:rXwxj Ug$$kNRxr.x'R3ULl!WC?fErPF'K'nybrlziq:IF:J.-YVN.jj$R-kDwR
hWiDAg,rHH'!JzL
