<a href="https://colab.research.google.com/github/kotharisanjana/CMPE297_SpecialTopics_Fall2023/blob/main/Assignment_2/NanoGPT_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
torch.manual_seed(101)

<torch._C.Generator at 0x7db15d9cfb70>

In [4]:
batch_size = 16
block_size = 32
max_iters = 5000
eval_interval = 1000
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 500
n_embd = 64
n_head = 8
n_layer = 8

In [5]:
input_file_path = "/content/drive/MyDrive/CMPE297/burining_whales.txt"

In [6]:
class Dataset:
  def __init__(self):
    self.vocab_size = 0
    self.train_data = torch.tensor([])
    self.val_data = torch.tensor([])

  def read_dataset(self):
    with open(input_file_path, 'r', encoding='utf-8') as f:
        self.data = f.read()

  def prepare_dataset(self):
    chars = sorted(list(set(self.data)))
    self.vocab_size = len(chars)
    char_to_int = { ch:i for i,ch in enumerate(chars) }
    int_to_char = { i:ch for i,ch in enumerate(chars) }
    self.encode = lambda s: [char_to_int[c] for c in s]
    self.decode = lambda l: ''.join([int_to_char[i] for i in l])

  def data_split(self):
    data_tensor = torch.tensor(self.encode(self.data), dtype=torch.long)
    n = int(0.8*len(data_tensor))
    self.train_data = data_tensor[:n]
    self.val_data = data_tensor[n:]

  def get_batch(self, split):
    data = self.train_data if split == 'train' else self.val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [7]:
datasetObj = Dataset()
datasetObj.read_dataset()
datasetObj.prepare_dataset()
datasetObj.data_split()

In [8]:
class Loss:
  @torch.no_grad()
  def estimate_loss(self):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = datasetObj.get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [9]:
lossObj = Loss()

In [10]:
class AttentionHead(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)
    w = q @ k.transpose(-2,-1) * C**-0.5
    w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    w = F.softmax(w, dim=-1)

    v = self.value(x)
    out = w @ v

    return out

In [11]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

In [12]:
class FeedFoward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd))

  def forward(self, x):
    return self.net(x)

In [13]:
class TransformerBlock(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedFoward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

In [14]:
class NanoGPT(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(datasetObj.vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, datasetObj.vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    tok_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = tok_emb + pos_emb
    x = self.blocks(x)
    x = self.ln_f(x)
    logits = self.lm_head(x)

    if targets is None:
        loss = None
    else:
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [15]:
def generateNext():
  context = torch.zeros((1, 1), dtype=torch.long, device=device)
  print(datasetObj.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

In [16]:
if __name__ == '__main__':
  model = NanoGPT()
  m = model.to(device)
  print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

  for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = lossObj.estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = datasetObj.get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.405027 M parameters
step 0: train loss 3.7954, val loss 3.7279
step 1000: train loss 0.1124, val loss 5.6521
step 2000: train loss 0.1071, val loss 5.9051
step 3000: train loss 0.0980, val loss 6.1296
step 4000: train loss 0.0999, val loss 6.3486
step 4999: train loss 0.0969, val loss 6.4930


In [17]:
generateNext()


If we were but fire we would not mind mither.

But we are also water, even one drop of which forms a tear.

Those who were there that day, who lit the match and beheld the forld ming wh the ithe the corpses.

If we were but fire we would not mind either.

But we are also water, even one drop of which forms a tear.

Those who were there that day, who lit the match and beheld the f ich ared bed whith baby sleeping.

So it was with the whales beached in Oregon and the not knowing what else to do with the corpses.

If we were but fire we would not mind either.

But we are also water, even one drop of which forms a tear.

Those who were there that day, who lit the match and beheld the forlich f a's tes.

If we were but fire we would not mind either.

But we are also water, even one drop of which forms a tear.

Those who were there that day, who lit the match and beheld the f ich ared sor.

Those who were there that day, who lit the match and beheld the fores mich g.

Th we whale there in O