# Wikihow GPT

In [None]:
import os
# Move the working directory one level up; presumably so that relative paths
# such as 'data/wikihow.txt' used further down resolve — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell climbs one more
# directory each time.
os.chdir("../")
os.getcwd()

In [3]:
import torch
# import tiktoken # Not used, smallest embedding table is too large for potato laptop compute
from torch.nn import functional as F
import torch.nn as nn

In [4]:
def get_sample(filepath, size=1024 *1024):
  """Return the first `size` characters of a text file.

  Args:
    filepath: path of the text file to read.
    size: maximum number of characters to read (default 1024*1024).

  Returns:
    A string with at most `size` characters; shorter if the file is shorter.

  Raises:
    OSError: if the file cannot be opened.
    UnicodeDecodeError: if the file is not valid UTF-8.
  """
  # Pin the encoding: without it the platform locale decides how bytes are
  # decoded, so the same corpus could yield a different character set (and
  # therefore a different vocabulary) on another machine.
  with open(filepath, 'r', encoding='utf-8') as f:
    sample = f.read(size)
  return sample

In [2]:
# hyperparameters

# -- batching --
batch_size = 32
context_window = 128  # number of tokens per sequence ("block size" in karpathy's code)

# -- model --
n_embd = 32  # embedding width (d_model in vaswani 2017)

# -- optimisation --
learning_rate = 1e-3
max_iters = 6000

# -- evaluation --
# NOTE(review): `elval_intervals` looks like a typo of `eval_interval`; the name
# is kept because it is a notebook-level variable.
elval_intervals = 300
eval_iters = 200

# -- hardware: use the GPU when one is visible, otherwise stay on the CPU --
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'


NameError: name 'torch' is not defined

In [None]:
# wikihow corpus
# get_sample is called with its default size, so only the first 1024*1024
# characters of the file are loaded.
txt = get_sample('data/wikihow.txt')
print(len(txt)) # at most 1,048,576 here; 621,684,876 is presumably the size of the full corpus
print(txt[:100])

# tokenization

Trade-off: vocabulary size vs. token sequence length. A larger vocabulary gives shorter token sequences but needs a bigger embedding table.

The GPT-4o tokenizer (`o200k_base` in tiktoken) has a vocabulary of about 200k tokens; an embedding table of that size is too large for my potato laptop.

character-wise embeddings are manageable enough

In [None]:
# character-wise tokenization: the vocabulary is every distinct character of the sample
char_set = sorted(set(txt))
vocab_size = len(char_set)
print(vocab_size) # 95 characters in the original run
print(char_set[-20:])

# encoder and decoder mappings and functions
# Both mappings are built from char_set so the vocabulary is computed only once
# and the ids are guaranteed to line up with vocab_size.
encode_mapping = {ch: i for i, ch in enumerate(char_set)}
decode_mapping = dict(enumerate(char_set))


def encode(s):
  """Map a string to the list of integer token ids of its characters.

  Raises KeyError for a character that is not in the vocabulary.
  """
  return [encode_mapping[ch] for ch in s]


def decode(tok):
  """Map an iterable of token ids back to the string they spell.

  Raises KeyError for an id that is not in the vocabulary.
  """
  return ''.join(decode_mapping[t] for t in tok)

In [None]:
# encode the whole sample into a list of integer token ids, one per character
tokens = encode(txt)
print(len(tokens))
print(tokens[:100])

In [None]:
# pack the token ids into a 1-D int64 tensor so batches can be sliced from it
data = torch.tensor(data=tokens,dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

In [None]:
# index that separates the first 90% of the tokens from the remaining 10%
train_test_split_index = int(0.9*(len(data)))
print(train_test_split_index)
print(type(train_test_split_index))

In [10]:
# contiguous split without shuffling: the held-out part is the tail of the sample
train = data[:train_test_split_index]
test = data[train_test_split_index:]

In [None]:
# NOTE: these rebind the globals that get_batch reads; batch_size drops from
# the 32 set in the hyperparameter cell to 8.
batch_size = 8
context_window = 128 # this is what karpathy calls block size in his code
torch.manual_seed(42) # fix the seed of torch's global random number generator

In [12]:
def get_batch(split):
  """Sample a random batch of (input, target) windows from one data split.

  "train" selects the training tensor; any other value selects the held-out
  tensor. Each target row is its input row shifted one token to the right.
  Both returned tensors have shape (batch_size, context_window).
  """
  source = train if split == "train" else test
  # one random start offset per sequence, leaving room for the shifted target
  starts = torch.randint(high=(len(source) - context_window), size=(batch_size,))
  inputs, targets = [], []
  for start in starts:
    stop = start + context_window
    inputs.append(source[start:stop])
    targets.append(source[start + 1:stop + 1])
  return torch.stack(inputs), torch.stack(targets)

In [13]:
# draw one training batch (inputs xb, next-token targets yb)
xb, yb = get_batch("train")

In [None]:
# both are (batch_size, context_window); yb is xb shifted one token ahead
print(xb.shape)
print(yb.shape)

## baseline model

Bigram

In [1]:
# embedding width read by BigramModel (same value as in the hyperparameter cell)
n_embd = 32

In [15]:
class BigramModel(nn.Module):
  """Baseline language model: token + positional embeddings fed to a linear head.

  There is no mixing between positions, so the prediction at each time step
  depends only on the token at that step and on its position.

  Args:
    vocab_size: number of distinct tokens.
    n_embd: width of the embedding vectors.
    context_window: longest sequence the positional table can index.

  Any argument left as None falls back to the notebook-level variable of the
  same name, so `BigramModel()` keeps working as before.
  """

  def __init__(self, vocab_size=None, n_embd=None, context_window=None):
    super().__init__()
    # Looked up lazily so the class can be defined (and tested) before the
    # notebook-level hyperparameters exist.
    notebook = globals()
    if vocab_size is None:
      vocab_size = notebook["vocab_size"]
    if n_embd is None:
      n_embd = notebook["n_embd"]
    if context_window is None:
      context_window = notebook["context_window"]

    # token ids index rows of this table, aka input embeddings (vaswani 2017)
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    # one learned vector per position inside the context window
    self.positional_embedding_table = nn.Embedding(context_window, n_embd)
    # conversion from embeddings to next-token logits
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B, T) tensor of token ids, with T no larger than the context window.
      targets: optional (B, T) tensor of next-token ids.

    Returns:
      (logits, loss). Without targets, logits is (B, T, vocab_size) and loss
      is None. With targets, logits is flattened to (B*T, vocab_size) and loss
      is the scalar cross-entropy.
    """
    B, T = idx.shape

    token_embd = self.token_embedding_table(idx) # (B, T, C)
    # Build the positions on the same device as the input: using a global
    # device here breaks as soon as it differs from where the model lives.
    positions = torch.arange(T, device=idx.device)
    pos_embd = self.positional_embedding_table(positions) # (T, C)
    x = token_embd + pos_embd # (B, T, C), pos_embd broadcasts over the batch

    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      return logits, None

    # cross_entropy wants (N, C) logits against (N,) class ids
    B, T, C = logits.shape
    logits = logits.view(B*T, C)
    loss = F.cross_entropy(logits, targets.view(B*T))
    return logits, loss

  @torch.no_grad() # sampling needs no autograd graph
  def generate(self, idx, max_new_tokens):
    """Extend each sequence in idx by max_new_tokens sampled tokens.

    Args:
      idx: (B, T) tensor of token ids used as the prompt.
      max_new_tokens: number of tokens to append.

    Returns:
      (B, T + max_new_tokens) tensor: the prompt followed by the samples.
    """
    block_size = self.positional_embedding_table.num_embeddings
    for _ in range(max_new_tokens):
      # Feed at most the last block_size tokens; a longer sequence would
      # index past the end of the positional embedding table.
      idx_cond = idx[:, -block_size:]
      logits, _loss = self(idx_cond)

      # the last time step holds the prediction for the next token
      logits = logits[:, -1, :] # (B, C)

      # convert to probabilities and sample one token per sequence
      probs = F.softmax(logits, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

      # append the sample to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
    return idx


In [None]:
# The model reads its sizes from the notebook-level hyperparameters, so no
# constructor arguments are needed.
bigram = BigramModel()
# Call the module itself rather than .forward() so nn.Module hooks run.
logits, loss = bigram(xb, yb)
print(logits.shape) # flattened to (B*T, vocab_size) because targets were passed
print(loss)

In [None]:
# inspect the token embedding matrix: one n_embd-wide row per vocabulary entry
bigram.token_embedding_table.weight

In [None]:
# seed generation with the single token id 0, i.e. the first character of the sorted vocabulary
idx = torch.zeros((1, 1), dtype=torch.long)
gen = bigram.generate(idx, max_new_tokens=100)
print(gen.shape) # (1, 101): the 1 seed token followed by the 100 sampled tokens
print(gen)

In [None]:
# turn the ids of the single generated sequence back into text
prediction = decode(gen[0].tolist())
print(prediction) # the model predicts garbage (in file order it has not been trained yet)

In [None]:
# list every parameter tensor of the model with its type and shape
for param in bigram.parameters():
    print(type(param))
    print(param.size())

In [34]:
# optimizer
# NOTE(review): lr is hard-coded to 1e-4 while the hyperparameter cell defines
# learning_rate = 1e-3 — confirm which value is intended.
optimizer = torch.optim.AdamW(bigram.parameters(), lr=1e-4)

In [None]:
# train on many freshly sampled batches
batch_size = 32 # use bigger batch size (get_batch reads this global)
steps = 10000

for _ in range(steps):
    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # sample a batch of x and y and compute the loss on it
    xb, yb = get_batch("train")
    logits, loss = bigram(xb, yb)

    # backpropagate, then update the parameters
    loss.backward()
    optimizer.step()

# loss of the final batch only, not an average over the run
print(loss.item())

In [None]:
# sample again after the training loop, seeding with the single token id 0
idx = torch.zeros((1, 1), dtype=torch.long)
gen = bigram.generate(idx, max_new_tokens=100)
prediction = decode(gen[0].tolist())
print(prediction) # NOTE(review): the original "the model predicts garbage" remark was copied from the untrained run — re-check against the trained output