In [53]:
# I like to print funny stuff
print("Luke, I am your father")

Luke, I am your father


OKay, let's start with simple bigram model which Andrej Karpathy developed in his make more series, but with minimum help from his video. Some help from Github Copilot. Let's see :)

In [54]:
# Let's open a webpage, download the text file and read it
import urllib.request
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = urllib.request.urlopen(url)
data_in = response.read().decode('utf8')

# print first 1000 characters of the raw text
# print(data_in[:1000])

In [55]:
# Okay, let us encode the text into a list of integers
# We will use a dictionary to map each character to a unique integer
# We will also create a reverse dictionary to map each integer back to a character

# First, let us create a list of all the unique characters in the text and sort them
chars = sorted(list(set(data_in)))
#print(chars)

# the vocab size is the number of unique characters
vocab_size = len(chars)

# create a mapping to map each character to a unique integer
chtoi = {c: i for i, c in enumerate(chars)}
itoch = {i: c for i, c in enumerate(chars)}

# create an encode function to encode the text into a list of integers ( took help from Andrej's video)
def encode(text):
    return [chtoi[c] for c in text]

# create a decode function to decode the list of integers into text
def decode(text):
    return ''.join([itoch[i] for i in text])

#print(chtoi)
#print(itoch)

In [56]:
# okay, now let us encode the text into a list of integers
data_en = encode(data_in)

# print the first 1000 integers
#print(data_en[:1000])

In [57]:
# Okay, time to start training
# first, put the data into a torch tensor
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import numpy as np

torch.manual_seed(1337)

# create a tensor of the data
data = torch.tensor(data_en)

# split the data into training and validation sets, 90% training and 10% validation
train_data = data[:int(len(data)*0.9)]
val_data = data[int(len(data)*0.9):]


# # let us define some hyperparameters
# batch_size = 2**8 # number of sequences in a batch
# token_size = 3 # number of characters in a sequence
# emd_size = 10 # size of the embedding
# max_iters = 100 # number of iterations to train for
# lr = 0.001 # learning rate


In [58]:
# create a function to generate a batch of data
def get_batch(data, batch_size, token_size, generator=False):
    # the length of the data
    data_len = len(data)
    
    # create a random starting point for each batch
    if generator:
        start = torch.randint(0, data_len - token_size-1, (batch_size,), generator=generator)
    else:
        start = torch.randint(0, data_len - token_size-1, (batch_size,))
    
    # create a list of sequences
    sequences = [data[i:i+token_size] for i in start]
    
    # create a tensor of the sequences
    sequences = torch.stack(sequences)
    
    # create a tensor of the targets
    # targets = torch.stack([data[i+1:i+block_size+1] for i in start])
    targets = torch.stack([data[i+token_size+1] for i in start])
    
    # return the sequences and targets
    return sequences, targets


# lets run get_batch to see if it works
sequences, targets = get_batch(train_data, 10, 2)
sequences.shape, targets.shape


(torch.Size([10, 2]), torch.Size([10]))

In [59]:
# define batch norm, taken from Andrej's video

class BatchNorm1d:
  
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (trained with a running 'momentum update')
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    if self.training:
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0,1)
      xmean = x.mean(dim, keepdim=True) # batch mean
      xvar = x.var(dim, keepdim=True) # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the buffers
    if self.training:
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]

#create a linear layer
class Linear:
  def __init__(self, fan_in, fan_out, bias=True):
    self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5
    self.bias = torch.zeros(fan_out) if bias else None
  
  def __call__(self, x):
    self.out = x @ self.weight
    if self.bias is not None:
      self.out += self.bias
    return self.out
  
  def parameters(self):
    return [self.weight] + ([] if self.bias is None else [self.bias])
  
  def zero_grad(self):
    self.weight.grad = None
    if self.bias is not None:
      self.bias.grad = None

  # define embedding layer
class Embedding:
  def __init__(self, vocab_size, emd_size):
    self.vocab_size = vocab_size
    self.emd_size = emd_size
    self.weight = torch.randn((vocab_size, emd_size)) / vocab_size**0.5
  
  def __call__(self, x):
    self.out = self.weight[x]
    return self.out
  
  def parameters(self):
    return [self.weight]

In [60]:

# block size is same as batch size
# Lets create a simple neural net with one hidden layer
class Simple_Net(nn.Module):
    def __init__(self, vocab_size, block_size, token_size, embedding_size, hidden_size):
        super(Simple_Net, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.block_size = block_size
        self.token_size = token_size
        
        self.embedding = Embedding(self.vocab_size, self.embedding_size)
        self.linear1  =  Linear(self.embedding_size*self.token_size, self.hidden_size)
        # batch norm
        # self.batch_norm = BatchNorm1d(self.hidden_size)
        self.linear2 =  Linear(self.hidden_size, self.vocab_size)

    def forward(self, x):
        B, T = x.shape
        x = self.embedding(x)
        x = x.view(B, self.embedding_size*T)
        x = self.linear1(x)
        # x = self.batch_norm(x)
        x = F.tanh(x)
        x = self.linear2(x)
        return x
    
    def parameters(self):
        return self.embedding.parameters() + self.linear1.parameters() + self.linear2.parameters()
    
    # set the model to compute the gradients
    def train(self):
        self.training = True
        for p in self.parameters():
            p.requires_grad = True

# create a 5 attention head transformer using pytorch
class Transformer_Net(nn.Module):
    def __init__(self, vocab_size, block_size, token_size, embedding_size, hidden_size, num_heads):
        super(Transformer_Net, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.block_size = block_size
        self.token_size = token_size
        self.num_heads = num_heads
        
        self.embedding = Embedding(self.vocab_size, self.embedding_size)
        self.transformer = nn.Transformer(d_model=self.embedding_size, nhead=self.num_heads)
        self.linear2 =  Linear(self.embedding_size, self.vocab_size)

    def forward(self, x):
        B, T = x.shape
        x = self.embedding(x)
        x = x.permute(1, 0, 2)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)
        x = self.linear2(x)
        return x
    
    def parameters(self):
        return self.embedding.parameters() + self.transformer.parameters() + self.linear2.parameters()
    
    # set the model to compute the gradients
    def train(self):
        self.training = True
        for p in self.parameters():
            p.requires_grad = True

# let us run a simple optimization loop

# linear = Linear(batch_size, vocab_size)

# create a simple net
simple_net = Simple_Net(vocab_size, block_size=10, token_size=2, embedding_size=10, hidden_size=100)
# set the model to compute the gradients

simple_net.train()

# print size of simple_net

# simple_net.train = False

# set the model to not compute the gradients
# simple_net.eval()


# check if the model will calculate the gradients
print(simple_net.parameters()[0].requires_grad)

#create an optimizer
# optimizer = optim.Adam(simple_net.parameters(), lr=1e-1)

sequences, targets = (get_batch(train_data, 10,2))
print(sequences.shape)
print(targets.shape)

# run the data through the network
out = simple_net(sequences)
print(out.shape)

# calculate the loss
loss = F.cross_entropy(out, targets)
print(loss)

#manual calculation of loss
counts = out.exp()
prob  = counts / counts.sum(dim=1, keepdim=True)
loss2 = -torch.log(prob[torch.arange(10), targets]).mean()
print(loss2)


In [63]:
# now lets run a for loop to train the model

# re-define the model parameters
block_size = 600
token_size = 8
emb_size = 10
hidden_size = 200
lr = 5e-2

# define the neural net
# simple_net = Simple_Net(vocab_size, block_size=block_size, token_size=token_size, embedding_size=emb_size, hidden_size=hidden_size)

simple_net = Transformer_Net(vocab_size, block_size=block_size, token_size=token_size, embedding_size=emb_size, hidden_size=hidden_size, num_heads=5)

# set the model to compute the gradients
simple_net.train()

# create an optimizer
optimizer = optim.SGD(simple_net.parameters(), lr=lr)
# fix the random seed



TypeError: can only concatenate list (not "generator") to list

In [52]:


# run a for loop for the number of iterations
for i in range(5000):
    g = torch.manual_seed(100)
    # get a batch of data
    sequences, targets = get_batch(train_data, block_size, token_size, g)

    # run the data through the network
    x = simple_net(sequences)

    # calculate the loss
    loss = F.cross_entropy(x, targets)
    
    # zero the gradients
    for p in simple_net.parameters():
        p.grad = None

    # backword pass
    loss.backward()

    # update the weights
    optimizer.step()

    # # if iteration >5000, reduce the learning rate
    # if i > 5000:
    #     for g in optimizer.param_groups:
    #         g['lr'] = lr/2

    # print the loss every 1000 iterations with iteration number
    if i%1000 == 0:
        print(i, loss.item())


#print the loss
print(loss.item())



ValueError: not enough values to unpack (expected 3, got 2)

In [41]:

start_text = "the river "
# if the start text is greater than token_size, take the first token_size characters
if len(start_text) > token_size:
    start_text = start_text[:token_size]
    
length = 5

for i in range(length):
    # take last 8 characters of start_text in x
    x = torch.tensor([chtoi[i] for i in start_text[-token_size:]]).unsqueeze(0)
    out = simple_net(x)
    probs = F.softmax(out, dim=-1) # (B, C)
    idx = torch.multinomial(probs, 1)
    idx = idx.item()
    start_text += itoch[idx]

print(start_text)

# x = torch.tensor([chtoi[i] for i in start_text]).unsqueeze(0)
# x2 = x;
# print(x.shape)
# char_list = list(start_text)

# # ##################
# # print(x)
# # do not calculate the gradient in simple_net

# out = simple_net(x)
# # print(out)
# probs = F.softmax(out, dim=-1) # (B, C)

# #use multinomial to sample from the distribution
# idx = torch.multinomial(probs, 1)
# #tensor to int
# idx = idx.item()
# print(idx)
# itoch[idx]
# print(decode(idx))
# print(itoch[idx])
# print(out.shape)
# print(probs.shape)
# print(probs)
# idx = torch.multinomial(probs, 1)

# # add idx to x2
# x2 = torch.cat((x2, torch.tensor([idx]).unsqueeze(0)), dim=1)
# print(x2.shape)

# print(idx)
# print(itoch[idx])
# char_list.append(itoch[idx])
# # print size of the chars
# print(len(char_list))
# print(len(itoch[idx]))

# # chars.append(itoch[idx])
# # join the characters
# print(''.join(char_list))

# # update the x as last 8 characters of x2
# x = x2[:,-token_size:]
# print(x.shape)




the rive&nteg


In [None]:
# create a function to generate text
def generate_text(model, start_text, length, idx2char, char2idx):
    # create a tensor of the start text
    x = torch.tensor([char2idx[i] for i in start_text]).unsqueeze(0)
    x2 = x;
    # return x
    # create a list of the start text
    chars_list = [i for i in start_text]
    # # run a for loop for the number of characters to generate
    for i in range(length):
        # run the data through the model
        out = model(x)
        # get the index of the character with the highest probability
        idx = torch.argmax(out, dim=1).item()
        # append the character to the list
        chars_list.append(idx2char[idx])
        # append the idx to the x2
        x2 = torch.cat((x2, torch.tensor([idx]).unsqueeze(0)), dim=1)
        # create a tensor of the character from x2 taking last 8 characters
        # x = x2[:, -8:]
    # return the generated text
    return ''.join(chars)

# generate text
print(generate_text(simple_net, 'The', 1, idx2char = itoch, char2idx = chtoi))

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x30 and 80x100)