In [1]:
import numpy as np
import matplotlib.pyplot as plt

import requests
import re
import string

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Tokenizer: tiktoken's `cl100k_base` encoding (the one used by GPT-4)
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

Hyperparameters

In [3]:
# --- data hyperparameters ---
seq_len = 8                    # context length: number of tokens per sequence
stride = 2                     # step between the starts of consecutive sequences
n_vocab = tokenizer.n_vocab    # vocabulary size reported by the tokenizer

# --- model hyperparameters ---
embed_dim = 64                 # embedding dimensionality (2**6)

# --- batching ---
batch_size = 5

In [4]:
# Download the source text (Project Gutenberg ebook #35) and tokenize it.
BOOK_URL = 'https://www.gutenberg.org/files/35/35-0.txt'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently tokenizing an HTTP error page.
response = requests.get(BOOK_URL, timeout=30)
response.raise_for_status()
text = response.text

# 1-D tensor holding the token ids of the whole text
tmTokens = torch.tensor(tokenizer.encode(text))
len(tmTokens)


43053

In [5]:
# create a class for a dataset
class TokenDataset(Dataset):
    """Sliding-window dataset of (input, target) token sequences.

    Each input is a window of `seq_length` consecutive tokens; its target is
    the same window shifted one token to the right. Window start positions
    advance by `stride`, so consecutive windows overlap when
    `stride < seq_length`.
    """

    def __init__(self, tokens, seq_length=8, stride=4):
        # Start positions stop before len(tokens) - seq_length so the shifted
        # target window never runs past the end of `tokens`.
        starts = range(0, len(tokens) - seq_length, stride)

        self.inputs = [tokens[s : s + seq_length] for s in starts]
        self.targets = [tokens[s + 1 : s + seq_length + 1] for s in starts]

    def __len__(self):
        """Return the number of windows."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the (input, target) pair stored at position `idx`."""
        return self.inputs[idx], self.targets[idx]

# Build the dataset from the tokenized text and inspect one sample
token_dataset = TokenDataset(tmTokens, seq_len, stride)

sample = token_dataset[12345]
print(sample)                                  # (input ids, target ids)
print(tokenizer.decode(sample[0].tolist()))    # the input ids decoded to text

(tensor([1820,  832,  358, 1047, 3970, 3485, 5015,  304]), tensor([ 832,  358, 1047, 3970, 3485, 5015,  304,  279]))
the one I had seen above ground in


The model

In [10]:
class Model(nn.Module):
    """Minimal next-token model.

    Pipeline: token embedding + position embedding -> LayerNorm -> GELU ->
    unembedding (a bias-free linear layer whose weight is tied to the token
    embedding matrix) -> logits scaled by 1/sqrt(embedding dim).

    Parameters
    ----------
    vocab_size : int, optional
        Number of tokens in the vocabulary. Defaults to `tokenizer.n_vocab`.
    d_model : int, optional
        Embedding dimensionality. Defaults to the notebook-level `embed_dim`.
    context_len : int, optional
        Number of positions the position embedding supports. Defaults to the
        notebook-level `seq_len`.
    """

    def __init__(self, vocab_size=None, d_model=None, context_len=None):
        super().__init__()

        # Fall back to the notebook-level hyperparameters so that a plain
        # `Model()` keeps working exactly as before.
        vocab_size = tokenizer.n_vocab if vocab_size is None else vocab_size
        d_model = embed_dim if d_model is None else d_model
        context_len = seq_len if context_len is None else context_len

        # Kept on the instance so forward()/generate() stay consistent with
        # the layer sizes chosen here.
        self.embed_dim = d_model
        self.seq_len = context_len

        # Embedding matrices: one for token ids, one for positions
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positions = nn.Embedding(context_len, d_model)

        # Non-linearity and layer normalisation over the embedding dimension
        self.gelu = nn.GELU()
        self.layernorm = nn.LayerNorm(d_model)

        # Unembedding layer. nn.Linear stores its weight as
        # [out_features, in_features] = [vocab_size, d_model], which is the
        # same shape as the embedding matrix, and computes y = x @ W.T.
        self.finalLinear = nn.Linear(d_model, vocab_size, bias=False)

        # Tie the unembedding to the token embedding by sharing the SAME
        # Parameter object. Wrapping it in a new nn.Parameter(...) would create
        # a second Parameter over the same storage: it would be listed twice in
        # parameters() and would receive its own gradient and optimizer state.
        self.finalLinear.weight = self.embedding.weight

    def forward(self, tokx):
        """Compute next-token logits.

        `tokx` holds token ids shaped [tokens] or [batch, tokens]; the result
        has the same leading shape with a trailing vocabulary dimension.
        """
        # Position ids are created on the same device as the input so the
        # model also works after being moved off the CPU.
        position_ids = torch.arange(tokx.shape[-1], device=tokx.device)

        token_embed = self.embedding(tokx)            # [batch, tokens, embed_dim]
        posit_embed = self.positions(position_ids)    # [tokens, embed_dim]

        # The position embeddings broadcast over the batch dimension
        x = token_embed + posit_embed                 # [batch, tokens, embed_dim]

        x = self.layernorm(x)   # normalise before the non-linearity
        x = self.gelu(x)

        # Scaled logits over the vocabulary
        return self.finalLinear(x) / self.embed_dim ** 0.5

    @torch.no_grad()  # sampling needs no gradients; avoids building a graph
    def generate(self, tokx, temperature=1, n_new_tokens=50):
        """Autoregressively sample `n_new_tokens` tokens after `tokx`.

        `tokx` is [batch, tokens]; the returned tensor is
        [batch, tokens + n_new_tokens] and starts with the original `tokx`.
        """
        for _ in range(n_new_tokens):
            # Only the most recent `seq_len` tokens are fed to the model, so
            # later steps condition on tokens the model itself generated.
            logits = self(tokx[:, -self.seq_len:])    # [batch, seq_len, n_vocab]

            # Logits at the final position predict the next token
            last_logits = logits[:, -1, :]            # [batch, n_vocab]

            # Temperature-scaled softmax gives a distribution over the vocab
            probs = F.softmax(last_logits / temperature, dim=-1)

            # Sample one token per sequence and append it
            tokx_next = torch.multinomial(probs, num_samples=1)   # [batch, 1]
            tokx = torch.cat((tokx, tokx_next), dim=1)            # [batch, tokens+1]
        return tokx
        

In [11]:
# Instantiate the model and push one (unbatched) sequence through it
m = Model()
X, y = token_dataset[4]
out = m(X)

# Shapes of the input ids, the target ids and the logits (seq_len X n_vocab)
for tensor in (X, y, out):
    print(tensor.shape)

torch.Size([8])
torch.Size([8])
torch.Size([8, 100277])


In [19]:
# Token ids of the sample sequence, followed by the text they decode to
print(X)
print(tokenizer.decode(X.tolist()))


tensor([ 9745,    38,   469, 37725,   220,  1758, 17601,   881])
BERG EBOOK 35 ***




In [13]:
# Baseline: a uniform guess over the vocabulary gives a negative
# log-likelihood of -log(1/n_vocab)
print(f'Expected loss for random weights {-np.log(1/tokenizer.n_vocab):.3f}')

# Mean negative log-softmax over every logit the untrained model produced
print(f'Observedd mean logsoftmax output: {torch.mean(-F.log_softmax(out.detach(),dim=-1)):.3f}')

# PyTorch's cross-entropy of the logits against the actual next-token targets
print(f'Cross entropy loss from pytorch  {F.cross_entropy(out.view(-1,out.shape[-1]), y.view(-1)):.3f}')


# All three values are close to one another

Expected loss for random weights 11.516
Observedd mean logsoftmax output: 11.718
Cross entropy loss from pytorch  11.666


In [23]:
# Position embeddings for positions 0..seq_len-1 and token embeddings for X
P = m.positions(torch.arange(seq_len))
T = m.embedding(X)

# Both have the same shape, so they can be added elementwise
print(f'Token embedd matrix ({T.shape})')
print(f'\nPosition embedd matrix ({P.shape})')
print(f'\n Their sum: ({(T+P).shape})')



Token embedd matrix (torch.Size([8, 64]))

Position embedd matrix (torch.Size([8, 64]))

 Their sum: (torch.Size([8, 64]))


Generate text in batches

In [24]:
# Shuffled mini-batches; drop_last discards a final batch smaller than batch_size
dataloader = DataLoader(token_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Pull a single batch to inspect
X, y = next(iter(dataloader))
print(f'Inputs ({batch_size} batches X {seq_len} tokens):')
print(X)

Inputs (5 batches X 8 tokens):
tensor([[  757,    11,   323,   279,  8613, 36036,   449,   433],
        [ 1603, 16163,  7394,    30,  5112, 15187,   358,   574],
        [  438,   814,  1550,   539,  2873,   311,   617,   904],
        [ 3304, 26840, 73170,   358,  6818,   311, 13471,   279],
        [  358,  1047, 45536,   832,  2697,   319,  1626,    13]])


In [26]:
# Run the whole batch through the model to get its logits
out = m(X) 
print(out.shape)  # [batch, numtokens, n_vocab]
print('\n',out)

torch.Size([5, 8, 100277])

 tensor([[[ 0.6916,  0.8303,  0.0741,  ..., -0.6397,  0.5897,  0.5863],
         [ 0.3359, -0.1192,  0.6246,  ..., -0.0387, -0.5605,  1.1083],
         [ 0.2078,  0.9948, -0.6127,  ..., -0.2006,  0.7970, -0.0209],
         ...,
         [-1.1215,  0.4887,  0.8982,  ...,  0.0308,  0.2839,  0.6013],
         [ 0.5007, -0.6834,  0.1465,  ..., -1.3166,  0.1483,  0.9484],
         [ 0.4180, -0.4155,  1.4094,  ...,  0.5210, -0.3372,  0.2669]],

        [[ 0.7613,  0.0244, -0.0671,  ...,  0.4244,  0.1719,  0.1136],
         [ 0.6715,  0.9610,  0.7697,  ...,  0.1970,  0.1659,  0.2068],
         [ 0.5186,  0.8277, -0.0779,  ..., -0.2668,  0.0463,  0.2892],
         ...,
         [-0.6523, -0.2643,  0.7809,  ..., -1.0318,  0.4271,  0.5719],
         [ 0.4634, -0.5030, -1.0082,  ..., -0.5154,  0.3406,  0.5008],
         [-0.2112, -0.3587,  1.0255,  ..., -0.3534,  0.1095,  0.4815]],

        [[ 0.6104,  0.5635, -0.2061,  ..., -0.8930, -0.2593, -0.5655],
         [ 0.916

In [27]:
# Sample 8 new tokens for every sequence in the batch (temperature 1.3)
gen_tokens = m.generate(X,temperature=1.3, n_new_tokens=8)
print(gen_tokens.shape) # [batch, (tokens+n_new_tokens)]

torch.Size([5, 16])


In [29]:
# Decode every generated sequence in the batch back into text
for sample_tokens in gen_tokens:
    print('\n----NEXT SAMPLE----\n')
    print(tokenizer.decode(sample_tokens.tolist()))


----NEXT SAMPLE----

 me, and the Morlocks with it Sky Essen']]
_experience proves\ActiveForm Watopleft

----NEXT SAMPLE----

 before lunch-time? Then suddenly I was Composite_closedï¿½ combin/global cheaper_member.flag

----NEXT SAMPLE----

and they did not seem to have anyMETA adhesive nerve negotiating-incWIDTHicamenteods

----NEXT SAMPLE----

.
Very calmly I tried to strike theapidddfConflict Removal dont856 Maxwelladir

----NEXT SAMPLE----

 I had overlooked one little
thing.Incytut]=$ criticalano Geek_seq_catalog


In [32]:
# Decoded input tokens of batch row 4, i.e. the text the model was prompted with
tokenizer.decode(X[4].tolist())

' I had overlooked one little\r\nthing.'

In [35]:
# The position indices that forward() looks up in the position embedding for this batch
torch.arange(X.shape[-1])

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [38]:
# The current batch of input token ids
X

tensor([[  757,    11,   323,   279,  8613, 36036,   449,   433],
        [ 1603, 16163,  7394,    30,  5112, 15187,   358,   574],
        [  438,   814,  1550,   539,  2873,   311,   617,   904],
        [ 3304, 26840, 73170,   358,  6818,   311, 13471,   279],
        [  358,  1047, 45536,   832,  2697,   319,  1626,    13]])