In [13]:
#Download as input.txt: https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# !pip install tiktoken #https://github.com/openai/tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp311-cp311-macosx_11_0_arm64.whl (761 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m761.4/761.4 kB[0m [31m851.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting regex>=2022.1.18
  Downloading regex-2023.6.3-cp311-cp311-macosx_11_0_arm64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.0/289.0 kB[0m [31m78.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: regex, tiktoken
Successfully installed regex-2023.6.3 tiktoken-0.4.0


In [1]:
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)
from itertools import chain #Deal with nested list


## Reading text data

In [2]:
# Load the entire corpus from input.txt into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# Character-level vocabulary size: the number of distinct characters in the corpus.
vocab_size_manual = len(set(text))
vocab_size_manual

65

## Tokenization (Tiktoken, char level)

#### Tiktoken

In [4]:
# Tokenize: convert a raw string into a sequence of integers according to some vocabulary.
# Trade-off: a small vocabulary produces long integer sequences, a large vocabulary short ones.
enc = tiktoken.encoding_for_model("gpt-4")
print(enc.encode('hi there'))
print(enc.decode([6151,1070]))  # decode the ids printed by the line above back into text

[6151, 1070]
hi there


In [5]:
# NOTE: this counts the distinct token ids that actually occur in `text`;
# it is not the size of the tokenizer's own vocabulary.
vocab_size_tiktoken = len(set(enc.encode(text)))
vocab_size_tiktoken

12111

In [6]:
# Largest token id that the tokenizer emits for this corpus.
max(enc.encode(text))

100252

#### Manually build vocab and encode it

In [10]:
# Sorted list of every distinct character in the corpus; its length is the vocabulary size.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [11]:
# Lookup tables over the sorted vocabulary: character -> id, and id -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(s):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(itos[c] for c in s)

In [12]:
# Round-trip check: encode a sample string, then decode the ids it produces.
print(encode('hi there'))
print(decode([46, 47, 1, 58, 46, 43, 56, 43]))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


## Building torch tensor

In [13]:
# Encode the whole corpus with the character-level encoder and hold it as a
# tensor of int64 token ids.
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape,data.dtype)

torch.Size([1115394]) torch.int64


In [14]:
data[:10]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

In [15]:
decode(data[:10].tolist())

'First Citi'

## Train Test Split

In [16]:
# Sequential split (no shuffling): the first 90% of the tokens are used for
# training, the remaining tail is held out for evaluation.
n=int(0.9*len(data))
train_data=data[:n]
test_data=data[n:]

In [17]:
block_size=8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

## Transformer Training
* While training we don't pass all the data at once; we train on blocks (chunks) of data whose length is the context length, or block size.
* The model is trained to predict the next token at every position in the block.
* In the context of 18, 47 comes next. In the context of 18 and 47, 56 comes next. A block of 8 tokens therefore gives us 8 positions to learn from.
* When we train, we pass in multiple chunks of text stacked up into a batch, so that we keep the GPUs busy (leveraging parallel processing).


In [18]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x, y = train_data[:block_size], train_data[1:block_size+1]

for i in range(block_size):
    # Everything up to and including position i is the context for target i.
    context = decode(x[:i+1].tolist())
    target = decode([y[i].item()])
    print(f''' Context: {context}    Target: {target} ''')


 Context: F    Target: i 
 Context: Fi    Target: r 
 Context: Fir    Target: s 
 Context: Firs    Target: t 
 Context: First    Target:   
 Context: First     Target: C 
 Context: First C    Target: i 
 Context: First Ci    Target: t 


In [19]:
batch_size = 4  # number of independent sequences drawn per batch
block_size = 8  # context length: tokens per sequence


def get_batch(split):
    """Draw a random mini-batch of (input, target) windows from one split.

    ``split == 'train'`` samples from ``train_data``; any other value samples
    from ``test_data``. Both returned tensors have ``batch_size`` rows of
    ``block_size`` tokens, and each target row is its input row shifted one
    position to the right.
    """
    source = train_data if split == 'train' else test_data
    # Random window starts; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def windows(offset):
        # One block_size-long slice per start, stacked into a 2-D tensor.
        return torch.stack([source[s + offset:s + offset + block_size] for s in starts])

    return windows(0), windows(1)

In [20]:
# Scratch checks: a sample of random start indices, the largest token id in the
# corpus, and the corpus length in tokens.
# NOTE: the randint call advances the global torch RNG, so running this cell
# changes which batches are drawn afterwards.
print(torch.randint(len(data)-block_size,(batch_size,)))
print(max(data.tolist()))
print(len(data))
print()

tensor([1078327,  453969,   41646,  671252])
64
1115394



In [21]:
# Draw one training batch and show, for every position of every sequence,
# the context the model sees and the token it has to predict next.
xb, yb = get_batch('train')

print('Context', xb)  # inputs; block_size is the context length
print('Target', yb)   # the same windows shifted one position to the right

for j in range(batch_size):
    for i in range(block_size):
        print('Context:', decode(xb[j][:i+1].tolist()))
        print('Target:', decode([yb[j][i].item()]))


Context tensor([[57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46]])
Trarget tensor([[43, 60, 43, 52,  1, 63, 43, 39],
        [43, 42,  8,  0, 25, 63,  1, 45],
        [42,  5, 57,  1, 57, 39, 49, 43],
        [57, 58, 63,  6,  1, 58, 46, 47]])
Context: s
Target: e
Context: se
Target: v
Context: sev
Target: e
Context: seve
Target: n
Context: seven
Target:  
Context: seven 
Target: y
Context: seven y
Target: e
Context: seven ye
Target: a
Context: v
Target: e
Context: ve
Target: d
Context: ved
Target: .
Context: ved.
Target: 

Context: ved.

Target: M
Context: ved.
M
Target: y
Context: ved.
My
Target:  
Context: ved.
My 
Target: g
Context: r
Target: d
Context: rd
Target: '
Context: rd'
Target: s
Context: rd's
Target:  
Context: rd's 
Target: s
Context: rd's s
Target: a
Context: rd's sa
Target: k
Context: rd's sak
Target: e
Context: e
Target: s
Context: es
Target: t
Context: est
Ta

## Baseline Bi Gram Model

In [21]:
# Tiktoken
# token_embedding_table = nn.Embedding(enc.n_vocab,enc.n_vocab)
# token_embedding_table

Embedding(100277, 100277)

### Why do we build models when we can use tiktoken for embeddings?
* Embeddings do not interact with one another until we train a model over them. At this point each token can only see itself.

In [22]:
xb

tensor([[57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46]])

In [23]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    Args:
        vocab_size: number of distinct token ids. The lookup table has shape
            (vocab_size, vocab_size): one row of next-token logits per token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            A ``(logits, loss)`` pair. Without targets, ``logits`` has shape
            (B, T, C) and ``loss`` is None. With targets, ``logits`` is
            flattened to (B*T, C) and ``loss`` is the cross-entropy between
            those logits and the flattened targets.
        """
        # (B, T, C): for each position, pluck out the table row of that token id
        logits = self.token_embedding_table(idx)

        # `is None` is the identity test; `targets == None` would route through
        # the tensor's comparison operator instead.
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # Flatten batch and time so each position is one classification example.
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        # Sampling needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _ = self(idx)
                # Only the last position is used: this is what makes it a bigram model.
                last_logits = logits[:, -1, :]
                probs = F.softmax(last_logits, dim=-1)  # logits -> probabilities
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx




# Build the model over the character-level vocabulary and run one forward pass
# on the current batch to get the loss of the freshly initialised model.
m = BigramLanguageModel(vocab_size_manual)
logits, loss = m(xb, yb)


In [24]:
loss

tensor(4.8549, grad_fn=<NllLossBackward0>)

In [25]:
# Sample 20 new tokens from the model, starting from a single token of id 0,
# and decode the first (only) row back to text.
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))


VMPFsPHSYAHIkPXroxdH


In [26]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [27]:
batch_size = 32  # get_batch reads this global, so the batches drawn below hold 32 sequences
for steps in range(40000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # clear gradients left over from the previous step
    loss.backward()
    optimizer.step()

# This is the loss of the final mini-batch only, not an average over the data.
print(loss.item())


2.508910655975342


In [66]:
# Sample 500 new tokens from the model, starting from a single token of id 0,
# and decode the first (only) row back to text.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))



Th dst it wife acoupat ayo S ly me o n;
CHE glewetlvek ju gh thasold;
AUn,
LEROLout in my celay ck od Whe aver ze,
Th lshoooo CHe d,
Fime INRowor m ttharoneroucaulenin lo mprinallay,
Thionghout ckererurd t s t ifine, chorn,
If,
RDUCESSifop ar t berdwice'ell t mp pru monyouroos, di hero ns s myoost fol. and bak hyoupanokicit ay
The to talyollous goviton bl IO, ng thigreper fin ope, he hul kispethine's!
Thtte mou cther ES:

Anoucofod thoo ce bl avishthe wind, y or

N so flootinceaned bl atce ck n
