In [33]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'mps' if torch.has_mps else 'cpu'
print(device)
block_size = 8
batch_size = 4

mps


In [34]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [35]:
print(len(text))

232309


In [36]:
print(text[:200])

  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [37]:
from pprint import pprint


chars = sorted(set(text))
pprint(chars)
vocab_size = len(chars)

['\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\ufeff']


In [38]:
print(len(chars))

81


In [39]:

string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [40]:
encode('hello')

[61, 58, 65, 65, 68]

In [41]:
decode(encode('hello'))

'hello'

Character Level Tokenizer - 
converts each character to integer equivalents.

in this case, we have very small vocabulary and very large amount of tokens to convert (large number of predictions would be required to generate the text)

Word Level Tokenizer - 
we would map each word in a set of vocabulary to integers.

in this case the vocabulary would be very large, but less number of tokens to encode/decode

Sub-Word Tokenizer - 
in-between character and word level

In [42]:
data = torch.tensor(encode(text), dtype=torch.long)

In [43]:
data

tensor([80,  1,  1,  ..., 29, 67, 57])

In [44]:
data[:100]

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])

In [45]:
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i: i + block_size] for i in ix])
    y = torch.stack([data[i+1: i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs: ')
print(x.shape)
print(x)

print('targets: ')
print(y)

tensor([ 30097,  49156,  42546, 103302])
inputs: 
torch.Size([4, 8])
tensor([[71, 57, 58, 57,  1, 61, 58, 71],
        [65, 72, 11,  0,  0, 33, 67, 72],
        [ 1, 61, 62, 72,  1, 61, 54, 73],
        [73, 71, 68, 73, 73, 58, 57,  1]], device='mps:0')
targets: 
tensor([[57, 58, 57,  1, 61, 58, 71,  1],
        [72, 11,  0,  0, 33, 67, 72, 62],
        [61, 62, 72,  1, 61, 54, 73,  1],
        [71, 68, 73, 73, 58, 57,  1, 68]], device='mps:0')


In [46]:

print(train_data[:block_size+1])

x = train_data[:block_size]
y = train_data[1:block_size+1]


for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print('when input is', context, 'target is', target)

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32])
when input is tensor([80]) target is tensor(1)
when input is tensor([80,  1]) target is tensor(1)
when input is tensor([80,  1,  1]) target is tensor(28)
when input is tensor([80,  1,  1, 28]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


In [47]:
# Adam and AdamW are just some features that we add on to gradient descent
# Adam - is a popular optimization algorithm that combines ideas of momentum. It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter.
# AdamW - is a modification of Adam optimizer and it adds weight decay. It generalizes the parameters more. So, instead of having very high level performance or very low level, it takes a little generalize in between.
# the weight significance will actually shrink as it flattens out. Basically prevent over-fitting

In [48]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    def forward(self, index, targets = None):
        logits = self.token_embedding_table(index)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


1if*i]?sAlIk2hD&u9
-)_oiz0;aj&X['52p.I&,K2zkiNvXwPgCnYqv56HJhuA0WO_vUx6gkT6EgQmOAMr[r4FamVzMg0cwnL;J-Qi'GpPPx0Sq5gwRDw-Kp P4]DGQ.,Byt)O'p.9d_G,QJLd,aQ3mQ&XwsjyKhK2vgZG!0XZDlO"DG2JT6xwv?b2-&JD6pf2?"&ctm2IBu-0gFN?157zZ 4
I)LeZB'AZ
QHI&r[1hr(*2:K2hcd8&eXh1-
Net_oGzDGFql]pDG&e﻿v2?I!jd6﻿IE06ziMFN58LGLk8Rrw1UWrSlrB):_-K5[1MG[7(a];rSM*NqcBL﻿IMrsHlfcaCwn
114
u!s_okeclDG﻿V)-dY[RPYu'﻿57ADxZ2X;rv,3P1&mLY9Wob[LR2﻿U?SjyjUA;vU9﻿WF,nnq7a]* LQTVD7WF rd&fxvKHq[-'y ;4,2QZ5Jc_W-;1Y[O0W;Vsvp.]8!9nL,
YZk?b﻿d﻿9 i-K;K
