## ShakespeareGPT

> based on [Let's build GPT: from scratch, in code, spelled out.](https://www.youtube.com/watch?v=kCc8FmEb1nY)

In [63]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from dataclasses import dataclass

In [71]:
@dataclass
class Config:
    block_size = 8 # context-length
    batch_size = 4 # mini-batch size

# Preparing Data

In [10]:
with open('./dataset/shakespeare.txt','r',encoding='utf-8') as f:
    data = f.read()
    
print(f"{len(data)=}\n{data[:100]}")

len(data)=1114985
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## Tokenizer

In [36]:
class CharacterLevelTokenizer:
    def __init__(self,data):
        self.data = data
        self.vocab = sorted(list(set(self.data)))
        self.VOCAB_SIZE = len(self.vocab)
        
        self.i_s = {i:s for i,s in enumerate(self.vocab)}
        self.s_i = {s:i for i,s in self.i_s.items()}
        
    def encode(self,s):
        return torch.tensor([self.s_i[c] for c in s],dtype=torch.long)

    def decode(self,s):
        return ''.join([self.i_s[i.item()] for i in s])

In [37]:
tokenizer = CharacterLevelTokenizer(data)
print(tokenizer.vocab)
print(tokenizer.VOCAB_SIZE)

['\n', ' ', '!', "'", ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
62


In [38]:
tokenizer.encode('et tu brute?')

tensor([40, 55,  1, 55, 56,  1, 37, 53, 56, 55, 40,  9])

In [39]:
tokenizer.decode(tokenizer.encode('et tu brute?'))

'et tu brute?'

In [57]:
class ShakespeareDataset:
    def __init__(self,block_size:int, is_test=False) -> None:
        self.tokenizer = CharacterLevelTokenizer(data)
        self.is_test = is_test
        self.full_data = self.tokenizer.encode(self.tokenizer.data)
        if self.is_test:
            self.data = self.full_data[int(0.9*len(self.full_data)):]
        else:
            self.data = self.full_data[:int(0.9*len(self.full_data))]
        self.block_size = block_size

    def __len__(self) -> int:
        return len(self.data)

    def get_block_size(self) -> int:
        return self.block_size

    def get_vocab_size(self) -> int:
        return self.tokenizer.VOCAB_SIZE

    def __getitem__(self,idx):
        item = self.data[idx:idx+self.block_size+1]
        x = item[:-1]
        y = item[1:]
        return x,y

In [73]:
train_ds = ShakespeareDataset(Config.block_size)
print(f'{train_ds.get_block_size()=}\n{train_ds.get_vocab_size()=}\n{len(train_ds)=}')

val_ds = ShakespeareDataset(Config.block_size,is_test=True)
print(f'{len(val_ds)=}')

train_ds.get_block_size()=8
train_ds.get_vocab_size()=62
len(train_ds)=1003486
len(val_ds)=111499


In [74]:
train_dl = torch.utils.data.DataLoader(train_ds,shuffle=False,batch_size=Config.batch_size)

In [75]:
next(iter(train_dl))

[tensor([[15, 44, 53, 54, 55,  1, 12, 44],
         [44, 53, 54, 55,  1, 12, 44, 55],
         [53, 54, 55,  1, 12, 44, 55, 44],
         [54, 55,  1, 12, 44, 55, 44, 61]]),
 tensor([[44, 53, 54, 55,  1, 12, 44, 55],
         [53, 54, 55,  1, 12, 44, 55, 44],
         [54, 55,  1, 12, 44, 55, 44, 61],
         [55,  1, 12, 44, 55, 44, 61, 40]])]