In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/shwetank/code/wiki-base')
import torch
# from datasets import load_dataset
# import tiktoken

# Setup device
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    # Check if MPS is available
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        # Fall back to CPU
        device = torch.device("cpu")

print("Device selected:", device)

Device selected: cuda


In [14]:
## Get dataset from HF
from datasets import load_dataset
dataset = load_dataset('wikitext','wikitext-2-v1')

Downloading data: 100%|██████████| 685k/685k [00:00<00:00, 7.28MB/s]
Downloading data: 100%|██████████| 6.07M/6.07M [00:00<00:00, 34.9MB/s]
Downloading data: 100%|██████████| 618k/618k [00:00<00:00, 7.88MB/s]
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 404558.83 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 759531.74 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 600692.58 examples/s]


In [3]:
# Hyperparams
batch_size = 128
block_size = 256
emb_dim = 384
num_heads = 6
num_layers = 6
dropout = 0.2
linear_scaleup = 4
temperature = 1

# Optimization loop parameters
n_epochs = 100
eval_interval =10
eval_iters = 4

In [1]:
# Model files if they exist
import os, torch
from models import XformerScratch as Xformer

model_path = './wiki-2_model.pt'
    
# Data files
with open('./data/wiki-2/input.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

vocab = sorted(set(list(text_data)))
n_vocab = len(vocab)
print(f"vocab: {vocab}")
print(f"vocab length: {n_vocab}")

FileNotFoundError: [Errno 2] No such file or directory: './data/wiki-2/input.txt'

In [5]:
stoi = {s: i for i,s in enumerate(vocab)}
itos = {i:s for s,i in stoi.items()}
# print(stoi)
# print(itos)
def encode(text):
    tokens = torch.tensor([stoi[t] for t in text])
    return tokens

def decode(tokens):
    text = ''.join(itos[t] for t in tokens)
    return text

encoded_data = encode(text_data)
data = encoded_data.clone().detach()
n1 = int(0.9*len(text_data))
train_data = data[:n1]
val_data = data[n1:]
dataset = {'train': train_data, 'val': val_data}


# tokens = encode('Hello world!')
# print(decode(tokens))
# print(data[0:100])

In [6]:
from utils import get_batch
xtr, ytr = get_batch(dataset['train'], device, block_size, batch_size)
xtr.shape, ytr.shape

(torch.Size([128, 256]), torch.Size([128, 256]))

In [7]:
# ##TODO: Replace this class with just a function to get the next item
# from torch.utils.data import Dataset
# import torch

# class CharacterDataset(Dataset):
#     def __init__(self, dataset: Dataset, block_size: int=128, batch_size: int=64):
#         self.block_size = block_size
#         self.dataset = dataset
#         self.batch_size = batch_size
        
#     def __len__(self) -> int:
#         return len(self.dataset)

#     def __getitem__(self,idx) -> tuple[torch.tensor, torch.tensor]:
#         idx = torch.randint(len(self.dataset) - self.block_size, (self.batch_size,))
#         x = torch.stack([self.dataset[i:i+self.block_size] for i in idx])
#         y = torch.stack([self.dataset[i+1:i+self.block_size+1] for i in idx])
#         return x.to(device), y.to(device)

In [8]:
# ## Initialize train and val CharacterDataset
# train_ds = CharacterDataset(train_data,block_size=block_size,batch_size=batch_size)
# val_ds = CharacterDataset(val_data,block_size=block_size,batch_size=batch_size)

## Check data sampling by data loader
# x_toks, y_toks = next(iter(train_ds))
x_toks, y_toks = get_batch(dataset['train'], device, block_size, batch_size)
txt = [''.join(decode(t)) for t in x_toks.tolist()]
for row in txt:
    print(row)
    print('================')

blood warm'd, that sting my heart!
Three Judases, each one thrice worse than Judas!
Would they make peace? terrible hell make war
Upon their spotted souls for this offence!

SIR STEPHEN SCROOP:
Sweet love, I see, changing his property,
Turns to the sourest
g: my master is the
great rich Capulet; and if you be not of the house
of Montagues, I pray, come and crush a cup of wine.
Rest you merry!

BENVOLIO:
At this same ancient feast of Capulet's
Sups the fair Rosaline whom thou so lovest,
With all the admired b
n a traducement,
To hide your doings; and to silence that,
Which, to the spire and top of praises vouch'd,
Would seem but modest: therefore, I beseech you
In sign of what you are, not to reward
What you have done--before our army hear me.

MARCIUS:
I have 
--
Both to defend my loyalty and truth
To God, my king and my succeeding issue,
Against the Duke of Hereford that appeals me
And, by the grace of God and this mine arm,
To prove him, in defending of myself,
A traitor to my God, m

In [10]:
from models import XformerScratch as Xformer
# Model initialization from scratch or checkpoint
if os.path.exists(model_path):
    # Save the model
    model = Xformer(emb_dim, n_vocab, num_heads, num_layers, block_size, dropout).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f"Model loaded from checkpoint: {model_path}")
else:
    print("Model does not exist.")

Model loaded from checkpoint: ./shakespeare_model.pt


In [None]:
# Estimate loss at initialization and check nothing breaks when data run through the model
from utils import get_batch

# xb, yb = next(iter(train_ds))
# xb, yb = get_batch(dataset['train'], device,block_size, batch_size)
# print(xb.shape, yb.shape)
# model = Xformer(emb_dim, n_vocab, num_heads, num_layers, dropout).to(device)
xb, yb = get_batch(dataset['train'], device, block_size, batch_size)
# model = Xformer(emb_dim, n_vocab, num_heads, num_layers, block_size, dropout).to(device)

logits, loss = model(xb,yb)
print('loss at initialization: ',round(loss.item(),4))

In [None]:
from utils import get_model_size
print('Number of parameters in the model: ', get_model_size(model))

In [None]:
# # ## Optimal lr sweep
# from torch.optim import Adam
# from utils import get_lr_loss
# num_epochs = 100
# lr = 0
# optimizer = Adam(model.parameters(), lr=lr)
# lri, lossi =  get_lr_loss(model, optimizer, dataset['train'], num_epochs, device, block_size, batch_size, -5, -3)
# import matplotlib.pyplot as plt
# plt.plot(lri, lossi)
# # Add labels to the x-axis and y-axis
# plt.xlabel('LR (Learning Rate)')
# plt.ylabel('Loss')

In [None]:
tr_loss = []
vl_loss = []

In [None]:
# xb, yb = next(iter(train_dataloader))
# print(xb)
# xb, yb = next(iter(train_dataloader))
# print(xb)
# xb, yb = next(iter(val_dataloader))
# print(xb)
# xb, yb = next(iter(val_dataloader))
# print(xb)

In [None]:
# from utils import evaluate_loss_char_ds
from utils import estimate_loss
from torch.optim import Adam
lr = 1e-4
optimizer = Adam(model.parameters(), lr=lr)
for step in range(1000):
    # sample a batch of data
    xb, yb = get_batch(dataset['train'], device, block_size, batch_size)
    # xb, yb = next(iter(train_ds))
    # xb = xb.to(device)
    # yb = yb.to(device)
    # get_batch(dataset['train'], device, block_size, batch_size)
    logits, loss = model(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # tr_lossi, te_lossi = evaluate_loss_char_ds(model, train_ds, val_ds, num_batches=1)
    losses = estimate_loss(model, block_size, batch_size, train_data, val_data, device, eval_iters)
    # print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    tr_loss.append(losses['train'])
    vl_loss.append(losses['val'])
    # print(loss.item())
    # print(steps,':', 'tr_loss: ', tr_lossi, 'te_loss: ', te_lossi, 'single shot loss:', loss.item())

    if step % eval_interval == 0:
        print(step,' --> tr_loss: ', tr_loss[-1].item(), 'te_loss: ', vl_loss[-1].item(), 'single shot loss:', loss.item())

In [None]:
## Plot loss 
import matplotlib.pyplot as plt

plt.figure()
plt.plot(tr_loss, label='Training Loss')
# plt.figure()
plt.plot(vl_loss, label='Validation Loss')
plt.legend()
plt.show()
print('training loss: ', round(torch.mean(torch.tensor(tr_loss[-10:])).item(),4)), 
print('validation loss: ', round((torch.mean(torch.tensor(vl_loss[-10:]))).item(),4))

In [11]:
import torch.nn.functional as F
@torch.no_grad()
def generate(model, max_new_tokens=block_size, batch_size=batch_size, temperature=temperature):
    idx = torch.zeros((batch_size, 1), dtype=torch.long).to(device)
    for _ in range(max_new_tokens):
        # print('idx shape:',idx.shape)
        idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
        idx_cond = idx_cond.to(device)
        logits, _ = model(idx_cond)
        # Pick only the logits from most recent time step. Karpathy also does a divide by temp?
        # This is just Platt scaling which makes the various Softmax curves closes adding more randomness
        # see scratch.ipynb. https://en.wikipedia.org/wiki/Platt_scaling
        logits = logits[:,-1,:]/temperature
        probs = F.softmax(logits, dim=-1)
        # print('prob dist:',probs)
        idx_next = torch.multinomial(probs, num_samples=1)
        # print('idx_next shape:',idx_next.shape)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx


In [18]:
X_samp = generate(model, 512, 1, temperature).tolist()

In [19]:
for row in X_samp:
    print(decode(row))



RCI:
Nevo: never god nay show inger.

Nurse:
My lorder, what's or's merises:
But, I put that the theroy uncle.

Redone!

MERCHIONIUS:
Willl me that I'll bock, for etwent,
I am inforce; the curtion frest Capinity,
But the reason.

Event:
My they lods: be never for your's love saiden,
In cambing the ofainest.

BENVevosl me as so, what comest liken, I shieve they
To his crangeredly his crown, Poombersle low;
Hencee casion pobrity your greamently of
And wourthing-on staged you, grass, ay, houre me to stone,
As


In [20]:
model_path = "shakespeare_model.pt"

# Save the model
torch.save(model.state_dict(), model_path)