In [1]:
# !pip install wandb

In [1]:
import wandb
import torch
import torch.nn as nn
from torch.nn import functional as F
from torcheval.metrics.text import Perplexity
from tqdm import tqdm
from attention import GPTLanguageModel
from mamba import MambaLanguageModel
from xlstm import XLSTMLanguageModel
torch.manual_seed(1337)
import gc

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-08-05 04:10:10--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2024-08-05 04:10:10 (76.2 MB/s) - ‘input.txt.3’ saved [1115394/1115394]



In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshetty-sau[0m ([33mshetty-sau-northeastern-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
batch_size = 125 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 500000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
torch.set_default_device(device)
device

'cuda'

In [4]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [5]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [6]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        metric = Perplexity()
        metric.to(device)
        metric.reset()
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            BT, C = logits.shape
            logits = logits.view(batch_size, BT//batch_size, C)
            Y = Y.view(batch_size, BT//batch_size)
            metric.update(logits, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
        out[str(split)+"_perplexity"] = metric.compute()
        del metric
    model.train()
    return out
    
# @torch.no_grad()
# def estimate_loss():
#     out = {}
#     model.eval()
#     for split in ['train', 'val']:
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             X, Y = get_batch(split)
#             logits, loss = model(X, Y)
#             losses[k] = loss.item()
#         out[split] = losses.mean()

#     for split in ['perplexity_train', 'perplexity_val']:
#         metric = Perplexity()
#         metric.to(device)
#         metric.reset()
#         for k in range(eval_iters):
#             X, Y = get_batch(split)
#             logits, loss = model(X, Y)
#             BT, C = logits.shape
#             logits = logits.view(batch_size, BT//batch_size, C)
#             Y = Y.view(batch_size, BT//batch_size)
#             metric.update(logits, Y)
#         out[split] = metric.compute()
#         del metric
#     model.train()
#     return out

In [8]:
model = GPTLanguageModel(vocab_size)
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

10.788929 M parameters


In [9]:
# Parameters for early stopping
wandb.watch(model, optimizer, log="all", log_freq=100)

patience = 5
min_delta = 0.001
best_val_loss = float('inf')
patience_counter = 0

for iter in tqdm(range(max_iters)):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f} train_perplexity {losses['train_perplexity']:.4f}, val_perplexity  {losses['val_perplexity']:.4f}")
        wandb.log({"train_loss": losses['train'], "val_loss": losses['val'], "train_perplexity":losses['train_perplexity'], "val_perplexity":losses['val_perplexity']})

        # Early stopping check
        if losses['val'] < best_val_loss - min_delta:
            best_val_loss = losses['val']
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at iteration {iter} with best val loss {best_val_loss:.4f}")
            break

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/500000 [00:00<?, ?it/s]

step 0: train loss 4.3219, val loss 4.3331 train_perplexity 75.3341, val_perplexity  76.1789


  0%|          | 500/500000 [09:07<127:21:10,  1.09it/s] 

step 500: train loss 1.5949, val loss 1.7762 train_perplexity 4.9279, val_perplexity  5.9072


  0%|          | 1000/500000 [18:17<126:32:39,  1.10it/s]

step 1000: train loss 1.3015, val loss 1.5502 train_perplexity 3.6746, val_perplexity  4.7123


  0%|          | 1500/500000 [27:27<126:05:20,  1.10it/s] 

step 1500: train loss 1.1783, val loss 1.4867 train_perplexity 3.2488, val_perplexity  4.4225


  0%|          | 2000/500000 [36:36<126:20:17,  1.09it/s] 

step 2000: train loss 1.0862, val loss 1.4884 train_perplexity 2.9629, val_perplexity  4.4302


  0%|          | 2500/500000 [45:43<125:36:55,  1.10it/s] 

step 2500: train loss 1.0016, val loss 1.5003 train_perplexity 2.7227, val_perplexity  4.4831


  1%|          | 3000/500000 [54:50<125:34:17,  1.10it/s] 

step 3000: train loss 0.9117, val loss 1.5317 train_perplexity 2.4887, val_perplexity  4.6260


  1%|          | 3500/500000 [1:03:56<124:56:34,  1.10it/s]

step 3500: train loss 0.8318, val loss 1.5840 train_perplexity 2.2974, val_perplexity  4.8742


  1%|          | 4000/500000 [1:15:12<155:26:31,  1.13s/it] 

step 4000: train loss 0.7435, val loss 1.6289 train_perplexity 2.1032, val_perplexity  5.0982
Early stopping at iteration 4000 with best val loss 1.4867





In [10]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


Black, pioous, use! my gracious right!
That fearful worms' tears will under dryck.

ApoLIXENES:
Pray you, then, let's to see.

ESCALUS:
Where is threat joy? why, sir?

Clown:
Army is't like a sect father's enemy.

ANGELO:
You smell of spoken?

ESCALUS:
No. Well, sir, I like it.

LUCIO:
Good villana.

POMPEY:
Pray, daught you think; and, I can, if you, pray 
by unrespect, or any of the packer is life, I thou shall
goe of your corner, you'll go jest me to unto the noble
of my comfort, and lear me 


# MAMBA

In [17]:
# clear GPU memory
del model
del m
gc.collect()
torch.cuda.empty_cache() 

NameError: name 'model' is not defined

In [7]:
batch_size = 125 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 500000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

model = MambaLanguageModel(vocab_size, batch_size = batch_size, block_size = block_size,
                max_iters = max_iters, eval_interval = eval_interval, learning_rate = learning_rate,
                n_embd = n_embd, n_head = n_head,
                n_layer = n_layer, dropout = dropout)

m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

TypeError: can only concatenate str (not "int") to str

In [None]:
wandb.watch(model, optimizer, log="all", log_freq=100)

# Parameters for early stopping
patience = 5
min_delta = 0.001
best_val_loss = float('inf')
patience_counter = 0

for iter in tqdm(range(max_iters)):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f} train_perplexity {losses['train_perplexity']:.4f}, val_perplexity  {losses['val_perplexity']:.4f}")
        wandb.log({"train_loss": losses['train'], "val_loss": losses['val'], "train_perplexity":losses['train_perplexity'], "val_perplexity":losses['val_perplexity']})

        # Early stopping check
        if losses['val'] < best_val_loss - min_delta:
            best_val_loss = losses['val']
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at iteration {iter} with best val loss {best_val_loss:.4f}")
            break

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

# xLSTM

In [None]:
# clear GPU memory
del model
delet m
gc.collect()
torch.cuda.empty_cache() 

In [8]:
batch_size = 125 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 500000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
layer = ['m', 'm', 's']
dropout = 0.2
torch.set_default_device(device)

x = torch.zeros(batch_size, block_size, n_embd)

model = XLSTMLanguageModel(vocab_size, x, batch_size = batch_size, block_size = block_size,
                max_iters = max_iters, eval_interval = eval_interval, learning_rate = learning_rate,
                device = device, eval_iters = eval_interval, dropout = dropout, layers=layer)

m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

63.902017 M parameters


In [9]:
wandb.watch(model, optimizer, log="all", log_freq=100)

# Parameters for early stopping
patience = 5
min_delta = 0.001
best_val_loss = float('inf')
patience_counter = 0

for iter in tqdm(range(max_iters)):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f} train_perplexity {losses['train_perplexity']:.4f}, val_perplexity  {losses['val_perplexity']:.4f}")
        wandb.log({"train_loss": losses['train'], "val_loss": losses['val'], "train_perplexity":losses['train_perplexity'], "val_perplexity":losses['val_perplexity']})


        # Early stopping check
        if losses['val'] < best_val_loss - min_delta:
            best_val_loss = losses['val']
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at iteration {iter} with best val loss {best_val_loss:.4f}")
            break

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/500000 [00:00<?, ?it/s]

step 0: train loss 4.3027, val loss 4.3005 train_perplexity 73.8967, val_perplexity  73.7376


  0%|          | 500/500000 [22:35<368:48:33,  2.66s/it] 

step 500: train loss 2.5712, val loss 2.6183 train_perplexity 13.0820, val_perplexity  13.7124


  0%|          | 1000/500000 [45:12<369:18:06,  2.66s/it]

step 1000: train loss 2.5744, val loss 2.6856 train_perplexity 13.1240, val_perplexity  14.6672


  0%|          | 1500/500000 [1:07:50<371:11:40,  2.68s/it]

step 1500: train loss 2.5180, val loss 2.5409 train_perplexity 12.4032, val_perplexity  12.6912


  0%|          | 2000/500000 [1:30:25<368:01:57,  2.66s/it] 

step 2000: train loss 2.4826, val loss 2.5061 train_perplexity 11.9718, val_perplexity  12.2566


  0%|          | 2500/500000 [1:52:59<366:28:12,  2.65s/it] 

step 2500: train loss 2.4678, val loss 2.4945 train_perplexity 11.7962, val_perplexity  12.1162


  1%|          | 3000/500000 [2:15:35<368:01:42,  2.67s/it] 

step 3000: train loss 2.4610, val loss 2.4873 train_perplexity 11.7168, val_perplexity  12.0292


  1%|          | 3500/500000 [2:38:13<370:54:50,  2.69s/it] 

step 3500: train loss 2.4587, val loss 2.4900 train_perplexity 11.6892, val_perplexity  12.0613


  1%|          | 4000/500000 [3:00:53<367:15:13,  2.67s/it] 

step 4000: train loss 2.4577, val loss 2.4839 train_perplexity 11.6779, val_perplexity  11.9879


  1%|          | 4500/500000 [3:23:32<367:04:14,  2.67s/it] 

step 4500: train loss 2.4563, val loss 2.4838 train_perplexity 11.6615, val_perplexity  11.9863


  1%|          | 5000/500000 [3:46:11<369:01:13,  2.68s/it] 

step 5000: train loss 2.4559, val loss 2.4838 train_perplexity 11.6574, val_perplexity  11.9871


  1%|          | 5500/500000 [4:08:48<365:52:58,  2.66s/it] 

step 5500: train loss 2.4566, val loss 2.4854 train_perplexity 11.6654, val_perplexity  12.0054


  1%|          | 6000/500000 [4:31:24<365:59:29,  2.67s/it] 

step 6000: train loss 2.4553, val loss 2.4839 train_perplexity 11.6503, val_perplexity  11.9885


  1%|▏         | 6500/500000 [4:57:09<376:01:18,  2.74s/it] 

step 6500: train loss 2.4554, val loss 2.4849 train_perplexity 11.6506, val_perplexity  11.9999
Early stopping at iteration 6500 with best val loss 2.4839





In [12]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

TypeError: unhashable type: 'list'