<a href="https://colab.research.google.com/github/smarginatura/makemore/blob/dev/rnn_bs_lr_sweep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/smarginatura/makemore.git

Cloning into 'makemore'...
remote: Enumerating objects: 116, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 116 (delta 38), reused 57 (delta 30), pack-reused 42[K
Receiving objects: 100% (116/116), 315.62 KiB | 3.90 MiB/s, done.
Resolving deltas: 100% (54/54), done.


In [2]:
!pip install wandb -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.2/289.2 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import wandb
from google.colab import userdata
wandb.login(key=userdata.get('WANDB_KEY2'))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
%cd makemore

/content/makemore


In [6]:
import os, sys, time, math, random

import numpy as np
import torch
from torch.nn import functional as F

from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from tqdm.auto import tqdm

from model import ModelConfig, RNN, loss_fn
from data_loader import create_datasets, InfiniteDataLoader

# Utils
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Sweep

In [7]:
def train(config=None):

    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        set_seed(config.seed)
        os.makedirs(config.work_dir, exist_ok=True)

        @torch.inference_mode()
        def evaluate(model, dataset, batch_size=50, max_batches=None):
            model.eval()
            loader = DataLoader(dataset, shuffle=True, batch_size=batch_size, num_workers=0)
            losses = []
            for i, batch in enumerate(loader):
                batch = [t.to(config.device) for t in batch]
                X, Y = batch
                logits = model(X)
                loss = loss_fn(logits, Y)
                losses.append(loss.item())
                if max_batches is not None and i >= max_batches:
                    break
            mean_loss = torch.tensor(losses).mean().item()
            model.train() # reset model back to training mode
            return mean_loss

        # init datasets
        train_dataset, test_dataset = create_datasets(config.input_file)
        vocab_size = train_dataset.get_vocab_size()
        block_size = train_dataset.get_output_length()
        print(f"\ndataset determined that: {vocab_size=}, {block_size=}")

        # init model
        model_config = ModelConfig(
            vocab_size=vocab_size,
            block_size=block_size,
            n_layer=config.n_layer,
            n_embd=config.n_embd,
            n_embd2=config.n_embd2)
        model = RNN(model_config, cell_type=config.cell_type)
        model.to(config.device)
        print(f"model #params: {sum(p.numel() for p in model.parameters())}")

        # init optimizer
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=config.learning_rate,
            weight_decay=config.weight_decay,
            betas=(0.9, 0.99),eps=1e-8,
            )

        # init dataloader
        batch_loader = InfiniteDataLoader(
            train_dataset,
            batch_size=config.batch_size,
            pin_memory=True,
            num_workers=config.num_workers)

        best_loss = None

        for step in tqdm(range(config.max_steps)):

            t0 = time.time()

            batch = batch_loader.next()
            batch = [t.to(config.device) for t in batch]
            X, Y = batch
            logits = model(X)
            loss = loss_fn(logits, Y)
            model.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
            # wait for all CUDA work on the GPU to finish then calculate iteration time taken
            if config.device.startswith('cuda'):
                torch.cuda.synchronize()

            t1 = time.time()

            # Evaluate the model
            if step > 0 and step % 500 == 0:
                train_loss = evaluate(model, train_dataset, batch_size=100, max_batches=10)
                test_loss  = evaluate(model, test_dataset,  batch_size=100, max_batches=10)
                wandb.log({
                    "train/loss": train_loss,
                    "test/loss": test_loss,
                }, step=step)
                print(f"step {step} train loss: {train_loss:.4f} test loss: {test_loss:.4f}")

                # Save the model to disk if it has improved
                if best_loss is None or test_loss < best_loss:
                    out_path = os.path.join(config.work_dir, "model.pt")
                    print(f"test loss {test_loss:4f} is the best so far, saving model to {out_path}")
                    torch.save(model.state_dict(), out_path)
                    best_loss = test_loss

In [8]:
sweep_config = {
    'method': 'grid',
    'metric': {
        'name': 'test/loss',
        'goal': 'minimize'
        }
    }

parameters_dict = {
        'learning_rate': {
            'values': [1e-3]
            },
        'batch_size': {
            'values': [32, 64, 128, 256, 512]
            },}
parameters_dict.update({
    'n_layer': {'value': 4},
    'n_embd': {'value' : 64},
    'n_embd2': {'value' :64},
    'cell_type': {'value': 'rnn'},
    'weight_decay': {'value': 0.01},
    'input_file': {'value': 'data/names.txt'},
    'work_dir': {'value': 'out'},
    'num_workers': {'value': 2},
    'max_steps': {'value': 50_000},
    'device':{'value':'cuda' if torch.cuda.is_available() else 'cpu'},
    'seed': {'value': 3407},
    'top_k': {'value':  -1},
    })
sweep_config['parameters'] = parameters_dict

In [9]:
import pprint
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'minimize', 'name': 'test/loss'},
 'parameters': {'batch_size': {'values': [32, 64, 128, 256, 512]},
                'cell_type': {'value': 'rnn'},
                'device': {'value': 'cuda'},
                'input_file': {'value': 'data/names.txt'},
                'learning_rate': {'values': [0.001]},
                'max_steps': {'value': 50000},
                'n_embd': {'value': 64},
                'n_embd2': {'value': 64},
                'n_layer': {'value': 4},
                'num_workers': {'value': 2},
                'seed': {'value': 3407},
                'top_k': {'value': -1},
                'weight_decay': {'value': 0.01},
                'work_dir': {'value': 'out'}}}


In [10]:
sweep_id = wandb.sweep(sweep_config, project="char-rnn-bs-sweeps-50000-steps")

Create sweep with ID: 3dw7opq1
Sweep URL: https://wandb.ai/polyphony/char-rnn-bs-sweeps-50000-steps/sweeps/3dw7opq1


In [11]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: ghzmu5zs with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	input_file: data/names.txt
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_steps: 50000
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_embd2: 64
[34m[1mwandb[0m: 	n_layer: 4
[34m[1mwandb[0m: 	num_workers: 2
[34m[1mwandb[0m: 	seed: 3407
[34m[1mwandb[0m: 	top_k: -1
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: 	work_dir: out
[34m[1mwandb[0m: Currently logged in as: [33msmarginatura[0m ([33mpolyphony[0m). Use [1m`wandb login --relogin`[0m to force relogin


number of examples in the dataset: 32033
max word length: 15
number of unique characters in the vocabulary: 26
vocabulary:
abcdefghijklmnopqrstuvwxyz
split up the dataset into 31033 training examples and 1000 test examples

dataset determined that: vocab_size=27, block_size=16
model #params: 11803


  0%|          | 0/50000 [00:00<?, ?it/s]

step 500 train loss: 2.2501 test loss: 2.2297
test loss 2.229686 is the best so far, saving model to out/model.pt
step 1000 train loss: 2.2134 test loss: 2.1800
test loss 2.180049 is the best so far, saving model to out/model.pt
step 1500 train loss: 2.1632 test loss: 2.1539
test loss 2.153885 is the best so far, saving model to out/model.pt
step 2000 train loss: 2.1263 test loss: 2.1346
test loss 2.134612 is the best so far, saving model to out/model.pt
step 2500 train loss: 2.1561 test loss: 2.1211
test loss 2.121068 is the best so far, saving model to out/model.pt
step 3000 train loss: 2.1335 test loss: 2.1119
test loss 2.111885 is the best so far, saving model to out/model.pt
step 3500 train loss: 2.1088 test loss: 2.1016
test loss 2.101563 is the best so far, saving model to out/model.pt
step 4000 train loss: 2.1096 test loss: 2.1020
step 4500 train loss: 2.1060 test loss: 2.0904
test loss 2.090446 is the best so far, saving model to out/model.pt
step 5000 train loss: 2.0930 test 

VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded\r'), FloatProgress(value=0.14678433759383241, max=1.…

0,1
test/loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▂▁▂▂▁▂▁▁▂▁▁▁▂▁▁▂▁▂▂▂

0,1
test/loss,2.03571
train/loss,2.02062


[34m[1mwandb[0m: Agent Starting Run: fwlkvujd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	input_file: data/names.txt
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_steps: 50000
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_embd2: 64
[34m[1mwandb[0m: 	n_layer: 4
[34m[1mwandb[0m: 	num_workers: 2
[34m[1mwandb[0m: 	seed: 3407
[34m[1mwandb[0m: 	top_k: -1
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: 	work_dir: out


number of examples in the dataset: 32033
max word length: 15
number of unique characters in the vocabulary: 26
vocabulary:
abcdefghijklmnopqrstuvwxyz
split up the dataset into 31033 training examples and 1000 test examples

dataset determined that: vocab_size=27, block_size=16
model #params: 11803


  0%|          | 0/50000 [00:00<?, ?it/s]

step 500 train loss: 2.2258 test loss: 2.2018
test loss 2.201785 is the best so far, saving model to out/model.pt
step 1000 train loss: 2.1849 test loss: 2.1517
test loss 2.151726 is the best so far, saving model to out/model.pt
step 1500 train loss: 2.1393 test loss: 2.1268
test loss 2.126782 is the best so far, saving model to out/model.pt
step 2000 train loss: 2.1039 test loss: 2.1162
test loss 2.116246 is the best so far, saving model to out/model.pt
step 2500 train loss: 2.1299 test loss: 2.0970
test loss 2.097035 is the best so far, saving model to out/model.pt
step 3000 train loss: 2.1059 test loss: 2.0925
test loss 2.092492 is the best so far, saving model to out/model.pt
step 3500 train loss: 2.0853 test loss: 2.0867
test loss 2.086719 is the best so far, saving model to out/model.pt
step 4000 train loss: 2.0806 test loss: 2.0816
test loss 2.081596 is the best so far, saving model to out/model.pt
step 4500 train loss: 2.0833 test loss: 2.0784
test loss 2.078404 is the best so 

VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded\r'), FloatProgress(value=0.07677217538836753, max=1.…

0,1
test/loss,█▅▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▆▅▄▃▃▃▃▂▂▂▂▁▂▂▂▂▂▁▂▂▂▂▂▁▂▂▁▂▁▁▁▂▁▂▂▂▂▂▂

0,1
test/loss,2.03116
train/loss,2.00082


[34m[1mwandb[0m: Agent Starting Run: r5r50a2w with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	input_file: data/names.txt
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_steps: 50000
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_embd2: 64
[34m[1mwandb[0m: 	n_layer: 4
[34m[1mwandb[0m: 	num_workers: 2
[34m[1mwandb[0m: 	seed: 3407
[34m[1mwandb[0m: 	top_k: -1
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: 	work_dir: out


number of examples in the dataset: 32033
max word length: 15
number of unique characters in the vocabulary: 26
vocabulary:
abcdefghijklmnopqrstuvwxyz
split up the dataset into 31033 training examples and 1000 test examples

dataset determined that: vocab_size=27, block_size=16
model #params: 11803


  0%|          | 0/50000 [00:00<?, ?it/s]

step 500 train loss: 2.2037 test loss: 2.1787
test loss 2.178662 is the best so far, saving model to out/model.pt
step 1000 train loss: 2.1632 test loss: 2.1350
test loss 2.134984 is the best so far, saving model to out/model.pt
step 1500 train loss: 2.1153 test loss: 2.1073
test loss 2.107256 is the best so far, saving model to out/model.pt
step 2000 train loss: 2.0865 test loss: 2.0926
test loss 2.092555 is the best so far, saving model to out/model.pt
step 2500 train loss: 2.1056 test loss: 2.0845
test loss 2.084538 is the best so far, saving model to out/model.pt
step 3000 train loss: 2.0810 test loss: 2.0721
test loss 2.072052 is the best so far, saving model to out/model.pt
step 3500 train loss: 2.0649 test loss: 2.0670
test loss 2.066991 is the best so far, saving model to out/model.pt
step 4000 train loss: 2.0604 test loss: 2.0598
test loss 2.059821 is the best so far, saving model to out/model.pt
step 4500 train loss: 2.0668 test loss: 2.0587
test loss 2.058656 is the best so 

VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded\r'), FloatProgress(value=0.15316268246244977, max=1.…

0,1
test/loss,█▅▃▃▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▂▂▂▁▂▂▂▂▂▂▂▂

0,1
test/loss,2.02396
train/loss,1.9904


[34m[1mwandb[0m: Agent Starting Run: pthx7gqj with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	input_file: data/names.txt
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_steps: 50000
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_embd2: 64
[34m[1mwandb[0m: 	n_layer: 4
[34m[1mwandb[0m: 	num_workers: 2
[34m[1mwandb[0m: 	seed: 3407
[34m[1mwandb[0m: 	top_k: -1
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: 	work_dir: out


number of examples in the dataset: 32033
max word length: 15
number of unique characters in the vocabulary: 26
vocabulary:
abcdefghijklmnopqrstuvwxyz
split up the dataset into 31033 training examples and 1000 test examples

dataset determined that: vocab_size=27, block_size=16
model #params: 11803


  0%|          | 0/50000 [00:00<?, ?it/s]

step 500 train loss: 2.1812 test loss: 2.1615
test loss 2.161529 is the best so far, saving model to out/model.pt
step 1000 train loss: 2.1417 test loss: 2.1122
test loss 2.112188 is the best so far, saving model to out/model.pt
step 1500 train loss: 2.0934 test loss: 2.0873
test loss 2.087259 is the best so far, saving model to out/model.pt
step 2000 train loss: 2.0641 test loss: 2.0708
test loss 2.070768 is the best so far, saving model to out/model.pt
step 2500 train loss: 2.0793 test loss: 2.0617
test loss 2.061687 is the best so far, saving model to out/model.pt
step 3000 train loss: 2.0618 test loss: 2.0571
test loss 2.057112 is the best so far, saving model to out/model.pt
step 3500 train loss: 2.0500 test loss: 2.0514
test loss 2.051391 is the best so far, saving model to out/model.pt
step 4000 train loss: 2.0420 test loss: 2.0487
test loss 2.048651 is the best so far, saving model to out/model.pt
step 4500 train loss: 2.0399 test loss: 2.0451
test loss 2.045142 is the best so 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test/loss,█▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▁▂▂▂▁▂▁▂▂▂▂▂▂

0,1
test/loss,2.01692
train/loss,1.98488


[34m[1mwandb[0m: Agent Starting Run: lnni96uq with config:
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	device: cuda
[34m[1mwandb[0m: 	input_file: data/names.txt
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_steps: 50000
[34m[1mwandb[0m: 	n_embd: 64
[34m[1mwandb[0m: 	n_embd2: 64
[34m[1mwandb[0m: 	n_layer: 4
[34m[1mwandb[0m: 	num_workers: 2
[34m[1mwandb[0m: 	seed: 3407
[34m[1mwandb[0m: 	top_k: -1
[34m[1mwandb[0m: 	weight_decay: 0.01
[34m[1mwandb[0m: 	work_dir: out


number of examples in the dataset: 32033
max word length: 15
number of unique characters in the vocabulary: 26
vocabulary:
abcdefghijklmnopqrstuvwxyz
split up the dataset into 31033 training examples and 1000 test examples

dataset determined that: vocab_size=27, block_size=16
model #params: 11803


  0%|          | 0/50000 [00:00<?, ?it/s]

step 500 train loss: 2.1612 test loss: 2.1409
test loss 2.140901 is the best so far, saving model to out/model.pt
step 1000 train loss: 2.1200 test loss: 2.0896
test loss 2.089609 is the best so far, saving model to out/model.pt
step 1500 train loss: 2.0793 test loss: 2.0673
test loss 2.067284 is the best so far, saving model to out/model.pt
step 2000 train loss: 2.0443 test loss: 2.0553
test loss 2.055260 is the best so far, saving model to out/model.pt
step 2500 train loss: 2.0597 test loss: 2.0459
test loss 2.045858 is the best so far, saving model to out/model.pt
step 3000 train loss: 2.0437 test loss: 2.0398
test loss 2.039826 is the best so far, saving model to out/model.pt
step 3500 train loss: 2.0307 test loss: 2.0375
test loss 2.037510 is the best so far, saving model to out/model.pt
step 4000 train loss: 2.0289 test loss: 2.0328
test loss 2.032790 is the best so far, saving model to out/model.pt
step 4500 train loss: 2.0256 test loss: 2.0314
test loss 2.031450 is the best so 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test/loss,█▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▄▄▃▂▃▂▂▂▂▂▁▂▂▂▂▂▁▂▂▂▂▂▁▂▂▁▂▁▁▁▂▂▂▂▁▂▂▂

0,1
test/loss,2.0169
train/loss,1.97757


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
