# alternate-models

> In this notebook, I train three alternate versions of the model, each starting from a different random seed. For each one, I keep the checkpoint whose losses are closest to those of the main model (train loss = 0.9334, validation loss = 1.5063).

In [None]:
# | hide
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from datetime import datetime
from functools import partial

In [None]:
#| hide
from fastcore.test import *
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from tqdm.auto import tqdm

In [None]:
# | hide

from transformer_experiments.dataset_split import split_text_dataset
from transformer_experiments.datasets.tinyshakespeare import (
    TinyShakespeareDataSet,
)
from transformer_experiments.environments import get_environment
from transformer_experiments.models.transformer import (
    block_size,
    TransformerLanguageModel
)
from transformer_experiments.models.transformer_training import (
    batch_size,
    estimate_loss,
    eval_interval,
    eval_iters,
    get_batch,
)
from transformer_experiments.tokenizers.char_tokenizer import CharacterTokenizer
from transformer_experiments.training_utils import CheckPointer, Trainer


In [None]:
# Resolve the current environment and print its name. Later cells read its
# `code_root` and `data_root` attributes to locate the input text and the
# directory where training artifacts are written.
environment = get_environment()
print(f"environment is {environment.name}")

environment is paperspace


In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device is {device}")

device is cuda


In [None]:
# Load the Tiny Shakespeare text from the repository's artifacts directory and
# build a CharacterTokenizer from that text. The tokenizer's `vocab_size` is
# used later to size each model.
ts = TinyShakespeareDataSet(environment.code_root / 'nbs/artifacts/input.txt')
tokenizer = CharacterTokenizer(ts.text)

In [None]:
# Split the dataset into training and validation data on the chosen device.
# train_pct=0.9 presumably means 90% train / 10% validation — confirm against
# split_text_dataset.
train_data, val_data = split_text_dataset(ts.text, tokenizer, train_pct=0.9, device=device)

In [None]:
# Bind the data splits, batching hyperparameters and device once, so the
# training code can request batches without repeating these arguments.
get_batch_func = partial(
    get_batch,
    train_data=train_data,
    val_data=val_data,
    batch_size=batch_size,
    block_size=block_size,
    device=device,
)

# Loss estimation draws its batches through the pre-bound batch function above.
estimate_loss_func = partial(
    estimate_loss,
    get_batch_func=get_batch_func,
    eval_iters=eval_iters,
)

In [None]:
# Root directory for everything this notebook writes.
# NOTE(review): unlike the later mkdir calls in this notebook, this one does
# not pass parents=True, so (assuming data_root is a pathlib.Path) it will
# presumably fail if data_root itself does not exist — confirm that is intended.
experiment_dir = environment.data_root / 'alternate-models'
experiment_dir.mkdir(exist_ok=True)

In [None]:
# Per-run directory named after today's date (YYYYMMDD), e.g.
# `model-training/20240112-training`. Re-running on a different day creates a
# new directory rather than reusing an earlier one.
training_root = experiment_dir / 'model-training' / f'{datetime.now().strftime("%Y%m%d")}-training'
training_root.mkdir(exist_ok=True, parents=True)

In [None]:
# Directory passed to each model's CheckPointer; the checkpoint selected after
# training is loaded back from here.
checkpoint_dir = training_root / 'training_checkpoints'
checkpoint_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Directory for the final artifacts: the saved state dict of the checkpoint
# selected for each alternate model.
outputs_dir = training_root / 'outputs'
outputs_dir.mkdir(exist_ok=True, parents=True)

## Training Model 1

In [None]:
# Index of the alternate model being trained in this section; it is embedded
# in the checkpoint filename prefix and in the output filename.
iteration = 1

In [None]:
# Seed PyTorch's global RNG immediately before constructing the model. Each
# alternate model in this notebook uses a different seed (1442 here).
torch.manual_seed(1442)
m = TransformerLanguageModel(vocab_size=tokenizer.vocab_size, device=device)

In [None]:
# Move the model to the target device. The result is assigned to `_` so the
# returned value is not echoed as the cell's output.
_ = m.to(device)

In [None]:
# Wire the model to its checkpointing and data helpers. The checkpointer is
# given checkpoint_dir and a per-model filename prefix derived from `iteration`.
checkpointer = CheckPointer(checkpoint_dir, f'shakespeare_{iteration}_checkpoint')
trainer = Trainer(
    model=m,
    checkpointer=checkpointer,
    get_batch_func=get_batch_func,
    estimate_loss_func=estimate_loss_func,
    iters_trained=0,  # fresh model: nothing trained yet
)

In [None]:
# Get a starting point: estimate the train and validation loss of the
# freshly initialized model before any training.
estimate_loss_func(m)

{'train': tensor(4.2780), 'val': tensor(4.2824)}

In [None]:
# Phase 1: start with a modest learning rate (3e-4) and train for 3500
# iterations with a fresh AdamW optimizer.
learning_rate = 3e-4
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
trainer.train(3500, optimizer, eval_interval=eval_interval)

  0%|          | 0/3500 [00:00<?, ?it/s]

step 499: train loss 1.7921, val loss 1.9340
step 999: train loss 1.4039, val loss 1.6217
step 1499: train loss 1.2720, val loss 1.5374
step 1999: train loss 1.1952, val loss 1.5002
step 2499: train loss 1.1385, val loss 1.4993
step 2999: train loss 1.0794, val loss 1.4871
step 3499: train loss 1.0229, val loss 1.4950


In [None]:
# Phase 2: reduce the learning rate tenfold (3e-4 -> 3e-5) and train another
# 3500 iterations to see if we can improve without overfitting.
# A new AdamW optimizer is constructed here, so the optimizer object from
# phase 1 is not reused.
learning_rate = 3e-5
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
trainer.train(3500, optimizer, eval_interval=eval_interval)

  0%|          | 0/3500 [00:00<?, ?it/s]

step 499: train loss 0.9652, val loss 1.4848
step 999: train loss 0.9445, val loss 1.4929
step 1499: train loss 0.9293, val loss 1.5038
step 1999: train loss 0.9129, val loss 1.5034
step 2499: train loss 0.8994, val loss 1.5133
step 2999: train loss 0.8875, val loss 1.5220
step 3499: train loss 0.8733, val loss 1.5258


In [None]:
# Pick the checkpoint with losses closest to our target (main model: train
# loss 0.9334, validation loss 1.5063). The checkpoint number (000009, i.e.
# 5000 iterations trained) was chosen by hand from the losses logged above and
# must be revisited if training is re-run and the losses change.
# The checkpoint is loaded onto the CPU regardless of the training device.
# NOTE(review): torch.load unpickles the file; only use it on checkpoints
# produced by this notebook or another trusted source.
checkpoint = torch.load(checkpoint_dir / f'shakespeare_{iteration}_checkpoint_000009.pt', map_location=torch.device('cpu'))
checkpoint['iters'], checkpoint['train_loss'], checkpoint['val_loss']

(5000, tensor(0.9293), tensor(1.5038))

In [None]:
# Save only the model weights (the checkpoint's 'model_state_dict' entry) to
# the outputs directory, named by date and model index.
# NOTE(review): the date is recomputed here rather than reused from
# training_root, so the two can differ if the run spans midnight.
target_filename = outputs_dir / f'shakespeare-{datetime.now().strftime("%Y%m%d")}-{iteration}.pt'
torch.save(checkpoint['model_state_dict'], target_filename)
print(f"Saved checkpoint to {target_filename}")

Saved checkpoint to /storage/alternate-models/model-training/20240112-training/outputs/shakespeare-20240112-1.pt


## Training Model 2

In [None]:
# Index of the alternate model being trained in this section; it is embedded
# in the checkpoint filename prefix and in the output filename.
iteration = 2

In [None]:
# Seed PyTorch's global RNG immediately before constructing the model. Each
# alternate model in this notebook uses a different seed (88 here).
torch.manual_seed(88)
m = TransformerLanguageModel(vocab_size=tokenizer.vocab_size, device=device)

In [None]:
# Move the model to the target device. The result is assigned to `_` so the
# returned value is not echoed as the cell's output.
_ = m.to(device)

In [None]:
# Wire the model to its checkpointing and data helpers. The checkpointer is
# given checkpoint_dir and a per-model filename prefix derived from `iteration`.
checkpointer = CheckPointer(checkpoint_dir, f'shakespeare_{iteration}_checkpoint')
trainer = Trainer(
    model=m,
    checkpointer=checkpointer,
    get_batch_func=get_batch_func,
    estimate_loss_func=estimate_loss_func,
    iters_trained=0,  # fresh model: nothing trained yet
)

In [None]:
# Get a starting point: estimate the train and validation loss of the
# freshly initialized model before any training.
estimate_loss_func(m)

{'train': tensor(4.3503), 'val': tensor(4.3544)}

In [None]:
# Phase 1: start with a modest learning rate (3e-4) and train for 3500
# iterations with a fresh AdamW optimizer.
learning_rate = 3e-4
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
trainer.train(3500, optimizer, eval_interval=eval_interval)

  0%|          | 0/3500 [00:00<?, ?it/s]

step 499: train loss 1.7591, val loss 1.9024
step 999: train loss 1.3939, val loss 1.6083
step 1499: train loss 1.2698, val loss 1.5196
step 1999: train loss 1.1861, val loss 1.4866
step 2499: train loss 1.1303, val loss 1.4812
step 2999: train loss 1.0706, val loss 1.4865
step 3499: train loss 1.0152, val loss 1.4986


In [None]:
# Phase 2: reduce the learning rate tenfold (3e-4 -> 3e-5) and train another
# 3500 iterations to see if we can improve without overfitting.
# A new AdamW optimizer is constructed here, so the optimizer object from
# phase 1 is not reused.
learning_rate = 3e-5
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
trainer.train(3500, optimizer, eval_interval=eval_interval)

  0%|          | 0/3500 [00:00<?, ?it/s]

step 499: train loss 0.9525, val loss 1.4957
step 999: train loss 0.9294, val loss 1.4991
step 1499: train loss 0.9119, val loss 1.5099
step 1999: train loss 0.8985, val loss 1.5203
step 2499: train loss 0.8825, val loss 1.5316
step 2999: train loss 0.8707, val loss 1.5292
step 3499: train loss 0.8560, val loss 1.5462


In [None]:
# Pick the checkpoint with losses closest to our target (main model: train
# loss 0.9334, validation loss 1.5063). The checkpoint number (000008, i.e.
# 4500 iterations trained) was chosen by hand from the losses logged above and
# must be revisited if training is re-run and the losses change.
# The checkpoint is loaded onto the CPU regardless of the training device.
# NOTE(review): torch.load unpickles the file; only use it on checkpoints
# produced by this notebook or another trusted source.
checkpoint = torch.load(checkpoint_dir / f'shakespeare_{iteration}_checkpoint_000008.pt', map_location=torch.device('cpu'))
checkpoint['iters'], checkpoint['train_loss'], checkpoint['val_loss']

(4500, tensor(0.9294), tensor(1.4991))

In [None]:
# Save only the model weights (the checkpoint's 'model_state_dict' entry) to
# the outputs directory, named by date and model index.
# NOTE(review): the date is recomputed here rather than reused from
# training_root, so the two can differ if the run spans midnight.
target_filename = outputs_dir / f'shakespeare-{datetime.now().strftime("%Y%m%d")}-{iteration}.pt'
torch.save(checkpoint['model_state_dict'], target_filename)
print(f"Saved checkpoint to {target_filename}")

Saved checkpoint to /storage/alternate-models/model-training/20240112-training/outputs/shakespeare-20240112-2.pt


## Training Model 3

In [None]:
# Index of the alternate model being trained in this section; it is embedded
# in the checkpoint filename prefix and in the output filename.
iteration = 3

In [None]:
# Seed PyTorch's global RNG immediately before constructing the model. Each
# alternate model in this notebook uses a different seed (99999 here).
torch.manual_seed(99999)
m = TransformerLanguageModel(vocab_size=tokenizer.vocab_size, device=device)

In [None]:
# Move the model to the target device. The result is assigned to `_` so the
# returned value is not echoed as the cell's output.
_ = m.to(device)

In [None]:
# Wire the model to its checkpointing and data helpers. The checkpointer is
# given checkpoint_dir and a per-model filename prefix derived from `iteration`.
checkpointer = CheckPointer(checkpoint_dir, f'shakespeare_{iteration}_checkpoint')
trainer = Trainer(
    model=m,
    checkpointer=checkpointer,
    get_batch_func=get_batch_func,
    estimate_loss_func=estimate_loss_func,
    iters_trained=0,  # fresh model: nothing trained yet
)

In [None]:
# Get a starting point: estimate the train and validation loss of the
# freshly initialized model before any training.
estimate_loss_func(m)

{'train': tensor(4.1027), 'val': tensor(4.1104)}

In [None]:
# Phase 1: start with a modest learning rate (3e-4) and train for 3500
# iterations with a fresh AdamW optimizer.
learning_rate = 3e-4
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
trainer.train(3500, optimizer, eval_interval=eval_interval)

  0%|          | 0/3500 [00:00<?, ?it/s]

step 499: train loss 1.7534, val loss 1.9022
step 999: train loss 1.3909, val loss 1.6158
step 1499: train loss 1.2663, val loss 1.5401
step 1999: train loss 1.1896, val loss 1.5024
step 2499: train loss 1.1243, val loss 1.4805
step 2999: train loss 1.0696, val loss 1.4846
step 3499: train loss 1.0183, val loss 1.4951


In [None]:
# Phase 2: reduce the learning rate tenfold (3e-4 -> 3e-5) and train another
# 3500 iterations to see if we can improve without overfitting.
# A new AdamW optimizer is constructed here, so the optimizer object from
# phase 1 is not reused.
learning_rate = 3e-5
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
trainer.train(3500, optimizer, eval_interval=eval_interval)

  0%|          | 0/3500 [00:00<?, ?it/s]

step 499: train loss 0.9545, val loss 1.4912
step 999: train loss 0.9339, val loss 1.4941
step 1499: train loss 0.9162, val loss 1.5063
step 1999: train loss 0.9015, val loss 1.5131
step 2499: train loss 0.8879, val loss 1.5214
step 2999: train loss 0.8756, val loss 1.5251
step 3499: train loss 0.8614, val loss 1.5348


In [None]:
# Pick the checkpoint with losses closest to our target (main model: train
# loss 0.9334, validation loss 1.5063). The checkpoint number (000008, i.e.
# 4500 iterations trained) was chosen by hand from the losses logged above and
# must be revisited if training is re-run and the losses change.
# The checkpoint is loaded onto the CPU regardless of the training device.
# NOTE(review): torch.load unpickles the file; only use it on checkpoints
# produced by this notebook or another trusted source.
checkpoint = torch.load(checkpoint_dir / f'shakespeare_{iteration}_checkpoint_000008.pt', map_location=torch.device('cpu'))
checkpoint['iters'], checkpoint['train_loss'], checkpoint['val_loss']

(4500, tensor(0.9339), tensor(1.4941))

In [None]:
# Save only the model weights (the checkpoint's 'model_state_dict' entry) to
# the outputs directory, named by date and model index.
# NOTE(review): the date is recomputed here rather than reused from
# training_root, so the two can differ if the run spans midnight.
target_filename = outputs_dir / f'shakespeare-{datetime.now().strftime("%Y%m%d")}-{iteration}.pt'
torch.save(checkpoint['model_state_dict'], target_filename)
print(f"Saved checkpoint to {target_filename}")

Saved checkpoint to /storage/alternate-models/model-training/20240112-training/outputs/shakespeare-20240112-3.pt
