# Bilinear attention formulation test

In [1]:
import argparse
import random

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# Seed before the project modules are imported, as in the original cell order.
random.seed(0)

import dataset
import model
import trainer
import utils

# Stable aliases for the project modules: later cells rebind the names `model`
# and `trainer` to instances, so the modules themselves are reached through these.
import model as model_lib
import trainer as trainer_lib

# Choose where the model will live: the current CUDA device index when a GPU
# is visible to torch, otherwise the string 'cpu'.
if torch.cuda.is_available():
    device = torch.cuda.current_device()
else:
    device = 'cpu'

Reference command line for the vanilla pretraining run:

`python src/run.py pretrain vanilla wiki.txt --writing_params_path vanilla.pretrain.params`

In [5]:
# Settings for the vanilla-attention run.
tb_expt_name = 'ipynb'
function = 'pretrain'
variant = 'vanilla'
pretrain_corpus_path = '../wiki.txt'
bottleneck_dim = 32
pretrain_lr = 6e-3
finetune_lr = 6e-4

# TensorBoard training log; the log directory name encodes the settings above.
writer = SummaryWriter(log_dir='expt/%s/%s_%s_%d_pt_lr_%f_ft_lr_%f' % (
    function,
    tb_expt_name,
    variant,
    bottleneck_dim,
    pretrain_lr,
    finetune_lr))

# Keep the block size 128
# Why is the pretraining corpus always required (even if we're not pretraining?)
# It's because we're using it as a hack to always have the same vocabulary
# (that is, the same mapping from character to integer, and we build the
# vocab from the pretraining corpus.)
block_size = 128
# Read the corpus inside a context manager so the file handle is closed
# as soon as the text is in memory.
with open(pretrain_corpus_path, encoding='utf-8') as corpus_file:
    text = corpus_file.read()
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)

# We don't suggest you change these hyperparameters, as they're known to work.
# The same values are used for both model variants built in this notebook.
# GPTConfig is reached through the module alias so this cell can be re-run
# even after the name `model` has been rebound to a model instance.
mconf = model_lib.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
    n_layer=4, n_head=8, n_embd=256)

data has 418352 characters, 256 unique.


In [6]:
# Construct the vanilla GPT model (built here to report its parameter count
# and device). It is bound to `vanilla_model` rather than `model`, so the
# imported `model` module is not shadowed and the cells further down still
# work under Restart & Run All.
vanilla_model = model_lib.GPT(mconf)
vanilla_model.to(device)

print('Model on device: ', next(vanilla_model.parameters()).device)

number of parameters: 3323392
Model on device:  cpu


Vanilla, number of parameters: 3323392

Reference command line for the bilinear pretraining run:

`python src/run.py pretrain bilinear wiki.txt --writing_params_path bilinear.pretrain.params`

In [2]:
# Settings for the bilinear-attention run.
tb_expt_name = 'ipynb'
function = 'pretrain'
variant = 'bilinear'
pretrain_corpus_path = '../wiki.txt'
bottleneck_dim = 32
pretrain_lr = 6e-3
finetune_lr = 6e-4

# TensorBoard training log; the log directory name encodes the settings above.
writer = SummaryWriter(log_dir='expt/%s/%s_%s_%d_pt_lr_%f_ft_lr_%f' % (
    function,
    tb_expt_name,
    variant,
    bottleneck_dim,
    pretrain_lr,
    finetune_lr))

# Keep the block size 128
# Why is the pretraining corpus always required (even if we're not pretraining?)
# It's because we're using it as a hack to always have the same vocabulary
# (that is, the same mapping from character to integer, and we build the
# vocab from the pretraining corpus.)
block_size = 128
# Read the corpus inside a context manager so the file handle is closed
# as soon as the text is in memory.
with open(pretrain_corpus_path, encoding='utf-8') as corpus_file:
    text = corpus_file.read()
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)

# We don't suggest you change these hyperparameters, as they're known to work.
# The same values are used for both model variants built in this notebook.
# GPTConfig is reached through the module alias so this cell does not depend
# on `model` still referring to the module when it runs.
mconf = model_lib.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
    n_layer=4, n_head=8, n_embd=256)

data has 418352 characters, 256 unique.


In [3]:
# Construct the bilinear GPT model: flip the flag on the shared config, then
# build from it.
mconf.bilinear = True
# Build through the module alias. The instance keeps the name `model` because
# the training and saving cells refer to it, but that name shadows the module,
# so going through the alias keeps this cell re-runnable.
model = model_lib.GPT(mconf)
model.to(device)

print('Model on device: ', next(model.parameters()).device)

number of parameters: 3060224
Model on device:  cpu


Bilinear, number of parameters: 3060224.
That is about 8% fewer than the vanilla model (3323392).

In [6]:
# Build the pretraining dataset and the trainer for the model constructed above.
assert pretrain_corpus_path is not None
# Open the corpus; the context manager closes the file handle once the text is read.
with open(pretrain_corpus_path, encoding='utf-8') as corpus_file:
    pretrain_text = corpus_file.read()
# Create a dataset
pretrain_dataset = dataset.CharCorruptionDataset(pretrain_text, block_size)
# Initialize the trainer
tconf = trainer_lib.TrainerConfig(max_epochs=650,
                                  batch_size=128,
                                  learning_rate=pretrain_lr,
                                  lr_decay=True,
                                  warmup_tokens=512*20,
                                  final_tokens=200*len(pretrain_dataset)*block_size,
                                  num_workers=0,
                                  writer=writer)
# The instance keeps the name `trainer` because the training cell calls
# `trainer.train()`. That name shadows the module, so both the config and the
# Trainer are built through the module alias to keep this cell re-runnable.
# NOTE(review): the third argument is passed as None; presumably an optional
# evaluation dataset, confirm against trainer.Trainer.
trainer = trainer_lib.Trainer(model, pretrain_dataset, None, tconf)

data has 418352 characters, 256 unique.


In [None]:
# Destination for the pretrained weights (this is the value given to
# --writing_params_path when running from the command line).
writing_params_path = 'bilinear.pretrain.params'

# Run pretraining with the trainer configured in the previous cell.
trainer.train()

# Write the model's state dict to the destination file.
torch.save(model.state_dict(), writing_params_path)