In [None]:
import math
from math import log
import time
import torch
import numpy as np
from torch.optim import AdamW, SGD
from testbed import TextDataset, Trainer, Net0, Net1, Net2, Net3, Net4, Transformer
from testbed.optim import Sonny
from testbed.util import decode_broken_utf8, default_device, numel
from testbed.gui import Plot, StatsTicker, ParameterInspector, Histogram, SmoothPlot, LinePlot

In [None]:
# Selects which of the `if network_name == ...` configuration cells in this
# notebook actually builds `model`, `dataset` and the optimizer settings.
network_name = "Transformer"

In [None]:
def cuda_memory(device=0):
    """Print and return a memory snapshot for one CUDA device.

    Args:
        device: CUDA device (index or ``torch.device``) to query. Defaults to
            0, the device that was previously hard-coded.

    Returns:
        Tuple ``(free, allocated, reserved, total)`` as reported by
        ``torch.cuda``, where ``free = reserved - allocated`` is the memory
        that the caching allocator has reserved but is not currently handing
        out to tensors. It is NOT the free memory of the whole device.
    """
    total = torch.cuda.get_device_properties(device).total_memory
    reserved = torch.cuda.memory_reserved(device)
    allocated = torch.cuda.memory_allocated(device)
    free = reserved - allocated  # free inside reserved
    print(f"Total {total}. Reserved {reserved}. Allocated {allocated}. Free {free}.")
    # Order kept as (free, allocated, reserved, total) for existing callers.
    return (free, allocated, reserved, total)

def memory_allocated(device=0):
    """Return ``torch.cuda.memory_allocated`` for one CUDA device.

    Args:
        device: CUDA device (index or ``torch.device``) to query. Defaults to
            0, the device that was previously hard-coded.

    Returns:
        The value reported by ``torch.cuda.memory_allocated(device)``.
    """
    return torch.cuda.memory_allocated(device)

In [None]:
if network_name == "Net0":
    # Byte-level vocabulary: 256 possible UTF-8 byte values on input and output.
    num_input_classes = 256
    num_output_classes = 256

    # Architecture hyperparameters.
    embedding_dim = 32   # dimension of the embedding space (the embedding layer has 256 points in it)
    context_length = 32  # number of sequential bytes visible to the model (i.e. in the context)
    num_hidden = 8192    # hidden-layer hyperparameter of the network

    model = Net0(
        num_input_classes=num_input_classes,
        embedding_dim=embedding_dim,
        context_length=context_length,
        num_hidden=num_hidden,
        num_output_classes=num_output_classes,
        nonlinearity="sigmoid",
    )
    model = model.to(default_device())

    # One byte longer than the context (presumably the byte to predict; confirm in TextDataset/Trainer).
    example_length = context_length + 1
    dataset = TextDataset(example_length=example_length)

    # Training configuration.
    batch_size = 512  # examples per batch
    OptimizerType = Sonny
    optimizer_kwargs = {
        "eps": 1e-8,
        "weight_decay": 0.01,
    }

In [None]:
if network_name == "Net3":
    # Architecture hyperparameters for Net3.
    embedding_dim = 2
    context_length = 32
    num_hidden1 = 64
    num_hidden2 = 64

    model = Net3(
        embedding_dim=embedding_dim,
        context_length=context_length,
        num_hidden1=num_hidden1,
        num_hidden2=num_hidden2,
    )
    model = model.to(default_device())

    # One element longer than the context (presumably the prediction target; confirm in TextDataset/Trainer).
    example_length = context_length + 1
    dataset = TextDataset(example_length=example_length)

    # Training configuration: Sonny with no explicit keyword overrides.
    batch_size = 512
    OptimizerType = Sonny
    optimizer_kwargs = dict()

In [None]:
if network_name == "Net4":
    # Byte-level vocabulary: 256 possible UTF-8 byte values on input and output.
    num_input_classes = 256
    num_output_classes = 256

    # Architecture hyperparameters.
    embedding_dim = 128   # dimension of the embedding space (the embedding layer has 256 points in it)
    context_length = 128  # number of sequential bytes visible to the model (i.e. in the context)
    num_hidden = 4096     # hidden-layer hyperparameter of the network

    model = Net4(
        num_input_classes=num_input_classes,
        embedding_dim=embedding_dim,
        context_length=context_length,
        num_hidden=num_hidden,
        num_output_classes=num_output_classes,
        nonlinearity="GELU",
    )
    model = model.to(default_device())

    # One byte longer than the context (presumably the byte to predict; confirm in TextDataset/Trainer).
    example_length = context_length + 1
    dataset = TextDataset(example_length=example_length)

    # Training configuration.
    batch_size = 8192  # examples per batch
    OptimizerType = Sonny
    optimizer_kwargs = {
        "eps": 1e-4,
        "lr": 1e-4,
        "beta1": 0.9,
        "beta2": 0.999,
        "weight_decay": 1e-4,
    }

In [None]:
if network_name == "Transformer":
    # Build the Transformer and move it to the default device.
    model = Transformer(
        n_vocab=256,
        max_ctx=512,
        d_model=64,
        n_heads=8,
        d_ff=2048,
        n_layers=6,
    )
    model = model.to(default_device())

    # Example length is read back from the model: its maximum context plus one.
    example_length = model.max_ctx + 1
    dataset = TextDataset(example_length=example_length)

    # Training configuration.
    batch_size = 300  # examples per batch
    OptimizerType = Sonny
    optimizer_kwargs = {
        "eps": 1e-4,
        "lr": 1e-4,
        "beta1": 0.9,
        "beta2": 0.999,
        "weight_decay": 1e-4,
    }

In [None]:
# Wire the model and optimizer configuration chosen by the active
# `network_name` cell into a Trainer.
trainer = Trainer(
    model=model,
    example_length=example_length,
    batch_size=batch_size,
    OptimizerType=OptimizerType,
    optimizer_kwargs=optimizer_kwargs,
)

In [None]:
# Display numel(model) (helper from testbed.util; presumably the parameter
# count, TODO confirm) next to the model's self-reported name.
numel(model), model.name()

In [None]:
# NOTE(review): presumably launches training; whether this call blocks or runs
# in the background is not visible here. Confirm in testbed.Trainer.
trainer.start()

In [None]:
# Print the device each named parameter lives on.
# NOTE: `name` and `p` remain bound as notebook globals after this loop; a
# later cell reads `p` before rebinding it, so keep these names stable.
for (name, p) in model.named_parameters():
    print(name, p.device)

In [None]:
# Create a StatsTicker widget bound to the trainer (line-style plot) and
# display it as the cell output.
ticker = StatsTicker(trainer, kind='line')
ticker

In [None]:
# Second ticker plotting the trainer's 'compute_energy' statistic against
# 'compute_time'.
StatsTicker(trainer, x='compute_time', y='compute_energy')

In [None]:
# Override optimizer settings on the trainer.
# NOTE(review): batch_size here is 8192*32 = 262144, far larger than the
# batch_size values set in the configuration cells. Confirm this is intended.
trainer.set_optimizer_settings(lr=.0001, beta1=.9, beta2=.999, batch_size=8192*32, weight_decay=0.0001)

In [None]:
# Save the trainer (NOTE(review): destination and contents are decided inside
# testbed.Trainer.save and are not visible here).
trainer.save()

In [None]:
# NOTE(review): unexplained back-of-envelope arithmetic; the notebook never
# says what the operands mean. TODO: name the constants or delete this cell.
1788*512*128/248.94 * 20.0

In [None]:
# Show the smallest entry of every parameter tensor.
# `p` is kept as the loop name because it stays bound as a notebook global.
for p in model.parameters():
    print(p.min())

In [None]:
# Display the ten most recent entries of trainer.losses.
trainer.losses[-10:]

In [None]:
# Run the trainer's autocomplete. The trailing `pass` makes the cell end on a
# statement rather than an expression, so the return value is not echoed.
trainer.autocomplete()
pass

### SmoothPlot

In [None]:
# Gather (compute_time, mean_loss) pairs from the trainer's loss records into
# one 2-column array, then split it into the x and y series.
L = np.array([
    [record['compute_time'], record['mean_loss']]
    for record in trainer.losses
])
X, Y = L[:, 0], L[:, 1]
def smoother(data, lag):
    """Trailing moving average of ``data`` over windows of ``lag`` samples.

    Output element ``k`` is the mean of ``data[k+1 : k+lag+1]``, i.e. the
    average of the ``lag`` samples ending at index ``k + lag``. The very first
    window (``data[0:lag]``) is not emitted, so the result lines up with
    ``x[lag:]`` for an x-axis of the same length as ``data``.

    Args:
        data: 1-D sequence or array of numbers.
        lag: Window length; must be a positive integer.

    Returns:
        NumPy array with ``max(len(data) - lag, 0)`` elements.

    Raises:
        ValueError: If ``lag`` is smaller than 1. Without this check ``lag=0``
            fails with an opaque broadcasting error and negative values
            silently produce meaningless output.
    """
    if lag < 1:
        raise ValueError(f"lag must be a positive integer, got {lag!r}")
    cs = np.cumsum(data)
    # Difference of cumulative sums `lag` apart equals the sum over each window.
    return (cs[lag:] - cs[:-lag]) / lag

class SmoothPlot(LinePlot):
    """LinePlot of a series after moving-average smoothing.

    NOTE: this definition shadows the ``SmoothPlot`` imported from
    ``testbed.gui`` at the top of the notebook.

    Args:
        X: x-values, or the y-values when ``Y`` is omitted (the x-axis is then
            the sample index). ``None`` forwards ``X`` and ``Y`` to
            ``LinePlot`` untouched.
        Y: y-values to smooth; optional.
        lag: Window length passed to ``smoother``; the first ``lag`` x-values
            are dropped so both series have equal length.
        log: If truthy, plot against log base 2 of the x-values. (Inside this
            method the name hides ``math.log`` imported by the notebook.)
    """

    def __init__(self, X=None, Y=None, lag=100, log=None):
        # Nothing to smooth: hand the arguments straight to LinePlot.
        if X is None:
            super().__init__(X, Y)
            return

        if Y is None:
            # Single-series call: X holds the values, the index is the x-axis.
            ys = np.array(X)
            xs = np.array(range(len(X)))
        else:
            xs = np.array(X)
            ys = np.array(Y)

        xs = xs[lag:]
        ys = smoother(ys, lag)
        if log:
            xs = np.log(xs) / math.log(2)
        super().__init__(xs, ys)
# Plot Y against X (as defined earlier in this cell) smoothed over a
# 100-sample window, with a linear x-axis (log=False).
SmoothPlot(X, Y, lag=100, log=False)

In [None]:
# 24 * 3600 = 86400, the number of seconds in 24 hours.
# NOTE(review): scratch calculation; nothing else in the notebook uses it.
24*3600

## Benchmarking

In [None]:
import torch
import torch.utils.benchmark as benchmark
from torch.nn import Module, Embedding, Linear, CrossEntropyLoss, Softmax

def speed_test(B=8192, m=4096, n=1024, device='cuda', dtype=None, number=10):
    """Time the forward pass of one ``Linear(n, m)`` layer on a random batch.

    NOTE: this notebook defines ``speed_test`` a second time further down (a
    half-precision variant); whichever cell ran last is the one in effect.

    Args:
        B: Batch size (rows of the input).
        m: Output features of the layer.
        n: Input features of the layer.
        device: Device to run on. Defaults to ``'cuda'`` as before; pass
            ``'cpu'`` to run without a GPU.
        dtype: Parameter and input dtype. ``None`` keeps PyTorch's default.
        number: Iteration count handed to ``Timer.timeit``.

    Returns:
        The ``median`` of the measurement returned by ``Timer.timeit``, in seconds.
    """
    L = Linear(n, m).to(device=device, dtype=dtype)
    x = torch.randn(B, n, device=device, dtype=dtype)
    t = benchmark.Timer(
        stmt='L(x)',
        globals={'L': L, 'x': x})
    T = t.timeit(number)
    # A (B x n) @ (n x m) product costs one multiply and one add per term,
    # i.e. 2*B*m*n floating-point operations. Counting B*m*n (as this function
    # used to) reports multiply-accumulates and understates GFLOPS by 2x.
    # The B*m bias additions are negligible and left out.
    flops = 2 * B * m * n
    print(f'Benchmark:  {T.median* 1e6:>5.1f} us, {flops / (1E9*T.median)} GFLOPS')
    return T.median


In [None]:
# Run the benchmark with its default sizes (needs a CUDA device by default).
speed_test()

In [None]:
import torch
import torch.utils.benchmark as benchmark
from torch.nn import Module, Embedding, Linear, CrossEntropyLoss, Softmax

def speed_test(B=8192, m=4096, n=1024, device='cuda', dtype=torch.float16, number=20):
    """Time the forward pass of one half-precision ``Linear(n, m)`` layer.

    NOTE: this redefinition replaces the ``speed_test`` defined earlier in the
    notebook once this cell has run.

    Args:
        B: Batch size (rows of the input).
        m: Output features of the layer.
        n: Input features of the layer.
        device: Device to run on. Defaults to ``'cuda'`` as before; pass
            ``'cpu'`` to run without a GPU.
        dtype: Parameter and input dtype. Defaults to ``torch.float16``, which
            is what ``.half()`` previously selected.
        number: Iteration count handed to ``Timer.timeit``.

    Returns:
        The ``median`` of the measurement returned by ``Timer.timeit``, in seconds.
    """
    L = Linear(n, m).to(device=device, dtype=dtype)
    x = torch.randn(B, n, device=device, dtype=dtype)
    t = benchmark.Timer(
        stmt='L(x)',
        globals={'L': L, 'x': x})
    T = t.timeit(number)
    # A (B x n) @ (n x m) product costs one multiply and one add per term,
    # i.e. 2*B*m*n floating-point operations. Counting B*m*n (as this function
    # used to) reports multiply-accumulates and understates GFLOPS by 2x.
    # The B*m bias additions are negligible and left out.
    flops = 2 * B * m * n
    print(f'Benchmark:  {T.median* 1e6:>5.1f} us, {flops / (1E9*T.median)} GFLOPS')
    return T.median

# Run the half-precision benchmark with its default sizes (needs a CUDA device by default).
speed_test()


# Tokenization

In [None]:
from transformers import GPT2Tokenizer
# Load the pretrained "gpt2" tokenizer and display the token ids it assigns
# to a sample string.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer("Hello world")['input_ids']

In [None]:
# Same text with a leading space, to compare its token ids with the
# no-space version tokenized in the previous cell.
tokenizer(" Hello world")['input_ids']

In [None]:
# NOTE(review): interactive exploration leftover; prints the tokenizer's full
# help text. Consider removing it before sharing the notebook.
help(tokenizer)

In [None]:
# Display len(tokenizer) (presumably the vocabulary size; confirm in the
# transformers documentation).
len(tokenizer)

Idea: compute polynomial activation functions using Horner's method, with the polynomial coefficients as trainable parameters.

By the Weierstrass approximation theorem (later generalized by Stone — the mad-dog mathematicians I ought to remember), polynomials can approximate any continuous function on a closed, bounded interval uniformly. For less regular targets — I want to say $L^2$ — the concept of uniform convergence has to be tweaked: the polynomials converge in the $L^2$ norm, and, in the spirit of Egorov's theorem, for every $\delta > 0$ we can exclude a set of measure $\delta$ and have uniform convergence on what remains (at least along a subsequence).

But never mind these mathematical technicalities for once, because we don't need to care; we just need to know that the central idea is sound. It is. So we can do this. We don't have to join a religious camp of GELU vs. ReLU vs. sigmoid or whatever. Let the network choose its activation function itself.



In [None]:
# Dump every parameter tensor (shape and values) and report whether it holds
# any NaN. A filter on 'nonlinear.coefs' was once here but is disabled, so all
# parameters are printed.
for name, p in model.named_parameters():
    print(p.shape, p)
    print(torch.isnan(p).any())

In [None]:
from random import randrange
# Copy the 'nonlinear.coefs' parameter into a NumPy array `c` and print it.
# If the model has no parameter with that name, `c` is left untouched.
for name, p in model.named_parameters():
    if name != 'nonlinear.coefs':
        continue
    c = p.detach().cpu().numpy()
    print(c)

In [None]:
# Evaluate the polynomial sum_i c[i] * x**i on a grid over (0, 1) with step h
# and plot it. `c` comes from the coefficient-extraction cell.
h = 0.001
X = np.arange(h, 1, h)
Y = sum(coef * X**power for power, coef in enumerate(c))
LinePlot(X, Y)

In [None]:
# Same polynomial, evaluated at sin(x) for x on a grid over (-6.28, 6.28)
# with step h, and plotted against x.
h = 0.01
X = np.arange(-6.28 + h, 6.28, h)
Y = sum(coef * np.sin(X)**power for power, coef in enumerate(c))
LinePlot(X, Y)

In [None]:
from random import randrange
# Pick one random column of the 'nonlinear.coefs' parameter and plot the
# polynomial it defines on [0, 1].
for (name, p) in model.named_parameters():
    if name == 'nonlinear.coefs':
        # Draw the column index from the coefficient tensor itself. Drawing it
        # before the loop read whatever tensor an earlier cell had left in `p`,
        # so the index could be out of range for this parameter.
        n = randrange(p.shape[1])
        c = p[:,n].detach().cpu().numpy()
        print(c.shape)

h = .001
X = np.arange(0,1+h,h)
# Evaluate sum_i c[i] * x**i on the grid.
Y = sum( c[i] * X**i for i in range(len(c)))
LinePlot(X,Y)
