In [None]:
import math
from math import log
import time
import torch
import numpy as np
from torch.optim import AdamW
from testbed import TextDataset, Trainer, Net0, Net1, Net2, Net3, Net4, Transformer
from testbed.optim import Sonny
from testbed.util import decode_broken_utf8, default_device, numel
from testbed.gui import Plot, StatsTicker, ParameterInspector, Histogram, SmoothPlot, LinePlot

In [None]:
def cuda_memory(device=0):
    """Print and return a snapshot of CUDA memory usage for one device.

    Args:
        device: CUDA device to query (anything accepted by the
            ``torch.cuda`` memory functions, e.g. an int index). Defaults to
            0, which is what this helper always queried before the parameter
            existed.

    Returns:
        A tuple ``(free, allocated, reserved, total)``. ``free`` is
        ``reserved - allocated``, i.e. the part of the reserved pool that is
        not currently allocated. Note that the tuple order differs from the
        order used in the printed line; it is kept as-is so existing callers
        that unpack it positionally keep working.
    """
    total = torch.cuda.get_device_properties(device).total_memory
    reserved = torch.cuda.memory_reserved(device)
    allocated = torch.cuda.memory_allocated(device)
    free = reserved - allocated  # free inside reserved
    print(f"Total {total}. Reserved {reserved}. Allocated {allocated}. Free {free}.")
    return (free, allocated, reserved, total)

def memory_allocated(device=0):
    """Return the CUDA memory currently allocated on ``device``.

    Thin wrapper around ``torch.cuda.memory_allocated``.

    Args:
        device: CUDA device to query. Defaults to 0, the device this helper
            always queried before the parameter existed.

    Returns:
        Whatever ``torch.cuda.memory_allocated(device)`` reports.
    """
    return torch.cuda.memory_allocated(device)

In [None]:

# Net0 experiment configuration. Set the flag to False to skip this setup and
# use one of the other model cells instead.
net0_example = True
if net0_example:
    # Byte-level vocabulary: 256 possible UTF-8 byte values on input and output.
    num_input_classes = 256
    num_output_classes = 256

    # Architecture hyperparameters.
    embedding_dim = 64   # dimension of the embedding space (the embedding layer has 256 points in it)
    context_length = 64  # number of sequential bytes visible to the model (i.e. in the context)
    num_hidden = 1024    # hidden-size hyperparameter of the network

    model = Net0(
        num_input_classes=num_input_classes,
        embedding_dim=embedding_dim,
        context_length=context_length,
        num_hidden=num_hidden,
        num_output_classes=num_output_classes,
    ).to(default_device())

    # Data / optimisation settings consumed by the Trainer cell further down.
    # example_length is one more than the context (presumably context plus the
    # byte to predict -- confirm against TextDataset).
    example_length = context_length + 1
    dataset = TextDataset(example_length=example_length)
    batch_size = 8192  # examples per batch
    OptimizerType = Sonny
    

In [None]:
# Show the most recent entry of trainer.losses.
# NOTE: `trainer` is only created in a later cell, so this cell raises
# NameError on a fresh Restart & Run All; it relies on out-of-order execution.
trainer.losses[-1]

In [None]:
# NOTE(review): scratch ratio of two hard-coded numbers whose origin is not
# recorded anywhere in the notebook (presumably two pasted measurements --
# confirm, or replace with named values).
12.708162306048019/8.323310613632202

In [None]:
# Print the CUDA memory summary; the returned tuple is echoed as the cell output.
cuda_memory()

In [None]:
# CUDA memory currently allocated on device 0, as reported by torch.
memory_allocated()

In [None]:
# Size of the current model as reported by testbed.util.numel
# (presumably its parameter count -- confirm).
numel(model)

In [None]:
# Net3 experiment configuration (disabled by default; flip the flag to use it).
net3_example = False
if net3_example:
    # Architecture hyperparameters.
    embedding_dim, context_length = 8, 256
    num_hidden1 = num_hidden2 = 256

    model = Net3(
        embedding_dim=embedding_dim,
        context_length=context_length,
        num_hidden1=num_hidden1,
        num_hidden2=num_hidden2,
    ).to(default_device())

    # Data / optimisation settings consumed by the Trainer cell further down.
    example_length = context_length + 1
    dataset = TextDataset(example_length=example_length)
    # NOTE(review): 8192*1024 examples per batch is very large compared with
    # the other configurations in this notebook -- confirm it is intended.
    batch_size = 8192 * 1024  # examples per batch
    OptimizerType = Sonny

In [None]:
# Net4 experiment configuration (disabled by default; flip the flag to use it).
net4_example = False
if net4_example:
    # Net4 constructor hyperparameters; their meaning is defined by testbed.Net4.
    # NOTE: the global name `L` is also assigned by the plotting cells below.
    E, L, M, H = 8, 1, 512, 2
    model = Net4(E=E, L=L, M=M, H=H).to(default_device())

    # Data / optimisation settings consumed by the Trainer cell further down.
    example_length = 64
    dataset = TextDataset(example_length=example_length)
    batch_size = 32  # examples per batch
    OptimizerType = Sonny

In [None]:
# Sanity check of whichever model the configuration cells above selected:
# its size as reported by numel, and the name the model reports for itself.
numel(model), model.name()

In [None]:
# Build the Trainer from whatever the enabled configuration cell defined
# (model, example_length, batch_size, dataset, OptimizerType).
trainer = Trainer(
    model=model,
    example_length=example_length,
    batch_size=batch_size,
    dataset=dataset,
    OptimizerType=OptimizerType,
)

In [None]:
# trainer.load()

In [None]:
# Disabled alternative to the cell that builds a fresh Trainer: construct the
# Trainer from a saved ".pt" file name instead (presumably a checkpoint written
# by trainer.save() -- confirm) and take the model from it.
# Change the condition to True to use it.
if False:
    trainer = Trainer("net0_H8192_L32_E32_BPC1.87.pt")
    model = trainer.model

In [None]:
# Size of the model actually held by the trainer (matters when the trainer was
# built from a saved file rather than from `model`).
numel(trainer.model)

In [None]:
# Start training. Later cells inspect and retune `trainer` afterwards, so this
# presumably returns while training continues in the background -- confirm.
trainer.start()

In [None]:
# Create a line-style StatsTicker for the trainer and display it as the cell output.
ticker = StatsTicker(trainer, kind='line')
ticker

In [None]:
# Second ticker plotting the trainer's 'compute_energy' against 'compute_time'.
StatsTicker(trainer, x='compute_time', y='compute_energy')

In [None]:
# NOTE(review): scratch arithmetic with undocumented constants (evaluates to
# 540). Presumably a throughput estimate in units of 1e9 -- record where
# 13.5E12 and 25 come from.
13.5E12 / 25 / 1E9

In [None]:
# Manual tuning knob: change the optimizer's learning rate on the existing trainer.
trainer.set_optimizer_settings(lr=.001)

In [None]:
# Manual tuning knob: switch the trainer to a batch size of 8192*32 = 262144 examples.
trainer.set_batch_size(8192*32)

In [None]:
# Persist the trainer via its own save() method (destination is chosen by Trainer).
trainer.save()

In [None]:
# Show the ten most recent entries of trainer.losses.
trainer.losses[-10:]

In [None]:
# Run the trainer's built-in autocomplete (presumably prints a sample completion).
trainer.autocomplete()
pass  # end the cell on a statement so no return value is echoed as cell output

## Some Plots

## Plot

Just a simple plot function: a one-liner.

In [None]:
# Turn the trainer's loss log into a 2-D array and plot column 2 against column 0.
# NOTE: the global name `L` is also assigned by the Net4 configuration cell.
L = np.array(trainer.losses)
X = L[:,0]
# 8/ln(256) == 1/ln(2): rescales column 2 from natural-log units to base-2
# units (presumably a nats-per-byte loss shown as bits per byte -- confirm).
Y = 8*L[:,2]/log(256)
LinePlot(X,Y)

### SmoothPlot

In [None]:
# Same data preparation as the plain plot above, except that column 1 (not
# column 0) of the loss log is used as the x-axis.
L = np.array(trainer.losses)
X = L[:,1]
# 8/ln(256) == 1/ln(2): rescales column 2 from natural-log units to base-2 units.
Y = 8*L[:,2]/log(256)
def smoother(data, lag):
    """Moving average of ``data`` over windows of ``lag`` consecutive samples.

    Implemented with a cumulative sum. Output element ``i`` is the mean of
    ``data[i+1 : i+lag+1]``, so the result has ``len(data) - lag`` entries and
    lines up with ``X[lag:]`` of a matching x-axis. The window made of the
    very first ``lag`` samples is not emitted.

    Args:
        data: 1-D array-like of numbers.
        lag: window length; must be a positive integer.

    Returns:
        NumPy array of window means (empty when ``lag >= len(data)``).

    Raises:
        ValueError: if ``lag`` is not positive. Previously a negative lag
            silently produced meaningless values and ``lag == 0`` failed with
            an unrelated broadcasting error.
    """
    if lag < 1:
        raise ValueError(f"lag must be a positive integer, got {lag!r}")
    cs = np.cumsum(data)
    # cs[k + lag] - cs[k] is the sum of the `lag` samples following index k.
    return (cs[lag:] - cs[:-lag]) / lag

class SmoothPlot(LinePlot):
    """LinePlot of a series after moving-average smoothing.

    NOTE: this notebook-local class shadows the ``SmoothPlot`` imported from
    ``testbed.gui`` at the top of the notebook.
    """

    def __init__(self, X=None, Y=None, lag=100, log=None):
        """Smooth the data with ``smoother`` and hand it to ``LinePlot``.

        Args:
            X: x values, or the series itself when ``Y`` is omitted. When
                ``None``, both arguments are forwarded to ``LinePlot`` untouched.
            Y: y values. When omitted, ``X`` is used as the series and the
                sample indices ``0..len(X)-1`` become the x-axis.
            lag: moving-average window length passed to ``smoother``.
            log: if truthy, the x-axis is converted to log base 2.
        """
        if X is None:
            # Nothing to smooth: forward the arguments unchanged.
            super().__init__(X, Y)
            return

        if Y is not None:
            xs, ys = np.array(X), np.array(Y)
        else:
            # Only one series given: it is the y data, indexed by position.
            ys = np.array(X)
            xs = np.array(range(len(X)))

        # smoother() returns len - lag points, so drop the first `lag` x values
        # to keep both axes aligned.
        xs = xs[lag:]
        ys = smoother(ys, lag)

        if log:
            # The `log` parameter shadows math.log inside this method, hence
            # the explicit module-qualified call.
            xs = np.log(xs) / math.log(2)

        super().__init__(xs, ys)
# Plot the loss smoothed over 1000-sample windows, with a log2 x-axis.
SmoothPlot(X, Y, lag=1000, log=True)

In [None]:
# Scratch calculation: number of seconds in 24 hours (86400).
24*3600

### Autocomplete for Net4 (and, more generally, RNNs; needs to be improved)

In [None]:
def autocomplete(prompt="", output_length=32):
    """Sample a continuation of ``prompt`` byte by byte and print it.

    Uses the notebook-global ``trainer``. At every step the model's ``probs``
    is evaluated on the whole sequence so far and its last row is used as the
    categorical distribution for the next byte (so the work per step grows
    with the sequence length).

    Args:
        prompt: text to continue; encoded as UTF-8 before being fed to the model.
        output_length: number of bytes to sample.

    Returns:
        None. The prompt, a ``~AUTOCOMPLETE~`` separator line, and the sampled
        bytes are printed after decoding with ``decode_broken_utf8``.
    """
    encoded_prompt = bytes(prompt, encoding='utf-8')
    sequence = encoded_prompt
    for _ in range(output_length):
        probs = trainer.model.probs(sequence)
        next_byte_dist = torch.distributions.Categorical(torch.tensor(probs[-1]))
        next_byte = next_byte_dist.sample().item()
        sequence += bytes([next_byte])

    # Everything past the original prompt is what the model generated.
    generated = sequence[len(encoded_prompt):]
    separator = bytes("\n~AUTOCOMPLETE~\n", 'utf-8')
    print(decode_broken_utf8(encoded_prompt + separator + generated))

In [None]:
# Try the notebook-level autocomplete defined above on a short prompt.
autocomplete("I imagine")

## Benchmarking

In [None]:
import torch
import torch.utils.benchmark as benchmark
from torch.nn import Module, Embedding, Linear, CrossEntropyLoss, Softmax

def speed_test(B=8192, m=4096, n=1024, device='cuda', dtype=None, number=10):
    """Time the forward pass of one ``Linear(n, m)`` layer on a ``(B, n)`` input.

    Args:
        B: number of rows in the random input batch.
        m: output features of the layer.
        n: input features of the layer.
        device: device for the layer and the input. Defaults to ``'cuda'``,
            the value that used to be hard-coded.
        dtype: dtype for the layer and the input. ``None`` keeps torch's
            default dtype, matching the previous behaviour.
        number: number of executions passed to ``Timer.timeit``. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        ``T.median`` of the benchmark measurement (seconds per call; the
        printed line converts it to microseconds).

    NOTE: a later cell in this notebook redefines ``speed_test`` for float16.
    """
    layer = Linear(n, m).to(device=device, dtype=dtype)
    x = torch.randn(B, n, device=device, dtype=dtype)
    timer = benchmark.Timer(
        stmt='L(x)',
        globals={'L': layer, 'x': x})
    T = timer.timeit(number)
    # NOTE(review): B*m*n counts one operation per multiply-accumulate. If a
    # multiply and an add are counted separately the figure would be 2*B*m*n,
    # i.e. twice the printed value. Left unchanged so numbers stay comparable
    # with earlier runs.
    print(f'Benchmark:  {T.median* 1e6:>5.1f} us, {B*m*n / (1E9*T.median)} GFLOPS')
    return T.median


In [None]:
# Run the benchmark defined in the previous cell with its default sizes.
speed_test()

In [None]:
import torch
import torch.utils.benchmark as benchmark
from torch.nn import Module, Embedding, Linear, CrossEntropyLoss, Softmax

def speed_test(B=8192, m=4096, n=1024, device='cuda', dtype=torch.float16, number=20):
    """Half-precision variant of the ``Linear(n, m)`` forward-pass benchmark.

    NOTE: this definition replaces the ``speed_test`` defined in an earlier
    cell of the notebook (same name, different default dtype and repetitions).

    Args:
        B: number of rows in the random input batch.
        m: output features of the layer.
        n: input features of the layer.
        device: device for the layer and the input. Defaults to ``'cuda'``,
            the value that used to be hard-coded.
        dtype: dtype for the layer and the input. Defaults to
            ``torch.float16``, the value that used to be hard-coded.
        number: number of executions passed to ``Timer.timeit``. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        ``T.median`` of the benchmark measurement (seconds per call; the
        printed line converts it to microseconds).
    """
    layer = Linear(n, m).to(device=device, dtype=dtype)
    x = torch.randn(B, n, device=device, dtype=dtype)
    timer = benchmark.Timer(
        stmt='L(x)',
        globals={'L': layer, 'x': x})
    T = timer.timeit(number)
    # NOTE(review): B*m*n counts one operation per multiply-accumulate. If a
    # multiply and an add are counted separately the figure would be 2*B*m*n,
    # i.e. twice the printed value. Left unchanged so numbers stay comparable
    # with earlier runs.
    print(f'Benchmark:  {T.median* 1e6:>5.1f} us, {B*m*n / (1E9*T.median)} GFLOPS')
    return T.median

# Run the float16 benchmark defined just above with its default sizes.
speed_test()
