In [1]:
import math
from math import log
import time
import torch
import numpy as np
from torch.optim import AdamW, SGD
from testbed import TextDataset, Trainer, Net0, Net1, Net2, Net3, Net4, Transformer
from testbed.optim import Sonny
from testbed.util import decode_broken_utf8, default_device, numel
from testbed.gui import Plot, StatsTicker, ParameterInspector, Histogram, SmoothPlot, LinePlot

In [2]:
def cuda_memory(device=0):
    """Print and return CUDA memory statistics for one device.

    Args:
        device: CUDA device index (or ``torch.device``) to query. Defaults
            to 0, which is the device this helper always used before.

    Returns:
        A 4-tuple ``(free, allocated, reserved, total)`` in bytes.
        ``free`` is ``reserved - allocated``, i.e. memory held by PyTorch's
        caching allocator that is not currently backing a tensor. It is
        NOT the amount of memory free on the whole device.
    """
    total = torch.cuda.get_device_properties(device).total_memory
    reserved = torch.cuda.memory_reserved(device)
    allocated = torch.cuda.memory_allocated(device)
    free = reserved - allocated  # free inside reserved
    print(f"Total {total}. Reserved {reserved}. Allocated {allocated}. Free {free}.")
    return (free, allocated, reserved, total)

def memory_allocated(device=0):
    """Return the bytes currently occupied by tensors on a CUDA device.

    Args:
        device: CUDA device index (or ``torch.device``) to query. Defaults
            to 0, which is the device this helper always used before.

    Returns:
        The value of ``torch.cuda.memory_allocated(device)``.
    """
    return torch.cuda.memory_allocated(device)

In [3]:
# Optional configuration: Net0. Set the flag to True to have this cell define
# `model`, `example_length`, `batch_size`, `OptimizerType` and
# `optimizer_kwargs`, which the Trainer cell reads.
net0_example = False
if net0_example:
    # Inputs and outputs are both drawn from the 256 possible UTF-8 byte values.
    num_input_classes = 256
    num_output_classes = 256
    # Dimension of embedding space; an embedding layer has 256 points in it.
    embedding_dim = 32
    # Number of sequential bytes visible to the model (i.e. in the context).
    context_length = 128
    # Hyperparameter for the neural network.
    num_hidden = 512

    model = Net0(
        num_input_classes=num_input_classes,
        embedding_dim=embedding_dim,
        context_length=context_length,
        num_hidden=num_hidden,
        num_output_classes=num_output_classes,
    )
    model = model.to(default_device())

    # Each example is one byte longer than the context.
    example_length = context_length + 1
    dataset = TextDataset(example_length=example_length)

    # Examples per batch.
    batch_size = 512
    OptimizerType = Sonny
    optimizer_kwargs = {"eps": 1e-4}

In [4]:
# Active configuration: Net3. While the flag is True this cell defines
# `model`, `example_length`, `batch_size`, `OptimizerType` and
# `optimizer_kwargs`, which the Trainer cell reads.
net3_example = True
if net3_example:
    embedding_dim, context_length = 64, 128
    num_hidden1, num_hidden2 = 8192, 4096

    model = Net3(
        embedding_dim=embedding_dim,
        context_length=context_length,
        num_hidden1=num_hidden1,
        num_hidden2=num_hidden2,
    )
    model = model.to(default_device())

    # Each example is one byte longer than the context.
    example_length = context_length + 1
    dataset = TextDataset(example_length=example_length)

    batch_size = 512
    OptimizerType = Sonny
    # No overrides: the optimizer is built with its own defaults.
    optimizer_kwargs = {}

Loaded 1048576 bytes of training data.


In [5]:
# Optional configuration: Net4. Set the flag to True to have this cell define
# `model`, `example_length`, `batch_size`, `OptimizerType` and
# `optimizer_kwargs`, which the Trainer cell reads.
net4_example=False
if net4_example:
    # Net4 constructor arguments; their meaning is defined by testbed.Net4.
    E=8
    L=1
    M=512
    H=2
    model = Net4(E=E,L=L,M=M,H=H).to(default_device())
    example_length = 64
    dataset = TextDataset(example_length=example_length)
    batch_size = 32
    OptimizerType = Sonny
    # The Trainer cell passes optimizer_kwargs unconditionally. Without this
    # line, enabling only this example raises NameError on a fresh kernel,
    # or silently reuses kwargs left behind by another example cell.
    optimizer_kwargs = {}

In [6]:
# memory_summary() returns a multi-line report string; print() renders it as a table.
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  398449 KB |  398449 KB |  398449 KB |       0 B  |
|       from large pool |  397312 KB |  397312 KB |  397312 KB |       0 B  |
|       from small pool |    1137 KB |    1137 KB |    1137 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |  398449 KB |  398449 KB |  398449 KB |       0 B  |
|       from large pool |  397312 KB |  397312 KB |  397312 KB |       0 B  |
|       from small pool |    1137 KB |    1137 KB |    1137 KB |       0 B  |
|---------------------------------------------------------------

In [7]:
# Build the Trainer from whichever example cell above was enabled: that cell
# supplies the model, example length, batch size and optimizer choice.
trainer = Trainer(
    model=model,
    example_length=example_length,
    batch_size=batch_size,
    OptimizerType=OptimizerType,
    optimizer_kwargs=optimizer_kwargs,
)

Loaded 1048576 bytes of training data.


In [8]:
# Display numel(model) next to the model's name string.
# NOTE(review): numel presumably counts the model's parameters — confirm in testbed.util.
numel(model), model.name()

(101740800, 'Net3(64,128,8192,4096)')

In [9]:
# Start the trainer.
# NOTE(review): presumably training proceeds in the background, since later cells
# query and reconfigure the trainer — confirm in testbed.Trainer.
trainer.start()

In [10]:
# Create a StatsTicker for the trainer with kind='line'; ending the cell with the
# bare name asks the notebook to display it.
ticker = StatsTicker(trainer, kind='line')
ticker



In [None]:
# A second ticker using the 'compute_time' and 'compute_energy' fields of the trainer's stats.
# NOTE(review): presumably x/y select the plot axes — confirm in testbed.gui.StatsTicker.
StatsTicker(trainer, x='compute_time', y='compute_energy')

In [None]:
# Set the optimizer's `lr` setting to 0.0005 on the existing trainer.
trainer.set_optimizer_settings(lr=.0005)

In [22]:
# Change the trainer's batch size from the configured value to 2048.
trainer.set_batch_size(2048)

In [None]:
# Save the trainer.
# NOTE(review): what is saved and where is decided inside testbed.Trainer.save — not visible here.
trainer.save()

In [14]:
# Back-of-envelope arithmetic with unlabeled operands.
# NOTE(review): presumably steps * batch_size * context_length / elapsed_seconds, scaled by 20 —
# confirm what each number stands for before relying on this figure.
1788*512*128/248.94 * 20.0

9414185.586888406

In [30]:
# Show the ten most recent entries of trainer.losses.
trainer.losses[-10:]

[{'step': 28590,
  'compute_time': 5626.298417568207,
  'compute_energy': 7658.217678961376,
  'mean_loss': 2.0588414669036865,
  'var_loss': 8.652761459350586,
  'num_examples': 2048},
 {'step': 28591,
  'compute_time': 5626.675753831863,
  'compute_energy': 7658.84267377328,
  'mean_loss': 2.1753153800964355,
  'var_loss': 9.61489486694336,
  'num_examples': 2048},
 {'step': 28592,
  'compute_time': 5627.055674791336,
  'compute_energy': 7659.467668585184,
  'mean_loss': 2.1457653045654297,
  'var_loss': 9.309741020202637,
  'num_examples': 2048},
 {'step': 28593,
  'compute_time': 5627.44410943985,
  'compute_energy': 7660.092663397088,
  'mean_loss': 2.0889229774475098,
  'var_loss': 9.033828735351562,
  'num_examples': 2048},
 {'step': 28594,
  'compute_time': 5627.8496124744415,
  'compute_energy': 7660.717658208991,
  'mean_loss': 2.1566338539123535,
  'var_loss': 10.147834777832031,
  'num_examples': 2048},
 {'step': 28595,
  'compute_time': 5628.222664117813,
  'compute_energy

In [34]:
# Ask the trainer for a sample completion from the model.
# NOTE(review): the trailing `pass` presumably stops the notebook from echoing a
# return value as the cell result — confirm against Trainer.autocomplete.
trainer.autocomplete()
pass

as proclaimed in articulate words along with them.
The symbol and the accompanying voice of God in all other cases have
one an
~AUTOCOMPLETE~
d stated by the New York or though to see house to thicken varian more
places; cation which he seems, and would this breathemen. Else we
causeful I happened tormolook breakfast. I have a dulet men be pioved only.

This haunt them stample in a company for time door.

"The eyes first alone, shall see; then Newfore matter genetry, out of
practice arousance of his arm has just unwood.  I speak, or crowd yair,
to matter in order than he wrote water Tretchily period, when the tool
on a declinning to Parner-Befrails of Mrs.
Thick, orants when a grow is got and also smid of "Phadootehousand
     rou_ of stif _no, 9_ and stood is.
You jenely!
   "'[ix_ it by Temple? Georges and she die no crussian is Rechnimient, and I came and such with your Even the
nich as with a woman country-steamed that court flower whose dure; as he
when and amend 

### SmoothPlot

In [25]:
# Collect (compute_time, mean_loss) pairs from the training log into one 2-D
# array, then split it into the x and y series used for plotting.
L = np.array([(rec['compute_time'], rec['mean_loss']) for rec in trainer.losses])
X, Y = L[:, 0], L[:, 1]
def smoother(data, lag):
    """Trailing moving average of ``data`` over windows of ``lag`` samples.

    Output element ``i`` is the mean of ``data[i + 1 : i + lag + 1]``, so the
    result has ``len(data) - lag`` elements and lines up with ``x[lag:]`` of
    the matching x-axis. The very first window, ``data[:lag]``, is not
    emitted.

    Args:
        data: 1-D sequence or array of numbers.
        lag: Window length in samples; must be at least 1.

    Returns:
        A NumPy array of window means. It is empty when ``lag`` is greater
        than or equal to ``len(data)``.

    Raises:
        ValueError: If ``lag`` is less than 1. A negative lag would otherwise
            slice from the wrong end and silently return meaningless values,
            and a zero lag would divide by zero.
    """
    if lag < 1:
        raise ValueError(f"lag must be a positive integer, got {lag!r}")
    # A difference of cumulative sums gives every window sum in one pass.
    cs = np.cumsum(data)
    return (cs[lag:] - cs[:-lag]) / lag

class SmoothPlot(LinePlot):
    """LinePlot whose y-series is smoothed with ``smoother`` before plotting.

    Note: this definition rebinds the name ``SmoothPlot`` that the first cell
    imports from ``testbed.gui``.
    """

    def __init__(self, X=None, Y=None, lag=100, log=None):
        """Smooth the series and hand the result to ``LinePlot``.

        Args:
            X: x-values, or the data itself when ``Y`` is omitted. If None,
                both arguments are forwarded to ``LinePlot`` unchanged.
            Y: y-values. When None, ``X`` is treated as the data and the
                x-axis becomes the sample index.
            lag: Window length passed to ``smoother``; the first ``lag``
                x-values are dropped so both series have equal length.
            log: If truthy, the x-values are replaced by their base-2
                logarithm.
        """
        if X is None:
            # Nothing to smooth: forward the arguments untouched.
            super().__init__(X, Y)
            return

        index_only = Y is None
        xs = np.array(range(len(X))) if index_only else np.array(X)
        ys = np.array(X) if index_only else np.array(Y)

        # Drop the leading x-values that have no smoothed counterpart.
        xs = xs[lag:]
        ys = smoother(ys, lag)
        if log:
            xs = np.log(xs) / math.log(2)
        super().__init__(xs, ys)
# Plot mean_loss against compute_time, smoothed over a 10-sample window, without the log2 x-axis transform.
SmoothPlot(X, Y, lag=10, log=False)



In [None]:
# Seconds in one day: 24 hours * 3600 seconds per hour.
24*3600

## Benchmarking

In [None]:
import torch
import torch.utils.benchmark as benchmark
from torch.nn import Module, Embedding, Linear, CrossEntropyLoss, Softmax

def speed_test(B=8192, m=4096, n=1024, device='cuda'):
    """Time the forward pass of ``Linear(n, m)`` on a random ``(B, n)`` batch.

    Args:
        B: Number of rows in the input batch.
        m: Output features of the linear layer.
        n: Input features of the linear layer.
        device: Device to run on. Defaults to ``'cuda'``, which is where
            this benchmark always ran before.

    Returns:
        The median time of one ``L(x)`` call in seconds, as measured by
        ``torch.utils.benchmark``. The timing and throughput are also printed.
    """
    L = Linear(n, m).to(device)
    x = torch.randn(B, n, device=device)
    t = benchmark.Timer(
        stmt='L(x)',
        globals={'L': L, 'x': x})
    T = t.timeit(10)
    # A (B, n) x (n, m) product performs B*m*n multiply-adds, and each
    # multiply-add is two floating-point operations. Counting B*m*n alone
    # under-reports FLOPS by a factor of two. The bias add is ignored.
    flops = 2 * B * m * n
    print(f'Benchmark:  {T.median* 1e6:>5.1f} us, {flops / (1E9*T.median)} GFLOPS')
    return T.median


In [None]:
# Run the benchmark with its default sizes; it prints the timing and returns the median time.
speed_test()

In [None]:
import torch
import torch.utils.benchmark as benchmark
from torch.nn import Module, Embedding, Linear, CrossEntropyLoss, Softmax

def speed_test(B=8192, m=4096, n=1024, device='cuda', dtype=torch.float16):
    """Time the forward pass of a reduced-precision ``Linear(n, m)`` layer.

    Note: this redefines ``speed_test`` and shadows the float32 version
    declared in the earlier benchmarking cell.

    Args:
        B: Number of rows in the input batch.
        m: Output features of the linear layer.
        n: Input features of the linear layer.
        device: Device to run on. Defaults to ``'cuda'``, which is where
            this benchmark always ran before.
        dtype: Floating-point type of the layer and the input. Defaults to
            ``torch.float16``, the precision this cell always used.

    Returns:
        The median time of one ``L(x)`` call in seconds, as measured by
        ``torch.utils.benchmark``. The timing and throughput are also printed.
    """
    L = Linear(n, m).to(device=device, dtype=dtype)
    x = torch.randn(B, n, device=device, dtype=dtype)
    t = benchmark.Timer(
        stmt='L(x)',
        globals={'L': L, 'x': x})
    T = t.timeit(20)
    # A (B, n) x (n, m) product performs B*m*n multiply-adds, and each
    # multiply-add is two floating-point operations. Counting B*m*n alone
    # under-reports FLOPS by a factor of two. The bias add is ignored.
    flops = 2 * B * m * n
    print(f'Benchmark:  {T.median* 1e6:>5.1f} us, {flops / (1E9*T.median)} GFLOPS')
    return T.median

# Run the half-precision benchmark defined above with its default sizes.
speed_test()
