Improve testbed module:

1. Massive text datasets
2. Conserve GPU memory
3. Save/Restore optimizer state
4. Dynamic Batching
5. Advanced Tokenization (multiple tokens at each spot; for embedding, add embedding for each token)
6. Optimizer enhancements such as scheduling
7. Zeno methods for RNNs: for $\log_2 n$ times as long, use only $\log_2 n$ copies of parameters instead of $n$.
8. GAN discriminators and back-offed generation. Sleep training?
9. Loss function enhancements: predicting simpler language models
10. Zeroth order methods and other oddball optimization techniques
11. Curriculum selection

Other stuff.
1. I want to be able to plot quickly without having to google anything, please! CHECK 
2. I need TextDataset to be able to do the batching and bypass the DataLoader feature. Mainly to be able to tune batch size beyond memory constraints in a smooth manner. Also because I dream of dynamic batching.
3. I need to figure out the cause of these artifacts I'm seeing. I suspect huge changes of batch size are unhealthy. I do not know how to predict the effect.

## 1. Massive text datasets

Ideas?

* `torch.Tensor.data_ptr` gives location in memory, maybe we can allocate a torch tensor and just plop it in there
* `torch.as_tensor` will convert from numpy to torch without copying

In [32]:
import torch
import numpy as np
from testbed import decode_broken_utf8, TextDataset
from pathlib import Path
from random import randrange
import time


In [33]:
path = '/home/sharker/data/corpus.txt'
s = Path(path).stat()
N = s.st_size
print(N)

14738357317


In [34]:
def benchmark_loading(L=4096, B=1024):
    x = bytes([])
    start = time.time()
    with open(path, 'rb') as infile:
        for i in range(0,B):
            idx = randrange(N - L)
            infile.seek(idx)
            x += infile.read(L)
    stop = time.time()
    print(stop - start, (L*B)/(stop-start), len(x))

In [35]:
def benchmark_loading2(L=4096, B = 1024):
    start = time.time()
    data = [torch.as_tensor(np.fromfile(path, dtype=np.ubyte, count=L,
            offset=randrange(N - L + 1)))  for i in range(B)]
    result = torch.cat(data)
    stop = time.time()
    #print(stop - start, (L*B)/(stop-start), result.shape)
    return result

In [36]:
benchmark_loading(8192, 1024)

0.32071495056152344 26155961.813793883 8388608


In [37]:
list(range(0,13,2))

[0, 2, 4, 6, 8, 10, 12]

In [38]:
dataset = TextDataset(N=2048)

In [39]:
dataset.N, len(dataset)

(2048, 7196463)

In [40]:
%%timeit
torch.cat([dataset[randrange(len(dataset))] for idx in range(8192)]).cuda()

181 ms ± 4.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
dataset.cache_data()

In [42]:
%%time
X = torch.cat([dataset[randrange(len(dataset))] for idx in range(8192)]).cuda()

CPU times: user 23.9 ms, sys: 0 ns, total: 23.9 ms
Wall time: 23 ms


In [None]:
DATA = dataset.data

In [None]:
DATA.share_memory_()

In [14]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from testbed import Net0, TextDataset

In [9]:
model = Net0(L=64)
dataset = TextDataset(N=65)
dataset.cache_data()
optimizer = AdamW(model.parameters())

In [19]:
for (i, batch) in enumerate(DataLoader(dataset, batch_size = 16)):
    loss = model(batch.to(torch.int64))
    loss.backward()
    optimizer.step()
    print(optimizer.state_dict())
    if i == 10:
        break

{'state': {0: {'step': 1, 'exp_avg': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'exp_avg_sq': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])}, 1: {'step': 1, 'exp_avg': tensor([[ 1.6745e-04,  1.3037e-04,  4.8145e-05,  ...,  2.3665e-04,
          6.4546e-05,  7.3498e-05],
        [ 3.1528e-04,  1.3821e-05,  1.4652e-04,  ...,  1.8896e-04,
          9.8135e-07,  8.5560e-05],
        [ 5.0684e-05,  6.5906e-05,  2.4662e-04,  ...,  2.1855e-04,
          4.0979e-04,  3.0247e-05],
        ...,
        [-6.9450e-05,  1.6152e-05,  1.7153e-04,  ...,  8.4404e-05,
          1.2713e-04, -

In [20]:
optimizer.state_dict()

{'state': {0: {'step': 11,
   'exp_avg': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]]),
   'exp_avg_sq': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           ...,
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.],
           [0., 0., 0.,  ..., 0., 0., 0.]])},
  1: {'step': 11,
   'exp_avg': tensor([[ 2.5336e-03, -3.4733e-03,  1.5329e-03,  ...,  1.0404e-03,
            -2.7317e-03, -1.4770e-03],
           [ 4.4034e-03, -8.5484e-04, -2.7823e-03,  ...,  7.5514e-05,
            -3.0193e-03, -4.2480e-04],
           [ 2.2727e-03, -2.3488e-03, -1.2146e-03,  ...,  2.4928e-03,
            -1.6664e-03, -6.0394e-03],
           ...,
           [-3.5695e-03

In [None]:
help(X.share_memory_)

In [43]:
from torch.utils.data import DataLoader

In [45]:
%%time
start = time.time()
for (i, x) in enumerate(DataLoader(dataset, batch_size=1024, shuffle=True)):
    if i == 0:
        true_start = time.time()
    print(x.device)
    #print(time.time() - true_start, (time.time() - true_start)/ (i+1), time.time() - start, i, x.shape)
    if i == 100:
        break

cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
cpu
CPU times: user 1.33 s, sys: 82.3 ms, total: 1.41 s
Wall time: 787 ms


In [None]:
%%timeit
x = benchmark_loading2(2**15, 1024)

In [None]:
%%timeit
y = x.cuda()

In [None]:
%%time
L = 8192
with open(path, 'rb') as infile:
    for i in range(0,1024):
        idx = randrange(N - L)
        infile.seek(idx)
        x = infile.read(L)

In [None]:
%%time
npx = np.frombuffer(x, dtype=np.uint8)

In [None]:
npx.shape

In [None]:
%%time
X = torch.tensor(npx, dtype=torch.uint8)

In [None]:
X.shape

## 2. Memory Conservation

In [22]:
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

0
0


## 7. Zeno Methods

What is the analysis?


In [None]:
from functools import lru_cache
from math import sqrt, ceil

@lru_cache(maxsize=None)
def C(n, k):
    if n == 0:
        return 0
    if n == 1:
        return 2
    w = [ round(i*(n/k)) for i in range(0, k+1) ]
    cost = 0
    for i in range(1, len(w)):
        if i < len(w) - 1:
            cost += w[i] - w[i-1]
        cost += D(w[i] - w[i-1], k)
    return cost

In [None]:
from testbed import Plot

In [None]:
Plot([C(n,2) for n in range(50000)])

Something silly:

In [None]:
from math import cos, sin
X = [ (cos(i/1000)*i*(1-sin(i/100)/2), sin(i/1000)*i*(1+cos(i/100)/2)) for i in range(100000) ]
Plot(X)