In [None]:
import asyncio
import array, struct
from math import log, sin, cos, tan, exp, sqrt, pi
from time import time, sleep
from random import randrange
import torch
import numpy as np
from testbed import UTF8Dataset, MLPLM, TransformerLM, AdamW, Learner, MagicList, StatsTicker

## Scheduling helpers

In [None]:
def constant(c):
    """Build a schedule that ignores its step argument and always yields ``c``."""
    def schedule(step):
        return c
    return schedule
def linear_warmup_then_decay(lr, warmup):
    """Build a step -> value schedule with a linear ramp followed by 1/n decay.

    For ``n < warmup`` the schedule yields ``lr * (n / warmup)``; from
    ``n == warmup`` onwards it yields ``lr * (warmup / n)``, so the peak value
    ``lr`` is reached exactly at ``n == warmup``.
    """
    def schedule(n):
        if n < warmup:
            return lr*(n/warmup)
        return lr*(warmup/n)
    return schedule

## MLP Language Model

In [None]:
# Build the MLP language model and move it to the CUDA device.
# NOTE(review): the device string 'cuda' is hard-coded; presumably this cell
# fails on a machine without a GPU — confirm before sharing the notebook.
model = MLPLM(n_vocab_in=256, n_vocab_out=256, n_ctx=64,
              d_model=64, d_hidden=8192, nonlinearity="GELU").to('cuda')

# Every hyperparameter is handed over as a function of the step number:
# `constant(...)` for fixed values, and for `lr` a linear ramp up to 5e-5 at
# step 256 followed by a warmup/n decay.
optimizer = AdamW(parameters=model.parameters(), eps=constant(1e-4), 
                  lr=linear_warmup_then_decay(lr=5e-5,warmup=256), 
                  beta1=constant(0.9), beta2=constant(0.999),
                  weight_decay=constant(0.01), initial_step=0)

dataset = UTF8Dataset()

# Bundle the three components; this dict is what gets passed to Learner(config=...).
config = {"model": model, "optimizer": optimizer, "dataset": dataset}

In [None]:
# Wrap the MLP model/optimizer/dataset bundle in a Learner; the training
# coroutine defined below drives it through `learner.step(...)`.
learner = Learner(config=config)

In [None]:
# Container for the loss history: the training coroutine appends one value per step.
metrics = MagicList()

In [None]:
async def train(n_steps, batch_size, example_length, metrics):
    """Run up to ``n_steps`` training steps of the global ``learner`` as a cooperative task.

    Args:
        n_steps: Number of steps to attempt.
        batch_size: Forwarded to ``learner.step`` and used to normalise the
            summed loss of each step.
        example_length: Forwarded to ``learner.step`` (presumably the number
            of symbols per example — confirm against ``Learner.step``).
        metrics: Object with an ``append`` method; receives one normalised
            loss value per completed step.

    Returns:
        ``None`` when every step completed, otherwise the ``Exception`` that
        stopped the loop. The exception is returned instead of raised, so it
        ends up as the result of the task that runs this coroutine.
    """
    try:
        # Bind the learner once. Later cells rebind the global name `learner`;
        # looking it up on every iteration would make an already running task
        # silently switch to a different model mid-run.
        active_learner = learner
        for _ in range(n_steps):
            # Yield control to the event loop between steps so other tasks can run.
            await asyncio.sleep(.01)
            step_losses = active_learner.step(batch_size, example_length)
            metrics.append(np.sum(step_losses)/batch_size)
    except Exception as e:
        return e
    return None

In [None]:
# Start training in the background: 2**20 steps, batch size 256, example length 65.
# Keep a named reference to the task: the event loop only holds weak references
# to tasks, and the handle is also needed to cancel the run
# (`training_task.cancel()`) or to read what `train` returned
# (`training_task.result()` is None or the exception that stopped it).
training_task = asyncio.create_task(train(2**20, 256, 65, metrics))
training_task

In [None]:
#StatsTicker(metrics)

## Transformer Language Model

In [None]:
# Build the transformer language model (8 layers, 16 heads with d_k = d_v = 16,
# d_model = 256, all four dropout probabilities set to 0.0) and move it to CUDA.
# This rebinds `model`, replacing the MLP created earlier in the notebook.
# NOTE(review): the device string 'cuda' is hard-coded — confirm a GPU is available.
model = TransformerLM(n_vocab_in=256, n_vocab_out=256, max_ctx=128, d_model=256,
                      d_k=16, d_v=16, n_heads=16, d_hidden=256, n_layers=8, p_dropout_embedding=0.0,
                      p_dropout_attn_mat=0.0, p_dropout_attn_out=0.0, p_dropout_mlp=0.0).to('cuda')

# Same optimizer recipe as for the MLP, but the learning rate ramps linearly to
# 1e-4 over the first 10000 steps before the warmup/n decay sets in.
optimizer = AdamW(parameters=model.parameters(), eps=constant(1e-4),
                  lr=linear_warmup_then_decay(lr=1e-4,warmup=10000), 
                  beta1=constant(0.9), beta2=constant(0.999), weight_decay=constant(0.01),
                  initial_step=0)

dataset = UTF8Dataset()

# Rebinds `config` as well; the next Learner(config=config) call picks up the transformer.
config = {"model": model, "optimizer": optimizer, "dataset": dataset}

In [None]:
# Rebinds the global `learner` to a Learner built from the transformer config;
# the MLP learner created earlier is no longer reachable under this name.
learner = Learner(config=config)

In [None]:
# Run a single step on the transformer learner. The two positional arguments
# are the ones `train` passes as batch_size and example_length (here 256 and 33).
# The return value is displayed as the cell output.
learner.step(256,33)

## Plotting

In [None]:
# Build a StatsTicker over `trainer` with 'step' on x and 'mean_loss' on y;
# ending the cell with `ticker` displays it.
# NOTE(review): `trainer` is never assigned in the visible notebook (only
# `learner` is) — presumably it came from a cell that was removed; confirm
# before relying on Restart & Run All.
ticker = StatsTicker(trainer,  x='step', y='mean_loss')
ticker

In [None]:
# Ask `trainer` to update its "optimizer" component with lr=1e-3.
# NOTE(review): presumably this replaces the learning rate of a running
# training session — confirm against the implementation of `trainer.update`.
trainer.update("optimizer", lr=1e-03)

In [None]:
# Ask `trainer` to update its "dataset" component with batch_size=8192.
# NOTE(review): presumably takes effect for subsequent steps — confirm against
# the implementation of `trainer.update`.
trainer.update("dataset", batch_size=8192)

In [None]:
# Accumulator for generated text: `foo` appends each new chunk to it, and other
# cells use its first 128 characters as a prompt or write it to gibberish.txt.
result = ""


In [None]:
# Generate a continuation of the first 128 characters of `result`
# (n_generate=256, max_ctx=128), join the yielded pieces and show the text.
more = ''.join(
    trainer.autocomplete(result[:128], n_generate=256, max_ctx=128)
)
print(more)

In [None]:
# Overwrite gibberish.txt with everything accumulated in `result` so far.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the generated
# text are written the same way on every platform instead of depending on the
# locale's default encoding (which can raise UnicodeEncodeError).
with open('gibberish.txt', 'w', encoding='utf-8') as outfile:
    outfile.write(result)

In [None]:
async def foo():
    """Generate one more chunk of text and append it to ``result`` and to gibberish.txt.

    Calls ``trainer.autocomplete(n_generate=256, max_ctx=128)``, joins the
    pieces it yields, extends the global ``result`` string with them and
    appends the same text to ``gibberish.txt``.

    Returns:
        The updated global ``result`` string.

    NOTE(review): nothing in this coroutine awaits, so once it starts it runs
    to completion without yielding to the event loop.
    """
    global result
    # str.join consumes any iterable directly; no intermediate list is needed.
    more = ''.join(trainer.autocomplete(n_generate=256, max_ctx=128))
    result += more
    # Explicit UTF-8 keeps the file independent of the platform's default
    # encoding; generated text may contain non-ASCII characters.
    with open('gibberish.txt', 'a', encoding='utf-8') as outfile:
        outfile.write(more)
    return result

In [None]:
for _ in range(2400):
    sleep(15)
    t = asyncio.create_task(foo())
    await t

In [None]:
# Show the most recent entry of `trainer.metrics` (the plotting classes below
# read 'step' and 'mean_loss' from these entries).
trainer.metrics[-1]

In [None]:
# Save `trainer` to checkpoint.pt in the current working directory.
# NOTE(review): what exactly is persisted (weights, optimizer state, metrics)
# is not visible here — confirm against `trainer.save`.
trainer.save("checkpoint.pt")

In [None]:
# Load checkpoint.pt (the same path the save cell writes) back into `trainer`.
# NOTE(review): presumably overwrites the current in-memory state — confirm
# against `trainer.load` before running this on an unsaved session.
trainer.load('checkpoint.pt')

### SmoothPlot

In [None]:
import scipy.ndimage

def smoother(X, Y, lag):
    """Trailing moving average of ``Y`` over windows of ``lag`` samples.

    Window sums are obtained as differences of the cumulative sum, so output
    element ``i`` is the mean of ``Y[i+1 : i+lag+1]`` and is paired with
    ``X[i+lag]``.

    Returns:
        A tuple ``(X[lag:], averages)`` with ``len(Y) - lag`` averages.
    """
    running_total = np.cumsum(Y)
    window_sums = running_total[lag:] - running_total[:-lag]
    return X[lag:], window_sums/lag

def gsmoother(X, Y, lag):
    """Gaussian-smooth ``Y`` with ``sigma=lag`` and trim ``lag`` samples from both ends.

    ``X`` is trimmed with the same slice so the two outputs stay aligned.

    Returns:
        A tuple ``(X[lag:-lag], smoothed_Y[lag:-lag])``.
    """
    interior = slice(lag, -lag)
    trimmed_x = X[interior]
    blurred = scipy.ndimage.gaussian_filter1d(Y, sigma=lag)
    return (trimmed_x, blurred[interior])

class SmoothPlot(LinePlot):
    """Line plot of the Gaussian-smoothed mean loss against the training step.

    NOTE(review): `LinePlot` is neither defined nor imported in the visible
    notebook — presumably it is provided by `testbed`; confirm.
    """

    def __init__(self, trainer, lag=100, log=None):
        """Smooth, thin and plot the loss history of ``trainer``.

        Args:
            trainer: Object whose ``metrics`` is an iterable of mappings with
                the keys ``'step'`` and ``'mean_loss'``.
            lag: Sigma of the Gaussian filter; the same number of samples is
                trimmed from each end of the curve by ``gsmoother``.
            log: When truthy, the x axis is ``log2(step)`` instead of ``step``.
        """
        history = np.array([[m['step'], m['mean_loss']] for m in trainer.metrics])
        steps = history[:, 0]
        losses = history[:, 1]
        # Stride chosen from the unsmoothed length so that at most about
        # 1000 points are handed to the plot.
        stride = len(steps)//1000 + 1
        steps, losses = gsmoother(steps, losses, lag)
        steps = steps[::stride]
        losses = losses[::stride]
        if log:
            # np.log2 replaces np.log(X)/math.log(2): the notebook imports
            # individual names from `math` but never the module itself, so the
            # old expression raised NameError as soon as `log` was truthy.
            steps = np.log2(steps)
        super().__init__(steps, losses)

class GaussianSmoothedLossRate(LinePlot):
    """Line plot of the Gaussian-smoothed per-record decrease of the mean loss.

    NOTE(review): `LinePlot` is neither defined nor imported in the visible
    notebook — presumably it is provided by `testbed`; confirm.
    """

    def __init__(self, trainer, lag=100, log=None):
        """Difference, smooth and plot the loss history of ``trainer``.

        Args:
            trainer: Object whose ``metrics`` is an iterable of mappings with
                the keys ``'step'`` and ``'mean_loss'``.
            lag: Sigma of the Gaussian filter; the same number of samples is
                trimmed from each end of the curve by ``gsmoother``.
            log: When truthy, the x axis is ``log2(step)`` instead of ``step``.
        """
        history = np.array([[m['step'], m['mean_loss']] for m in trainer.metrics])
        steps = history[1:, 0]
        # previous loss minus current loss: positive values mean the loss
        # went down between two consecutive metric records.
        loss_drop = history[:-1, 1] - history[1:, 1]
        steps, loss_drop = gsmoother(steps, loss_drop, lag)
        if log:
            # np.log2 replaces np.log(X)/math.log(2): the notebook imports
            # individual names from `math` but never the module itself, so the
            # old expression raised NameError as soon as `log` was truthy.
            steps = np.log2(steps)
        super().__init__(steps, loss_drop)

In [None]:
# Display the smoothed loss curve with a filter width of 10 and a linear step
# axis. `gsmoother` trims `lag` samples from each end, so more than 20 metric
# records are needed for a non-empty plot.
SmoothPlot(trainer, lag=10, log=False)

In [None]:
# Display the smoothed per-record loss decrease with a filter width of 10000.
# `gsmoother` trims `lag` samples from each end, so the plot is empty unless
# `trainer.metrics` holds more than about 20000 records.
GaussianSmoothedLossRate(trainer, lag=10000, log=False)