In [None]:
import os
import sys

# Location of the local `scholar` checkout that the imports in the next cell
# rely on. The default is the original hard-coded path; set the SCHOLAR_PATH
# environment variable to use a checkout somewhere else.
scholar_path = os.environ.get("SCHOLAR_PATH", "/home/sharker/github/scholar")

# Guard the append so that re-running this cell does not add the same entry
# to sys.path again and again.
if scholar_path not in sys.path:
    sys.path.append(scholar_path)

In [None]:
import asyncio
import math
from math import log, sin, cos, tan, exp, sqrt, pi
import time
from random import randrange
import torch
import numpy as np
from scholar.trainer import Trainer
from scholar.dataset import GutenbergGPT2Dataset
from scholar.model import TransformerLM
from scholar.optimizer import AdamW
from scholar.autocomplete import autocomplete
from scholar import numel

In [None]:
# Checkpoint filename: read by the torch.load cell and written by the final
# torch.save cell of this notebook.
if True:
    path = '2021-12-09-2307.pt'

In [None]:
# Restore a previously saved model object from `path` and move it to the GPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
# NOTE(review): the TransformerLM cell further down is also toggled on
# (`if True:`) and reassigns `model`, so on a top-to-bottom run the object
# loaded here is replaced by a freshly constructed one — confirm which of the
# two toggles is meant to be active.
if True:
    model = torch.load(path).to('cuda')

In [None]:
# Disabled alternative model: MLPLM with 256-entry input and output vocabularies.
# NOTE(review): MLPLM is not among the names imported in this notebook's import
# cell; import it before switching this toggle to True.
if False:
    model = MLPLM(
        n_vocab_in=256,
        n_ctx=1024,
        d_model=8,
        d_hidden=[8196],
        nonlinearity="GELU",
        n_vocab_out=256,
    ).to('cuda')

In [None]:
# Disabled alternative model: MyLM over a 50257-entry vocabulary, one layer,
# dropout switched off.
# NOTE(review): MyLM is not among the names imported in this notebook's import
# cell; import it before switching this toggle to True.
if False:
    model = MyLM(
        n_vocab_in=50257,
        n_ctx=65,
        d_model=64,
        n_layers=1,
        d_hidden=256,
        nonlinearity="GELU",
        p_dropout=0.00,
        n_vocab_out=50257,
    ).to('cuda')

In [None]:
# Disabled alternative model: ABPCNLM with 256-entry vocabularies and a
# 4096-position context.
# NOTE(review): ABPCNLM is not among the names imported in this notebook's
# import cell; import it before switching this toggle to True.
if False:
    model = ABPCNLM(
        n_vocab_in=256,
        n_ctx=4096,
        d_model=2,
        n_layers=1,
        d_hidden=2048,
        nonlinearity="GELU",
        p_dropout=0.0,
        n_vocab_out=256,
    ).to('cuda')
    #batch_size = 1

In [None]:
# Active model: a 3-layer TransformerLM over a 50257-entry vocabulary with a
# 1024-position context. n_heads * d_k = 32 * 32 equals d_model (1024).
# NOTE(review): the checkpoint-loading cell above is toggled on as well, so
# running the notebook top to bottom replaces the loaded model with this
# newly constructed one — confirm that is intended.
if True:
    model = TransformerLM(
        n_vocab_in=50257,
        n_vocab_out=50257,
        n_ctx=1024,
        d_model=1024,
        d_k=32,
        d_v=32,
        n_heads=32,
        d_hidden=4096,
        n_layers=3,
    ).to('cuda')

In [None]:
# Show numel(model) as the cell output
# (presumably the model's parameter count — confirm in scholar.numel).
numel(model)

In [None]:
# Build the optimizer over the model's named parameters, and the dataset.
# `dataset` is also used later for its encode/decode functions.
optimizer = AdamW(parameters=model.named_parameters())
dataset = GutenbergGPT2Dataset()
# Left unset here; the hyper-parameter cell assigns trainer.batch_size afterwards.
batch_size = None
# One more than the model's context length
# (presumably inputs plus the shifted next-token target — confirm in Trainer).
example_length = model.n_ctx + 1

# Bundle model, optimizer and data; the following cells drive training
# through this `trainer` object.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    dataset=dataset,
    batch_size=batch_size,
    example_length=example_length)

In [None]:
# Settings for this training run, applied to the existing `trainer`.
trainer.batch_size = 1
trainer.example_length = 1025

# Schedule constants. They are defined once here rather than re-assigned on
# every pass through the parameter loop, since none of them depends on the
# parameter being configured.
batch_multiplier = 100
lr_base = 1e-5
warm_up = 0


def lr(n):
    """Return the learning rate for step ``n``.

    The value is 0 while ``n < warm_up``. Afterwards it is
    ``lr_base * (1 + n % 1000) / 1000``: a ramp from ``lr_base / 1000`` up to
    ``lr_base`` that restarts every 1000 steps.
    """
    # Previously tried extra factor: * (1.0 + 9.0*cos(n*3.14159/10000)**2)
    return 0 if n < warm_up else lr_base * (1 + (n % 1000)) / 1000


# Every schedule is a one-argument callable stored in the optimizer state
# (presumably called by scholar's AdamW with the step count — confirm there).
# The callables look up `lr`, `warm_up` and `batch_multiplier` by name when
# they are called, so re-assigning those names in a later cell changes the
# behavior of an optimizer that is already running.
for pn, _param in trainer.model.named_parameters():
    trainer.optimizer.state[pn]["lr"]           = lambda n: lr(n)
    trainer.optimizer.state[pn]["beta1"]        = lambda n: 0.9
    trainer.optimizer.state[pn]["beta2"]        = lambda n: 0.999
    trainer.optimizer.state[pn]["weight_decay"] = lambda n: 0.001
    # True during warm-up and on every `batch_multiplier`-th step.
    trainer.optimizer.state[pn]["update"]       = lambda n: (n < warm_up) or (n % batch_multiplier == 0)

In [None]:
class Shaping:
    """Callable that blends the loss at the last index with the loss elsewhere.

    The result is ``(1 - alpha) * mean(losses[..., :-1]) + alpha * mean(losses[..., -1])``,
    so ``alpha`` is the weight given to the final entry along the last axis
    (presumably the last position of each example — confirm against Trainer.step).

    Args:
        alpha: Weight of the last-index term. Defaults to 0.5, the value that
            was previously hard-coded.
    """

    def __init__(self, alpha=0.5):
        self.alpha = alpha

    def __call__(self, batch, losses):
        """Reduce ``losses`` to a single scalar tensor.

        Args:
            batch: Accepted for the caller's benefit; not used here.
            losses: Tensor indexed along its last axis. If that axis has
                length 1 the first mean is taken over an empty slice and the
                result is nan.

        Returns:
            The weighted combination described in the class docstring.
        """
        x = torch.mean(losses[..., :-1])   # everything except the last index
        y = torch.mean(losses[..., -1])    # the last index only
        return (1 - self.alpha) * x + self.alpha * y


shaping = Shaping()

async def train(trainer):
    """Run training steps indefinitely, recording each result on the trainer.

    ``trainer.losses`` is reset to an empty list. Then, in an endless loop,
    ``trainer.step`` is called with the module-level ``shaping`` object, its
    return value is appended to ``trainer.losses``, and the coroutine sleeps
    briefly. The loop has no exit condition of its own; it ends only when an
    exception (including task cancellation) is raised inside it.
    """
    trainer.losses = []
    while True:
        step_loss = trainer.step(shaping=shaping)
        trainer.losses.append(step_loss)
        # Hand control back to the event loop between steps.
        await asyncio.sleep(1e-4)

In [None]:
# Schedule train(trainer) as a background task. asyncio.create_task needs an
# event loop that is already running (the notebook kernel provides one).
training_task = asyncio.create_task(train(trainer))
# Wall-clock reference point for the timing figures computed in later cells.
t_start = time.time()

In [None]:
# Display the task object; its repr shows whether the task is still pending
# or has finished, and any exception it ended with.
training_task

In [None]:
# `time` is already imported in the import cell; this repeat is harmless.
import time

# Baseline for the progress report: the trainer's step counter and the
# seconds elapsed since t_start at the moment this cell is run.
n0 = trainer.n
t0 = time.time() - t_start

In [None]:
# Progress report: compares the current state of training with the baseline
# (n0, t0) captured by the previous cell and prints derived figures.

# Current snapshot and its difference from the baseline.
n = trainer.n
t = time.time() - t_start
dn = n - n0
dt = t - t0

# Averaging window length: (dn // 20) * 10 entries, but never fewer than 10.
N = max(10, dn//20*10)
print(f"N = {N}")
# Offset added to n0 for the start of the baseline window.
K = 0
# L  = mean of trainer.losses over the slice [n-N, n)       (most recent window)
# L0 = mean of trainer.losses over the slice [n0+K, n0+K+N) (baseline window)
# NOTE(review): trainer.n is used directly as an index into trainer.losses,
# which assumes one list entry per step counted from n == 0 — confirm. If a
# slice is empty np.mean returns nan and the int(...) calls below raise.
L = np.mean(np.array(trainer.losses[n-N:n]))
L0 = np.mean(np.array(trainer.losses[n0+K:n0+K+N]))
dL = (L - L0)
# Conversion factor applied to the mean loss to obtain `bpc`.
# NOTE(review): 9115131782 and 14818489608 are unexplained literals; the
# message labels below suggest 14818489608 is a total byte count and 50257 /
# 256 match the vocabulary sizes used elsewhere in this notebook — confirm
# and consider naming them.
if False:
    lyles_constant = 8 # utf8 version
else:
    lyles_constant = (9115131782/2)/14818489608 * log(50257)/log(256)*8

bpc = lyles_constant*L
# Derived from the loss change per second (-dL/dt); reported below as the
# "learning rate" in KiB/s and MiB/h.
rate = 2 * lyles_constant * -dL/dt * 14818489608 / 8
message = '\n'.join([
    f"bpc                  = {int(bpc*1e6)/1e6}",
    f"batch_size           = {trainer.batch_size}",
    f"example_length       = {trainer.example_length}",
    f"n                    = {n} steps",
    f"t                    = {int(t)} seconds",
    f"n0                   = {n0} steps",
    f"dn                   = {int(dn)} steps",
    f"dt                   = {int(dt)} seconds",
    f"dn/dt                = {(dn/dt)} steps per second",
    f"L                    = {int(L*1e6)/1e6}",
    f"L0                   = {int(L0*1e6)/1e6}",
    f"new bytes            = {int(dt*rate/2**20/2)}MiB",
    f"bytes left = {int(bpc/8*14818489608/2**20)}MiB",
    f"progress {int((8-bpc)/8 *14818489608//1E6)/1000}E9/14.818E9",
    f"learning rate: {int(rate/1024)} KiBps, {int(rate*3600/2**20)} MiBph",
    f"feeding rate: {int(trainer.batch_size*trainer.example_length*dn/dt*2)} Bps"
])
print(message)

In [None]:
# Prompt text passed to autocomplete() in the following cell.
prompt = """When first the opposition of fact and ideal grows fully visible, a spirit
of fiery revolt, of fierce hatred of the gods, seems necessary to the
assertion of freedom. To defy with Promethean constancy a hostile universe,
to keep its evil always in view, always actively hated, to refuse no pain
that the malice of Power can invent, appears to be the duty of all who will
not bow before the inevitable. But indignation is still a bondage, for it
compels our thoughts to be occupied with an evil world; and in the fierceness
of desire from which rebellion springs there is a kind of self-assertion
which it is necessary for the wise to overcome. Indignation is a submission
of our thoughts, but not of our desires; the Stoic freedom in which wisdom
consists is found in the submission of our desires, but not of our thoughts.
From the submission of our desires springs the virtue of resignation; from
the freedom of our thoughts springs the whole world of art and philosophy,
and the vision of beauty by which, at last, we half reconquer the reluctant
world."""

In [None]:
%%time
# Generate text from `prompt` with the current model and print the result,
# using the dataset's encode/decode functions.
# NOTE(review): n_ctx=128 here differs from the n_ctx=1024 passed to
# TransformerLM when the model is constructed — confirm this is intended.
response = autocomplete(
    prompt=prompt,
    model=model,
    encode=dataset.encode,
    decode=dataset.decode,
    n_ctx=128,
    temp=1.0,
    device="cuda")
print(response)

In [None]:
# Save the whole model object (not just a state dict) to `path`. This is the
# same file the checkpoint-loading cell reads, so an existing checkpoint with
# that name is overwritten.
torch.save(trainer.model, f=path)