## import libraries

In [None]:
import asyncio
import math
from math import log, sin, cos, tan, exp, sqrt, pi
import time
from random import randrange
import torch
import numpy as np
from classroom import Student
from classroom import GutenbergGPT2Dataset, GutenbergBytesDataset
from classroom import MLPLM, MyLM, ABPCNLM
from classroom import TransformerLM
from classroom import AdamW
from classroom import numel
from classroom import utf8decode, utf8encode, gpt2decode, gpt2encode
from classroom import utf8bitsdecode, utf8bitsencode
from pathlib import Path
import numba

## initialize model

In [None]:
if True:
    # Checkpoint file name; the torch.load(path) and
    # torch.save(student.model, f=path) cells of this notebook both use it.
    path = '2021-12-09-1100.pt'

In [None]:
if True:
    # Resume from the saved checkpoint and move the model to the GPU.
    # torch.load unpickles the stored object, so only load files you trust.
    # NOTE(review): the MLPLM cell that follows is also guarded by `if True`,
    # so running every cell in order replaces this loaded model with a freshly
    # initialised one — confirm which of the two is meant to be active.
    model = torch.load(path).to('cuda')

In [None]:
if True:
    # Fresh MLP language model with a 256-way vocabulary on both sides and a
    # 4096-symbol context, created directly on the GPU.
    model = MLPLM(
        n_vocab_in=256,
        n_vocab_out=256,
        n_ctx=4096,
        d_model=2,
        d_hidden=[8192, 4096, 4096],
        nonlinearity="GELU",
    ).to('cuda')

In [None]:
if False:
    # Disabled alternative: MyLM with a 50257-way vocabulary (presumably the
    # GPT-2 token set) and a 65-token context. Flip the guard to use it.
    model = MyLM(
        n_vocab_in=50257,
        n_vocab_out=50257,
        n_ctx=65,
        d_model=64,
        n_layers=1,
        d_hidden=256,
        nonlinearity="GELU",
        p_dropout=0.00,
    ).to('cuda')

In [None]:
if False:
    # Disabled alternative: ABPCNLM with a 256-way vocabulary and a
    # 4096-symbol context. Flip the guard to use it.
    model = ABPCNLM(
        n_vocab_in=256,
        n_vocab_out=256,
        n_ctx=4096,
        d_model=2,
        n_layers=1,
        d_hidden=2048,
        nonlinearity="GELU",
        p_dropout=0.0,
    ).to('cuda')
    # batch_size = 1   (override left disabled by the author)

In [None]:
if False:
    # Disabled alternative: 3-layer TransformerLM with a 50257-way vocabulary
    # and a 1024-token context; every dropout probability is zero.
    model = TransformerLM(
        n_vocab_in=50257,
        n_vocab_out=50257,
        n_ctx=1024,
        n_layers=3,
        n_heads=16,
        d_model=256,
        d_k=16,
        d_v=16,
        d_hidden=512,
        p_dropout_embedding=0,
        p_dropout_attn_mat=0,
        p_dropout_attn_out=0,
        p_dropout_mlp=0,
    ).to('cuda')

In [None]:
# Display what classroom's numel helper reports for the active model
# (presumably its parameter count — TODO confirm).
numel(model)

## initialize student

In [None]:
# classroom's AdamW is built from the model's *named* parameters; the
# hyperparameter-schedule cell indexes optimizer.state by those names.
optimizer = AdamW(parameters=model.named_parameters())
dataset = GutenbergBytesDataset()
# Placeholder only: student.batch_size is assigned in the schedule cell.
batch_size = None
# One symbol longer than the model context (presumably n_ctx inputs plus the
# next-symbol target — TODO confirm).
example_length = model.n_ctx + 1

# Bundle model, optimizer and data source into the object that is trained.
student= Student(
    model=model,
    optimizer=optimizer,
    dataset=dataset,
    batch_size=batch_size,
    example_length=example_length)

## schedule hyperparameters

In [None]:
student.batch_size = 1024
student.example_length = 4097

# Schedule constants. They are identical for every parameter, so they are
# defined once here instead of being re-created on each loop iteration.
batch_multiplier = 10
lr_base = 1e-6
warm_up = 0


def lr(n):
    """Return the learning rate for step ``n``.

    Zero during the first ``warm_up`` steps. Afterwards a sawtooth that ramps
    from ``lr_base/100`` to ``lr_base`` over each 100-step window, multiplied
    by ``1 + 9*cos(n*3.14159/1000)**2``, a factor between 1 and 10.
    """
    if n < warm_up:
        return 0
    return lr_base *(1 + (n%100))/100 * (1.0 + 9.0*cos(n*3.14159/1000)**2)


# Every hyperparameter is stored as a function of the step number n, keyed by
# parameter name. The lambdas look up lr, warm_up and batch_multiplier when
# called, so re-running this cell with new values changes the live schedule.
for (pn, p) in student.model.named_parameters():
    student.optimizer.state[pn]["lr"]           = lambda n: lr(n)
    student.optimizer.state[pn]["beta1"]        = lambda n: 0.9
    student.optimizer.state[pn]["beta2"]        = lambda n: 0.999
    student.optimizer.state[pn]["weight_decay"] = lambda n: 0.001
    # True on every batch_multiplier-th step (and throughout warm-up);
    # presumably the optimizer only applies an update on those steps.
    student.optimizer.state[pn]["update"]       = lambda n: (n < warm_up) or (n%batch_multiplier == 0)

## test a single iteration

In [None]:
# model.language_model.crossentropyloss.crossentropyloss = torch.nn.CrossEntropyLoss(reduction='none')

In [None]:
# student.study()

## initialize baseline

In [None]:
if True:
    student.reset_baseline()
    # Step count and wall-clock origin that the stats cell measures from.
    # NOTE(review): this stores len(student.times), whereas the later reset
    # cell stores len(student.times)-1 — confirm the difference is intended.
    n_of_last_baseline = len(student.times)
    t_start = time.time()
    # Baseline times are kept as seconds since t_start, hence 0 here.
    t_of_last_baseline = 0

## start training

In [None]:
import asyncio
async def train(student):
    """Call ``student.study()`` in an endless loop, pausing briefly each time.

    The short ``asyncio.sleep`` after every study step hands control back to
    the event loop so other tasks can run in between. The coroutine never
    returns on its own; it ends only when cancelled or when ``study`` raises.
    """
    pause_seconds = 1e-4
    while True:
        student.study()
        await asyncio.sleep(pause_seconds)

In [None]:
# Schedule the training loop on the already-running event loop
# (asyncio.create_task raises RuntimeError when no loop is running) and keep
# the handle so the task can be inspected or cancelled later.
training_task = asyncio.create_task(train(student))

In [None]:
# Display the task object to check its current state.
training_task

## autocomplete

In [None]:
def autocomplete(model, prompt=None, n_generate=512,
                     n_ctx=None, temp=1.0,
                     encode=None, decode=None, output=None,
                     device="cuda"):
    """Print a prompt followed by a continuation sampled from ``model``.

    Args:
        model: object exposing ``n_ctx``, ``n_vocab_out`` and
            ``inference(tokens)``. The last ``n_vocab_out`` entries of the
            flattened ``inference`` result are used as the next-symbol
            probabilities.
        prompt: text to continue. When None, a passage of ``2*n_ctx`` symbols
            is drawn from the notebook-level ``student.dataset`` (a kludge:
            it relies on that global rather than on an argument).
        n_generate: number of symbols to generate.
        n_ctx: maximum number of symbols kept as context; defaults to
            ``model.n_ctx``. The prompt is truncated to its last ``n_ctx``
            symbols.
        temp: sampling temperature. A positive value samples from
            ``probs**(1/temp)``; zero or a negative value picks the argmax.
        encode: text -> list of symbol ids; defaults to ``utf8encode``.
        decode: list of symbol ids -> text; defaults to ``utf8decode``.
        output: optional list; every generated symbol id is appended to it as
            soon as it is produced.
        device: torch device on which the input tensor is created. Defaults to
            ``"cuda"``, the value that used to be hard-coded.

    Returns:
        None. The truncated prompt and the decoded continuation are printed.
    """
    n_ctx = model.n_ctx if n_ctx is None else n_ctx
    encode = utf8encode if encode is None else encode  # gpt2encode
    decode = utf8decode if decode is None else decode  # gpt2decode
    if prompt is None:
        prompt = decode(student.dataset.batch(1, 2*n_ctx, offset=None).tolist()[0])  # kludge

    # Keep only what fits in the context, and echo exactly that part.
    context = encode(prompt)[-n_ctx:]
    prompt = decode(context)
    print(f"=== Prompt ===\n{prompt}\n=== Autocompletion ===\n")

    window = list(context)
    generated = []
    for _ in range(n_generate):
        tokens = torch.tensor(window, dtype=torch.long, device=device).unsqueeze(0)
        probs = model.inference(tokens).view(-1)[-model.n_vocab_out:]
        if temp > 0:
            sharpened = probs**(1.0/temp)
            y = torch.distributions.Categorical(probs=sharpened).sample().item()
        else:
            y = torch.argmax(probs).item()
        # Slide the context window forward by one symbol.
        window = (window + [y])[-n_ctx:]
        if output is not None:
            output.append(y)
        generated.append(y)
    print(decode(generated))


When first the opposition of fact and ideal grows fully visible, a
spirit of fiery revolt, of fierce hatred of the gods, seems necessary
to the assertion of freedom. To defy with Promethean constancy a hostile
universe, to keep its evil always in view, always actively hated, to
refuse no pain that the malice of Power can invent, appears to be the
duty of all who will not bow before the inevitable. But indignation is
still a bondage, for it compels our thoughts to be occupied with an evil
world; and in the fierceness of desire from which rebellion springs there
is a kind of self-assertion which it is necessary for the wise to overcome.
Indignation is a submission of our thoughts, but not of our desires; the
Stoic freedom in which wisdom consists is found in the submission of our
desires, but not of our thoughts. From the submission of our desires springs
the virtue of resignation; from the freedom of our thoughts springs the whole
world of art and philosophy, and the vision of beauty by which, at last, we
half reconquer the reluctant world.

When first the opposition of fact and ideal grows fully visible, a spirit
of fiery revolt, of fierce hatred of the gods, seems necessary to the
assertion of freedom. To defy with Promethean constancy a hostile universe,
to keep its evil always in view, always actively hated, to refuse no pain
that the malice of Power can invent, appears to be the duty of all who will
not bow before the inevitable. But indignation is still a bondage, for it
compels our thoughts to be occupied with an evil world; and in the fierceness
of desire from which rebellion springs there is a kind of self-assertion
which it is necessary for the wise to overcome. Indignation is a submission
of our thoughts, but not of our desires; the Stoic freedom in which wisdom
consists is found in the submission of our desires, but not of our thoughts.
From the submission of our desires springs the virtue of resignation; from
the freedom of our thoughts springs the whole world of art and philosophy,
and the vision of beauty by which, at last, we half reconquer the reluctant
world.
...

We are of the sun and the moon and the stars.
The power manifested in the mind of God is projected

In [None]:
%%time
# Generate 512 symbols from the student's current model; no prompt is given,
# so autocomplete draws one from the dataset.
autocomplete(model=student.model, temp=1.0, n_ctx=4096, n_generate=512)

## plots

## stats

In [None]:
import time

# Progress since the last baseline reset, in steps (dn) and seconds (dt).
n = len(student.times)-1
t = time.time() - t_start
dn = n - n_of_last_baseline
dt = t - t_of_last_baseline

# Mean of the N grades before index n (N is at most 1000 and at most dn), for
# the student and for its baseline. With dn == 0 both slices are empty and
# the means come out as nan.
N = min(dn, 1000)
y = np.mean(np.array(student.grades[n-N:n]))
y0 = np.mean(np.array(student.baseline_grades[n-N:n]))
dy = (y - y0)

if False:
    # Disabled branch: rescale the grade with lyles_constant before the
    # bits-per-character conversion (presumably for GPT-2-token models).
    lyles_constant = (9115131782/2)/14818489608 * log(50257)/log(256)
    utf8grade = lambda x: 1 - (1 - x)*lyles_constant
    bpc = (1-utf8grade(y))*8
else:
    # Grade -> bits per character, as (1 - grade) * 8.
    bpc = (1-y)*8

# int(v*1e6)/1e6 truncates a value to six decimals for display.
message = '\n'.join([
    f"bpc                   = {int(bpc*1e6)/1e6}",
    f"batch_size            = {student.batch_size}",
    f"example_length        = {student.example_length}",
    f"100*y                 = {int(y*1e6)/1e4}",
    f"n                     = {n} steps",
    f"t                     = {int(t)} seconds",
    f"n_of_last_baseline    = {n_of_last_baseline} steps",
    f"t_of_last_baseline    = {int(t_of_last_baseline)} seconds",
    f"steps per second      = {dn/dt}",
    f"y0                    = {int(y0*1e6)/1e6}",
    f"dy                    = {int(dy*1e6)}e-06",
    f"dn                    = {dn}",
    f"dt                    = {dt}",
    f"dy/dn                 = {dy/dn}",
    f"dy/dt                 = {int(1e6 * dy/dt * 3600)}e-6 per hour",
    f"bpc rate              = {int(dy/dt * 8 * 3600 * 1e3)}e-3 per hour",
    f"time for 100%         = {(10.0)/(dy/dt)//36/1000} hours",
])
print(message)

In [None]:
if True:
    # Start a new comparison window: reset the student's baseline and record
    # the current step index and elapsed seconds, using the same expressions
    # the stats cell uses for n and t.
    student.reset_baseline()
    n_of_last_baseline = len(student.times)-1
    t_of_last_baseline = time.time() - t_start

## save

In [None]:
# Write the whole model object to the checkpoint path, the same file the
# torch.load(path) cell reads.
torch.save(student.model, f=path)

In [None]:
import asyncio
async def autosave(interval_seconds=3600, path='autosave.pt'):
    """Periodically write ``student.model`` to disk until cancelled.

    Args:
        interval_seconds: seconds to wait before each save. Defaults to 3600,
            the interval that used to be hard-coded.
        path: destination file. Defaults to ``'autosave.pt'``, the name that
            used to be hard-coded.

    The first save happens only after one full interval has elapsed. The
    notebook-level ``student`` is read at save time, so the model it holds at
    that moment is the one written.
    """
    while True:
        await asyncio.sleep(interval_seconds)
        torch.save(student.model, f=path)
# Run it in the background with the defaults (hourly, to 'autosave.pt').
task = asyncio.create_task(autosave())


In [None]:
# NOTE(review): this cell was left unfinished (`chars_per_token = ` with no
# right-hand side), which is a syntax error. The ratio below is the one
# evaluated in a later scratch cell and is the reciprocal of the ratio inside
# lyles_constant; presumably bytes per GPT-2 token — confirm before relying on it.
chars_per_token = 14818489608/(9115131782/2)

In [None]:
# Scale factor that utf8grade uses to turn a "gpt2 grade" into a "utf8 grade".
# It is the ratio (9115131782/2)/14818489608 (presumably tokens per byte of
# the corpus — TODO confirm) times log(50257)/log(256).
lyles_constant = (9115131782/2)/14818489608 * log(50257)/log(256)
lyles_constant

In [None]:
def utf8grade(x):
    """Rescale grade ``x`` by ``lyles_constant``: ``1 - (1 - x)*lyles_constant``."""
    return 1 - (1 - x)*lyles_constant


# Example conversion; the trailing numbers are other grades the author tried.
grade = .6245  # .49 # .7343
print(f"gpt2 grade = {grade}, utf8 grade = {utf8grade(grade)}, bpc = {(1-utf8grade(grade))*8}")

## parameter histograms

In [None]:
# One density histogram per model parameter, with about sqrt(numel) bins each.
# For every parameter the index, name and standard deviation are printed,
# then the shapes of the bin edges and densities.
# NOTE(review): Plot is not imported anywhere in the visible notebook;
# presumably it is defined in a cell that is not shown.
histograms = []
with torch.no_grad():
    for idx, (pn, p) in enumerate(student.model.named_parameters()):
        print(idx, pn, torch.sqrt(torch.var(p)).item())
        Y, X = np.histogram(
            p.detach().cpu().numpy(),
            bins=int(sqrt(torch.numel(p))),
            density=True,
        )
        print(X.shape, Y.shape)
        histograms.append(Plot(**{f"hist-{idx}": (X.tolist(), Y.tolist())}))

In [None]:
# Display one of the per-parameter histograms; the numbers in the trailing
# comment look like other indices of interest — TODO confirm.
histograms[4] # 3 7 9 13 15 21 43

## batch-level grade histogram

In [None]:
# Density histograms (256 bins on [0, 1]) of the last 5000 recorded grades
# and of the last 5000 baseline grades, plotted together.
# np.histogram returns (densities, bin_edges), hence the "Y, X" order.
Y, X = np.histogram(student.grades[-5000:], bins=256, range=(0,1.0), density=True)
V, U = np.histogram(student.baseline_grades[-5000:], bins=256, range=(0,1.0), density=True)
Plot(**{f"grade-hist": (X, Y), "baseline": (U, V)})

In [None]:
# Display the active model's main size attributes.
# NOTE(review): the MLPLM constructor call in this notebook passes no
# n_layers, so this may raise AttributeError for that model — confirm.
model.n_ctx, model.d_model, model.d_hidden, model.n_layers

## example-level grade histogram

In [None]:
def get_graded_examples(n_batches=16):
    """Grade freshly drawn batches with the student's model and collect the results.

    For each batch, ``student.dataset.batch(student.batch_size,
    student.example_length)`` is evaluated by ``student.model`` without
    gradients. The output is converted to ``1.0 - output`` (presumably turning
    a loss into a grade — TODO confirm), and the per-batch arrays are
    concatenated along axis 0.

    Args:
        n_batches: number of batches to grade. Defaults to 16, the count the
            loop always used.

    Returns:
        The concatenated values converted with ``tolist()``; an empty list
        when ``n_batches`` is 0.
    """
    graded = []
    with torch.no_grad():
        for batch_idx in range(n_batches):
            # The progress total now matches the loop bound; it used to print
            # a hard-coded "/256" while only 16 batches were processed.
            print(f"batch_idx = {batch_idx}/{n_batches}")
            x = student.dataset.batch(student.batch_size, student.example_length)
            print(f"orig {x.shape}")
            y = student.model(x)
            graded.append(1.0 - y.cpu().numpy())
    if not graded:
        # np.concatenate rejects an empty sequence.
        return []
    return np.concatenate(graded, axis=0).tolist()

In [None]:
# Grade a fresh set of batches; the cells below summarise and plot the result.
graded_examples = get_graded_examples()

In [None]:
# Number of graded entries collected.
len(graded_examples)

In [None]:
# Arithmetic mean of the collected grades.
sum(graded_examples)/len(graded_examples)

In [None]:
# Histogram range shared by the example-level plot.
R = (0, 1)


def XYFor(k):
    """Return ``(bin_edges, densities)`` for a histogram of ``graded_examples``.

    About sqrt(len(graded_examples)) bins are spread over the range ``R``.
    The argument ``k`` is accepted but not used.
    """
    n_bins = int(sqrt(len(graded_examples)))
    densities, edges = np.histogram(
        graded_examples, bins=n_bins, range=R, density=True)
    return (edges, densities)


Plot(**{f"examples-hist-{k}": XYFor(k) for k in [1]})

In [None]:
# Code point of the space character (32).
ord(' ')

In [None]:
# `example_grades` is never defined in this notebook; the list built by
# get_graded_examples() is named `graded_examples`, so that is what is
# averaged here (presumably the intended name — confirm).
np.mean(graded_examples)

In [None]:
# Scratch: the stats cell's (1 - grade) * 8 conversion for this grade.
(1 - 0.7870894884999871)*8

In [None]:
# Scratch: the same (1 - grade) * 8 conversion for a grade of 0.8.
(1 - 0.8)*8

In [None]:
# Scratch: the same (1 - grade) * 8 conversion for a grade of 0.9.
(1 - 0.9)*8

In [None]:
# Scratch: small 2x2 uint8 array used to try np.unpackbits in the next cell.
x = np.array([[1,2],[3,4]],dtype=np.uint8)

In [None]:
# Scratch: expand every uint8 into its 8 bits along axis 1.
np.unpackbits(x, axis=1)

In [None]:
# Scratch: reciprocal of the ratio that appears inside lyles_constant
# (presumably bytes per token — TODO confirm).
14818489608/(9115131782/2)

In [None]:
# Scratch arithmetic; 3.25 is presumably the rounded ratio from the previous
# cell — TODO confirm.
2.0*3.25