## import libraries

In [1]:
import asyncio
import math
from math import log, sin, cos, tan, exp, sqrt, pi
import time
from random import randrange
import torch
import numpy as np
from classroom import Classroom
from classroom import Student
from classroom import BytesDataset
from classroom import GutenbergSnippetsDataset
from classroom import GutenbergBitsDataset
from classroom import GutenbergGPT2Dataset
from classroom import MLPLM, MyLM, ABPCNLM
from classroom import TransformerLM
from classroom import AdamW
from classroom import Sonny
from classroom import Floyd
from classroom import Plot
from classroom import Fun
from classroom import Count
from classroom import Sum
from classroom import Diff
from classroom import Log2Sum
from classroom import KalmanFilter1D
from classroom import MedianFilter
from classroom import TwoWindowFilter
from classroom import numel
from classroom import utf8decode, utf8encode, gpt2decode, gpt2encode
from classroom import utf8bitsdecode, utf8bitsencode
from pathlib import Path
import numba

## initialize model

In [2]:
if True:
    path = '2021-10-18-1424.pt'

In [None]:
if True:
    model = torch.load(path).to('cuda')

In [6]:
if True:
    model = (
        ABPCNLM(
            n_vocab_in=50257,
            n_ctx=128,
            d_model=32,
            n_layers=4,
            d_hidden=1024,
            nonlinearity="GELU",
            p_dropout=0.00,
            n_vocab_out=50257).to('cuda'))

In [7]:
numel(model), numel(model)*4/1E9

(90924657, 0.363698628)

In [8]:
numel(model)*3*2.09*2048/1E12

1.16755988355072

In [9]:
model.n_layers

4

## initialize student

In [10]:
optimizer = AdamW(parameters=model.named_parameters())
dataset = GutenbergGPT2Dataset()
batch_size = None
example_length = model.n_ctx + 1
student= Student(
    model=model,
    optimizer=optimizer,
    dataset=dataset,
    batch_size=batch_size,
    example_length=example_length)

## schedule hyperparameters

In [11]:
student.batch_size=2048
for (idx, (pn, p)) in enumerate(student.model.named_parameters()):
    batch_multiplier = 1
    lr_base = 1e-6
    lr = lambda n: lr_base*(n%100) # *(.5 + sin(pi*n/1000)**2)
    student.optimizer.state[pn]["lr"]           = lambda n: lr(n)
    student.optimizer.state[pn]["beta1"]        = lambda n: 0.9
    student.optimizer.state[pn]["beta2"]        = lambda n: 0.999
    student.optimizer.state[pn]["weight_decay"] = lambda n: 0.001
    student.optimizer.state[pn]["update"]       = lambda n: n%batch_multiplier == 0

## initialize baseline

In [80]:
student.reset_baseline()
n_of_last_baseline = len(student.times)-1

## start training

In [13]:
classroom = Classroom()

In [14]:
classroom.enroll(student)

## autocomplete

In [15]:
def autocomplete(model, prompt=None, n_generate=512,
                     n_ctx=None, temp=1.0,
                     encode=None, decode=None, output=None):
    Categorical = torch.distributions.Categorical
    if n_ctx is None:
        n_ctx = model.n_ctx
    if encode is None:
        encode = gpt2encode
    if decode is None:
        decode = gpt2decode
    if prompt is None:
        prompt = decode(student.dataset.batch(1, 2*n_ctx, offset=None).tolist()[0])  # kludge
    x = encode(prompt)
    x = x[-n_ctx:]
    prompt = decode(x)
    print(f"=== Prompt ===\n{prompt}\n=== Autocompletion ===\n")

    def sampler(x):
        x = list(x)
        for _ in range(n_generate):
            probs = model.inference(torch.tensor(x, dtype=torch.long, device="cuda").unsqueeze(0)).view(-1)[-model.n_vocab_out:]
            if temp > 0:
                y = Categorical(probs=probs**(1.0/temp)).sample().item()
            else:
                y = torch.argmax(probs).item()
            x = (x + [y])[-n_ctx:]
            if output is not None:
                output.append(y)
            yield y
    result = decode(list(sampler(x)))
    print(result)


In [99]:
autocomplete(model=student.model, temp=0.8)

=== Prompt ===
 of a
motion institutes that motion. So long as man thinks of the conditions
and motions of disease, so long will the conditions and motions of
disease exist within him. If man will think only of perfect health, the
Principle of Health within him will maintain normal conditions.

To be well, man must form a conception of perfect health, and hold
thoughts harmonious with that conception as regards himself and all
things. He must think only of healthy conditions and functioning; he
must not permit a thought of unhealthy or abnormal conditions or

=== Autocompletion ===

provinces, which is the "regulation" in which thou shalt
brother by the Government in goodness, and to bring these questions, has
been working any other people. It will be the first to leave Posilh
showing that Christianity should be; and as it is for so well-eminence
for the men of all.

It was known in this place is formed of utility that remains
of this, consideration and deed, defects of education and
t

## plots

In [88]:
import time
plot_data = {}
lag = 50
for (idx, student) in enumerate(classroom.students):
    X = Fun(Log2Sum(), student.times)
    Y = Fun(TwoWindowFilter(lag=lag), student.grades)
    Z = Fun(TwoWindowFilter(lag=lag), student.baseline_grades)
    plot_data.update({f"grades-{idx}": (X, Y)})
    plot_data.update({f"baseline-{idx}": (X, Z)})
Plot(**plot_data)



In [24]:
import time
plot_data_2 = {}
lag = 100
for (idx, student) in enumerate(classroom.students):
    X = Fun(Count(), student.times)
    Y = Fun(lambda x, y: x - y, student.grades, student.baseline_grades)
    Y = Fun(TwoWindowFilter(lag=lag), Y.output, aux=Y)
    plot_data_2.update({f"improvement-{idx}": (X, Y)})
Plot(**plot_data_2)



## stats

In [90]:
import time

n = len(student.times)-1
t = time.time()
dn = n - n_of_last_baseline
dt = t - t_of_last_baseline

N = min(1000, dn//2)

y = np.mean(np.array(student.grades[n-N:n]))
y0 = np.mean(np.array(student.baseline_grades[n-N:n]))
dy = y - y0

message = '\n'.join([
    f"batch_size            = {student.batch_size}",
    f"example_length        = {student.example_length}",
    f"100*y                 = {int(y*1e6)/1e4}",
    f"n                     = {n} steps",
    f"t                     = {int(t)} seconds",
    f"n_of_last_baseline    = {n_of_last_baseline} steps",
    f"t_of_last_baseline    = {int(t_of_last_baseline)} seconds",
    f"steps per second      = {dn/dt}",
    f"y0                    = {int(y0*1e6)/1e6}",
    f"dy                    = {int(dy*1e6)/1e6}",
    f"dn                    = {dn}",
    f"dt                    = {dt}",
    f"dy/dn                 = {dy/dn}",
    f"dy/dt                 = {int(1e6 * dy/dt)}e-6 per second",
    f"time to 71            = {(.71-y)/(dy/dt)/3600} hours",
])
print(message)

batch_size            = 2048
example_length        = 129
100*y                 = 65.758
n                     = 134756 steps
t                     = 1634730424 seconds
n_of_last_baseline    = 101230 steps
t_of_last_baseline    = 1634703172 seconds
steps per second      = 1.2302412784605616
y0                    = 0.651364
dy                    = 0.006216
dn                    = 33526
dt                    = 27251.564865350723
dy/dn                 = 1.854237269254388e-07
dy/dt                 = 0e-6 per second
time to 71            = 63.831359605240394 hours


In [82]:
if True:
    student.reset_baseline()
    n_of_last_baseline = len(student.times)-1
    t_of_last_baseline = time.time()

## save

In [None]:
torch.save(student.model, f=path)

In [None]:
import asyncio
async def autosave():
    while True:
        await asyncio.sleep(3600)
        torch.save(student.model, f='autosave.pt')
task = asyncio.create_task(autosave())


In [None]:
lyles_constant = 9115131782/14818489608 #* log(50257)/log(65536)
lyles_constant

In [None]:
utf8grade = lambda x: 1 - (1 - x)*lyles_constant
grade = .655
print(f"gpt2 grade = {grade}, utf8 grade = {utf8grade(grade)}, bpc = {(1-utf8grade(grade))*8}")

## parameter histograms

In [None]:
histograms = []
for (idx, (pn, p)) in enumerate(student.model.named_parameters()):
    with torch.no_grad():
        print(idx, pn, torch.sqrt(torch.var(p)).item())
        Y, X = np.histogram(p.detach().cpu().numpy(), bins=int(sqrt(torch.numel(p))), density=True)
        print(X.shape, Y.shape)
        histograms.append(Plot(**{f"hist-{idx}": (X.tolist(), Y.tolist())}))

In [None]:
histograms[4] # 3 7 9 13 15 21 43

## batch-level grade histogram

In [None]:
Y, X = np.histogram(student.grades[-5000:], bins=256, range=(0,1.0), density=True)
V, U = np.histogram(student.baseline_grades[-5000:], bins=256, range=(0,1.0), density=True)
Plot(**{f"grade-hist": (X, Y), "baseline": (U, V)})

In [None]:
model.n_ctx, model.d_model, model.d_hidden, model.n_layers

## example-level grade histogram

In [None]:
def get_graded_examples():
    result = []
    for batch_idx in range(16):
        print(f"batch_idx = {batch_idx}/256")
        x = student.dataset.batch(student.batch_size, student.example_length)
        print(f"orig {x.shape}")
        with torch.no_grad():
            y = student.model(x)
            y = 1.0 - y.cpu().numpy()
            result.append(y)
    data = np.concatenate(result, axis=0)
    result = data.tolist()
    return result

In [None]:
graded_examples = get_graded_examples()

In [None]:
len(graded_examples)

In [None]:
sum(x for x in graded_examples)/len(graded_examples)

In [None]:
R = (0, 1)
def XYFor(k):
    es = graded_examples
    bins = int(sqrt(len(es)))
    Y, X = np.histogram(es, bins=bins, range=R, density=True)
    return (X, Y)
Plot(**{f"examples-hist-{k}": XYFor(k) for k in [1]})

In [None]:
ord(' ')

In [None]:
np.mean(example_grades)

In [None]:
(1 - 0.7870894884999871)*8

In [None]:
(1 - 0.8)*8

In [None]:
(1 - 0.9)*8

In [None]:
x = np.array([[1,2],[3,4]],dtype=np.uint8)

In [None]:
np.unpackbits(x, axis=1)