# Lab Template

## import libraries

In [1]:
import asyncio
import math
from math import log, sin, cos, tan, exp, sqrt, pi
import time
from random import randrange
import torch
import numpy as np
from classroom import Classroom
from classroom import Student
from classroom import BytesDataset
from classroom import GutenbergSnippetsDataset
from classroom import MLPLM, MyLM
from classroom import TransformerLM
from classroom import AdamW
from classroom import Sonny
from classroom import Floyd
from classroom import Plot
from classroom import Fun
from classroom import Count
from classroom import Sum
from classroom import Diff
from classroom import Log2Sum
from classroom import KalmanFilter1D
from classroom import MedianFilter
from classroom import TwoWindowFilter
from classroom import numel
from classroom import utf8decode

## initialize model

In [2]:
# Resume from a previously saved checkpoint and move it to the GPU.
# NOTE(review): torch.load unpickles arbitrary Python objects, and the save cell
# stores the whole module rather than a state_dict — only load files you trust.
# `if True:` is a manual toggle; the next cell holds the disabled alternative.
if True:
    path = '2021-10-01-1815.pt'
    model = torch.load(path).to('cuda')

In [None]:
# Disabled alternative to loading a checkpoint: build a fresh MLPLM on the GPU.
# Flip this toggle to True (and the load cell's to False) to start from scratch.
if False:
    model = (
        MLPLM(
            n_ctx=512,
            n_vocab_in=256,
            d_model=2,
            d_hidden=[1024, 1024],
            nonlinearity="GELU",
            n_vocab_out=256).to('cuda'))

In [None]:
# Parameter count, and the same count * 4 / 1e9 — presumably the size in GB at
# 4 bytes per parameter (fp32); TODO confirm what classroom.numel counts.
numel(model), numel(model)*4/1E9

## initialize student

In [3]:
# Wire the model, its optimizer and the training data into a Student.
optimizer = AdamW(parameters=model.named_parameters())
dataset = GutenbergSnippetsDataset()

# batch_size starts unset; the hyperparameter cell assigns the real value.
batch_size = None
# One element longer than the model's context window.
example_length = model.n_ctx + 1

student = Student(
    model=model,
    optimizer=optimizer,
    dataset=dataset,
    batch_size=batch_size,
    example_length=example_length,
)

## schedule hyperparameters

In [4]:
# Hyperparameter schedules: every entry stored in the optimizer state is a
# function of the step number n, so values can vary over training.
student.batch_size = 2048

batch_multiplier = 1   # the "update" schedule is True on steps divisible by this
lr_base = 1e-10        # learning-rate increment per step within one ramp


def _make_schedule(lr_base, batch_multiplier, lr_period=5000):
    """Return a fresh {hyperparameter name: schedule function} mapping.

    The values are captured as arguments of this call, so the schedules keep
    working even if a later cell rebinds `lr_base`, `batch_multiplier` or any
    other notebook-level name (closures over notebook globals would look
    those names up again on every call).

    "lr" ramps linearly from 0 up to lr_base * (lr_period - 1) and then
    restarts, i.e. a sawtooth with a period of `lr_period` steps.
    """
    return {
        "lr":           lambda n: lr_base * (n % lr_period),
        "beta1":        lambda n: 0.9,
        "beta2":        lambda n: 0.999,
        "weight_decay": lambda n: 0.001,
        # Presumably tells the optimizer whether to apply a step at iteration n
        # — confirm against classroom.AdamW.
        "update":       lambda n: n % batch_multiplier == 0,
    }


# Every parameter gets the same schedules, keyed by parameter name.
for (pn, p) in student.model.named_parameters():
    for key, schedule in _make_schedule(lr_base, batch_multiplier).items():
        student.optimizer.state[pn][key] = schedule

## initialize baseline

In [5]:
# Reset the student's baseline. Later cells read student.baseline_grades and
# student.time_of_last_baseline; the exact effect is defined in classroom.Student.
student.reset_baseline()

## start training

In [6]:
# Create the Classroom that students are enrolled in below.
classroom = Classroom()

In [7]:
# Enroll the student — presumably this is what starts training (per the section
# heading); confirm against classroom.Classroom.enroll.
classroom.enroll(student)

## autocomplete

In [None]:
def autocomplete(prompt=None, n_generate=1024, temp=1.0, prompt_length=768):
    """Print the student's continuation of `prompt`.

    Args:
        prompt: Text to continue. When None, one example of length
            `prompt_length` is drawn from the student's dataset and decoded.
        n_generate: Forwarded to `student.autocomplete` as `n_generate`.
        temp: Forwarded to `student.autocomplete` as `temp`.
        prompt_length: Length requested from `dataset.batch` when sampling a
            prompt; ignored when `prompt` is given.

    Returns:
        None; the completion is printed.
    """
    if prompt is None:
        # batch(1, length) yields a single example; take row 0 and decode it.
        sampled = student.dataset.batch(1, prompt_length).tolist()[0]
        prompt = student.dataset.decode(sampled)
    print(student.autocomplete(prompt=prompt, n_generate=n_generate, temp=temp))
# Sample a prompt from the dataset and print the model's continuation.
autocomplete()

## plots

In [None]:
import time  # already imported in the first cell; harmless on re-run
plot_data = {}
lag = 2048  # window passed to TwoWindowFilter
# The loop variable is `enrolled`, not `student`: reusing that name would rebind
# the notebook-level `student` to the last enrolled one for every later cell.
for (idx, enrolled) in enumerate(classroom.students):
    X = Fun(Count(), enrolled.times)
    Y = Fun(TwoWindowFilter(lag=lag), enrolled.grades)
    Z = Fun(TwoWindowFilter(lag=lag), enrolled.baseline_grades)
    # One filtered grade curve and one filtered baseline curve per student.
    plot_data[f"grades-{idx}"] = (X, Y)
    plot_data[f"baseline-{idx}"] = (X, Z)
Plot(**plot_data)

In [8]:
import time  # already imported in the first cell; harmless on re-run
plot_data_2 = {}
lag = 1000  # window passed to TwoWindowFilter
# The loop variable is `enrolled`, not `student`: reusing that name would rebind
# the notebook-level `student` to the last enrolled one for every later cell.
for (idx, enrolled) in enumerate(classroom.students):
    X = Fun(Count(), enrolled.times)
    # Per-step difference between the grade and the baseline grade ...
    improvement = Fun(lambda x, y: x - y, enrolled.grades, enrolled.baseline_grades)
    # ... then filtered; `aux` hands the upstream Fun to the filter stage
    # (presumably to keep it alive/updated — confirm against classroom.Fun).
    Y = Fun(TwoWindowFilter(lag=lag), improvement.output, aux=improvement)
    plot_data_2[f"improvement-{idx}"] = (X, Y)
Plot(**plot_data_2)



## stats

In [12]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Names are `enrolled` / `elapsed` rather than `student` / `time` so this cell
# neither rebinds the notebook-level `student` nor shadows the `time` module.
for (idx, enrolled) in enumerate(classroom.students):
    print(f"\nStudent #{idx}\n==========")
    N = 100                        # number of most recent steps to average over
    n = len(enrolled.times)-1
    elapsed = enrolled.time
    # Clamp the window start: with fewer than N steps, n - N is negative and a
    # negative slice start would be taken as an offset from the end of the list.
    recent = slice(max(0, n - N), max(0, n))
    mean_grade = np.mean(np.array(enrolled.grades[recent]))
    mean_baseline_grade = np.mean(np.array(enrolled.baseline_grades[recent]))
    mean_improvement = mean_grade - mean_baseline_grade
    # Improvement gained per second since the baseline was last reset.
    improvement_rate = _ratio(mean_improvement, elapsed - enrolled.time_of_last_baseline)
    message = '\n'.join([
        # All parameters are given the same lr schedule in the hyperparameter
        # cell, so one fixed parameter name is read here as the representative.
        f"lr                    = {enrolled.optimizer.state['language_model.module.layers.0.weight']['lr'](n)}",
        f"batch_size            = {enrolled.batch_size}",
        f"example_length        = {enrolled.example_length}",
        f"n                     = {n}",
        f"time                  = {int(elapsed)}s",
        f"time_of_last_baseline = {int(enrolled.time_of_last_baseline)}s",
        f"steps per second      = {_ratio(n, elapsed)}",
        f"mean_baseline_grade   = {mean_baseline_grade}",
        f"mean_grade            = {mean_grade}",
        f"mean_improvement      = {mean_improvement}",
        f"improvement_rate      = {improvement_rate} per second",
        # Linear extrapolation of the current rate up to a grade of 0.80.
        f"time to 80            = {_ratio(.80 - mean_grade, improvement_rate) / 3600} hours",
    ])
    print(message)


Student #0
lr                    = 2.389e-07
batch_size            = 2048
example_length        = 513
n                     = 42389
time                  = 695s
time_of_last_baseline = 0s
steps per second      = 60.933457885079314
mean_baseline_grade   = 0.6766741839051247
mean_grade            = 0.6767605674266816
mean_improvement      = 8.638352155687645e-05
improvement_rate      = 1.241748253733462e-07 per second
time to 80            = 275.685313926431 hours


In [None]:
# NOTE(review): unexplained scratch estimate — 8192 * 0.61 * parameter count,
# scaled by 1e9. The meaning of the two constants is not recorded; TODO document.
8192*.61*numel(model)/1E9

## save

In [13]:
# Pickle the whole module (not just a state_dict) to the same filename the load
# cell reads, i.e. this overwrites the checkpoint the session started from.
torch.save(student.model, f='2021-10-01-1815.pt')

In [None]:
# Ratio of two hand-entered counts, used below in the gpt2natscale conversion.
# NOTE(review): the provenance of both numbers is not recorded here — presumably
# two size measurements of the same corpus; TODO confirm and cite.
lyles_constant = 9115131782/14818489608 #* log(50257)/log(65536)
lyles_constant

In [None]:
# Hand-entered mean grade fed into the conversions below — presumably copied
# from a stats printout; TODO record which run it came from.
mg = 0.792844

In [None]:
# Rescale (1 - mg) by lyles_constant and log(50257); 50257 is presumably the
# GPT-2 vocabulary size — TODO confirm.
(1 - mg) / lyles_constant * log(50257)  # gpt2natscale (for kaplan paper comparison)

In [None]:
# Inverts the relation grade = 1 - (bits per character) / 8 implied by this formula.
(1 - mg) * 8  #  bits per character

In [None]:
# 2 ** (bits per character), i.e. the perplexity per character.
2**((1 - mg) * 8)  # perplexity

## parameter histograms

In [None]:
# One density histogram per parameter tensor, stored in `histograms` by index.
histograms = []
with torch.no_grad():
    for (idx, (pn, p)) in enumerate(student.model.named_parameters()):
        # Spread of the tensor's entries: square root of its variance.
        print(idx, pn, p.var().sqrt().item())
        values = p.detach().cpu().numpy()
        n_bins = int(sqrt(p.numel()))
        Y, X = np.histogram(values, bins=n_bins, density=True)
        # X holds the bin edges, so it is one element longer than Y.
        print(X.shape, Y.shape)
        histograms.append(Plot(**{f"hist-{idx}": (X.tolist(), Y.tolist())}))

In [None]:
# Display one stored histogram; the trailing numbers are presumably other
# parameter indices worth inspecting.
histograms[43] # 3 7 9 13 15 21 43

## batch-level grade histogram

In [None]:
# Density histograms (256 bins over [0, 1]) of the 5000 most recent batch grades
# and of the matching baseline grades, overlaid in a single plot.
recent = slice(-5000, None)
Y, X = np.histogram(
    student.grades[recent], bins=256, range=(0, 1.0), density=True)
V, U = np.histogram(
    student.baseline_grades[recent], bins=256, range=(0, 1.0), density=True)
Plot(**{"grade-hist": (X, Y), "baseline": (U, V)})

In [None]:
# Quick look at the model's size attributes.
model.n_ctx, model.d_model, model.d_hidden, model.n_layers

## example-level grade histogram

In [None]:
def get_graded_examples(n_batches=256):
    """Grade freshly sampled batches and return one row per example.

    Draws `n_batches` batches of `student.batch_size` examples of length
    `student.example_length`, runs the model on each without gradients, and
    concatenates every example with `1.0 - model_output` along axis 1.

    Args:
        n_batches: Number of batches to sample (default 256, the previously
            hard-coded value).

    Returns:
        A list of rows (plain Python lists): the example's values followed by
        its `1.0 - output` value(s); an empty list when `n_batches` is 0.
        NOTE(review): downstream cells read `row[-1]` as the example's grade,
        which assumes the model output has one column per example — confirm
        against the model's forward().
    """
    rows = []
    for batch_idx in range(n_batches):
        print(f"batch_idx = {batch_idx}/{n_batches}")
        x = student.dataset.batch(student.batch_size, student.example_length)
        with torch.no_grad():
            y = student.model(x)
        rows.append(
            np.concatenate([x.cpu().numpy(), 1.0 - y.cpu().numpy()], axis=1))
    if not rows:
        # np.concatenate raises on an empty sequence.
        return []
    return np.concatenate(rows, axis=0).tolist()

In [None]:
# Grade 256 batches of student.batch_size examples each and keep the rows.
graded_examples = get_graded_examples()

In [None]:
# Mean of the last column (read as the per-example grade) over all rows.
sum(x[-1] for x in graded_examples)/len(graded_examples)

In [None]:
# Bucket each example's grade by k, the smallest offset in 1..63 for which the
# value k positions before the final column is a space, newline or carriage
# return. Examples with no such value in range are left out; bucket 0 stays empty.
example_grades = [[] for _ in range(64)]
for example in graded_examples:
    k = next(
        (offset for offset in range(1, 64)
         if int(example[-offset - 1]) in (ord(' '), ord('\n'), ord('\r'))),
        None,
    )
    if k is not None:
        example_grades[k].append(example[-1])

In [None]:
# Per bucket: index, number of examples, mean grade; then the overall count.
tot = 0
for k in range(64):
    bucket = example_grades[k]
    # True mean; NaN for an empty bucket. (Dividing by len + 1 avoids the
    # ZeroDivisionError but biases every non-empty bucket's mean low.)
    mean_grade = sum(bucket) / len(bucket) if bucket else float("nan")
    print(f"{k}, {len(bucket)}, {mean_grade}")
    tot += len(bucket)
print(tot)

In [None]:
# NOTE(review): hand-entered ratio — presumably one bucket's count over the
# total printed above; TODO record which bucket.
85680/524231

In [None]:
R = (0, 1)  # histogram range for grades
def XYFor(k):
    """Histogram of the grades in bucket `k` of `example_grades`.

    Uses about sqrt(len(bucket)) bins over the range `R`, normalised as a
    density whenever the bucket is non-empty.

    Returns:
        (X, Y) where X is the array of bin edges and Y the per-bin values,
        so X is one element longer than Y.
    """
    es = example_grades[k]
    # At least one bin: int(sqrt(0)) == 0 would make np.histogram raise.
    bins = max(1, int(sqrt(len(es))))
    # Density normalisation divides by the sample count; skip it for an empty
    # bucket so the result is zeros rather than NaN.
    Y, X = np.histogram(es, bins=bins, range=R, density=len(es) > 0)
    return (X, Y)
# Overlay the grade histograms of buckets 1, 2 and 3.
Plot(**{f"examples-hist-{k}": XYFor(k) for k in [1, 2, 3]})

In [None]:
# Code point of a space, one of the whitespace values matched when bucketing.
ord(' ')

In [None]:
# example_grades is a ragged list of 64 buckets, which np.mean cannot turn into
# a regular array; average over the flattened grades instead.
np.mean([grade for bucket in example_grades for grade in bucket])

In [None]:
# Bits per character for a hand-entered grade, via the (1 - grade) * 8 conversion.
(1 - 0.7870894884999871)*8

In [None]:
# Bits per character at a grade of 0.8, via the (1 - grade) * 8 conversion.
(1 - 0.8)*8

In [None]:
# Bits per character at a grade of 0.9, via the (1 - grade) * 8 conversion.
(1 - 0.9)*8

In [None]:
# Scratch input for trying np.unpackbits: a 2x2 array of uint8 values.
x = np.array([[1,2],[3,4]],dtype=np.uint8)

In [None]:
# Expand every byte into its 8 bits along axis 1, giving a (2, 16) array.
np.unpackbits(x, axis=1)