In [1]:
import asyncio
import math
from math import log, sin, cos, tan, exp, sqrt, pi
import time
from random import randrange
import torch
import numpy as np
from classroom import Classroom
from classroom import Student
from classroom import BytesDataset
from classroom import GutenbergSnippetsDataset
from classroom import MLPLM, MyLM
from classroom import TransformerLM
from classroom import AdamW
from classroom import Sonny
from classroom import Floyd
from classroom import Plot
from classroom import Fun
from classroom import Count
from classroom import Sum
from classroom import Diff
from classroom import Log2Sum
from classroom import KalmanFilter1D
from classroom import MedianFilter
from classroom import TwoWindowFilter
from classroom import numel

In [2]:
if True:
    # Toggle: load a previously saved model (the `if False:` cell below builds
    # a fresh one instead).
    path = 'MyLM_32_256_32_1_8192_GELU_0_256.pt'
    # Fall back to the CPU when no GPU is visible so the notebook still runs.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE(security): torch.load unpickles a whole model object, which can run
    # arbitrary code -- only load checkpoints you produced yourself.
    # NOTE(review): recent PyTorch releases may need weights_only=False to load
    # a full-model checkpoint like this one -- confirm for the installed version.
    model = torch.load(path, map_location=device).to(device)

In [None]:
if False:
    # Disabled alternative to loading a checkpoint: build a fresh MyLM.
    # Flip the condition to True (and the load cell to False) to use it.
    fresh_model_kwargs = dict(
        n_ctx=256,
        n_vocab_in=256,
        d_model=8,
        n_layers=2,
        d_hidden=4096,
        nonlinearity="GELU",
        p_dropout=0.0,
        n_vocab_out=256,
    )
    model = MyLM(**fresh_model_kwargs).to('cuda')

In [3]:
# Build the optimizer and dataset, then bundle them with the model in a Student.
optimizer = AdamW(parameters=model.named_parameters())
dataset = GutenbergSnippetsDataset()
batch_size = 512
# One more than the model's context length.
# NOTE(review): presumably the extra position is the next-token target -- confirm.
example_length = model.n_ctx + 1
student_config = {
    "model": model,
    "optimizer": optimizer,
    "dataset": dataset,
    "batch_size": batch_size,
    "example_length": example_length,
}
student = Student(**student_config)

In [4]:
# Hyperparameter schedules shared by every parameter. They do not depend on
# the parameter, so they are defined once instead of on each loop iteration.
# The lambdas stored below look up `lr`, `s` and `batch_multiplier` in the
# notebook namespace when they are called, so rebinding any of these names in
# a later cell changes the live schedule.
batch_multiplier = 2
lr = lambda n: 1.0e-5
# Oscillates between 0.25 and 0.75 with a period of 1000 in n.
s = lambda n: .25+.5*sin(pi*n/(1000))**2
student.batch_size=1024

# Install the same schedule callables in the optimizer state of each named
# parameter. Each entry is a function of n.
# NOTE(review): n is presumably the optimizer step index -- confirm in AdamW.
for pn, _param in student.model.named_parameters():
    state = student.optimizer.state[pn]
    state["lr"]           = lambda n: lr(n) * s(n)
    state["beta1"]        = lambda n: 0.9
    state["beta2"]        = lambda n: 0.999
    state["weight_decay"] = lambda n: 0.001 * s(n)
    # True on every batch_multiplier-th value of n.
    # NOTE(review): presumably this gates when accumulated gradients are applied -- confirm.
    state["update"]       = lambda n: n%batch_multiplier == 0

In [5]:
# Display numel(model) -- presumably the model's total parameter count (confirm in classroom.numel).
numel(model)

60869888

In [6]:
# NOTE(review): presumably push() snapshots the current model as the baseline -- confirm in classroom.Student.
student.push()
# Remember the student's clock at this point; the stats cell divides the mean
# improvement by the time elapsed since this value.
time_of_last_baseline = student.time

In [7]:
# Create the Classroom that students are enrolled in.
classroom = Classroom()

In [8]:
# Enroll the student. The recorded `classroom.students` output shows that this
# creates an asyncio Task running Classroom.enroll.<locals>._train.
classroom.enroll(student)

In [9]:
# Display the mapping of enrolled Student -> its training Task.
classroom.students

{<classroom.student.student.Student at 0x7f4214292520>: <Task pending name='Task-1' coro=<Classroom.enroll.<locals>._train() running at /home/sharker/github/classroom/classroom/classroom/classroom.py:18> wait_for=<Future finished result=None>>}

In [10]:
# Display numel(student.model) -- presumably the total parameter count (confirm in classroom.numel).
numel(student.model)

60869888

## Autocompleting

In [11]:
def autocomplete(prompt=None, n_generate=1024):
    """Print an autocompletion from every student enrolled in `classroom`.

    Args:
        prompt: Forwarded unchanged to ``Student.autocomplete``. Defaults to
            ``None``. NOTE(review): the recorded output shows a prompt even
            for ``None``, so the student presumably picks one itself -- confirm.
        n_generate: Forwarded as ``n_generate`` to ``Student.autocomplete``.
            Defaults to 1024, the value that was previously hard-coded.
    """
    for (idx, student) in enumerate(classroom.students):
        print(f"\n\nStudent #{idx}\n==========")
        print(student.autocomplete(prompt=prompt, n_generate=n_generate))
autocomplete()



Student #0
=== Prompt ===
and in the arts, from the ponderous pit-saw to the finest
lance
=== Autocompletion ===

oling-princise."

Now shring the lady way! She rose with you, see!" exclaimed Charlon, I was the Amzic of an April and a Bantino. Siburban Church.

Unfirst red Biggs had reignais of a reed room without their
pourence, all in his college and his wife had passed it
a gor. In it.  She's good a few weeps will about 11 a child of any of the train of the old tappeared,
was sprintly estimated to a common rumo, or emoracles de elgo di
Carson, condemnedente
    International Editorworms,
Cardinal, 1111, p. 497: and incompetus rendered to the class, preached him as isolated him.  It is two legal to my very cutting out our kneelinen;
murmureder, sinä voient quoi tacune gravailla.

E Quetend, lui dit-il,
  Hellen se prouve matinality, così nuori situa. De June posarie viento, le porte, change view of the countery rain; it was conversed, between himself and
diama had of ideal in a sei

## Plots

In [12]:
import time
# Per enrolled student: grades and baseline grades, each passed through a
# TwoWindowFilter with the same lag, plotted against Count() over student.times.
plot_data = {}
lag = 1024
for idx, student in enumerate(classroom.students):
    X = Fun(Count(), student.times)
    Y = Fun(TwoWindowFilter(lag=lag), student.grades)
    Z = Fun(TwoWindowFilter(lag=lag), student.baseline_grades)
    plot_data[f"grades-{idx}"] = (X, Y)
    plot_data[f"baseline-{idx}"] = (X, Z)
Plot(**plot_data)



In [13]:
import time
# Per enrolled student: (grade - baseline grade), filtered with a
# TwoWindowFilter, plotted against Sum() over student.times.
plot_data_2 = {}
lag = 8192
for idx, student in enumerate(classroom.students):
    X = Fun(Sum(), student.times)
    raw_gap = Fun(lambda x, y: x - y, student.grades, student.baseline_grades)
    # The unfiltered difference is handed over as `aux` alongside its output.
    Y = Fun(TwoWindowFilter(lag=lag), raw_gap.output, aux=raw_gap)
    plot_data_2[f"improvement-{idx}"] = (X, Y)
Plot(**plot_data_2)



## Some stats

In [None]:
# List the keys of the optimizer state (the schedule cell indexes this state
# by parameter name).
for key in student.optimizer.state:
    print(key)

In [14]:
# Size of the trailing window over which the means below are taken.
N = 8192
for (idx, student) in enumerate(classroom.students):
    print(f"\nStudent #{idx}\n==========")
    n = len(student.times)-1
    # Clamp the window start at 0. Whenever n < N the raw start n - N is
    # negative and Python counts it from the END of the sequence, which
    # silently shrinks the window (and can empty it) instead of taking
    # everything recorded so far.
    start = max(0, n - N)
    # Named `elapsed` rather than `time` so the imported `time` module is not
    # rebound to a float for the rest of the session.
    elapsed = student.time #sum(student.times[:n])
    mean_grade = np.mean(np.array(student.grades[start:n]))
    mean_baseline_grade = np.mean(np.array(student.baseline_grades[start:n]))
    mean_predicted_grade = np.mean(np.array(student.predicted_grades[start:n]))
    # One minus the relative error of the predicted grade against the actual grade.
    accuracy = 1.0 - abs(mean_predicted_grade - mean_grade)/(mean_grade)

    mean_improvement = mean_grade - mean_baseline_grade
    # Improvement per second since `time_of_last_baseline` was recorded.
    improvement_rate = mean_improvement / (elapsed - time_of_last_baseline)
    # Seconds needed to gain 0.01 at the current improvement rate.
    time_to_level = 0.01/improvement_rate
    message = '\n'.join([
        f"lr                    = {student.optimizer.state['language_model.module.layers.0.weight']['lr'](n)}",
        f"batch_size            = {student.batch_size}",
        f"example_length        = {student.example_length}",
        f"n                     = {n}",
        f"time                  = {int(elapsed)}s",
        f"time_of_last_baseline = {int(time_of_last_baseline)}s",
        f"steps per second      = {(n/elapsed)}",
        f"mean_baseline_grade   = {mean_baseline_grade}",
        f"mean_grade            = {mean_grade}",
        f"mean_predicted_grade  = {mean_predicted_grade}",
        f"accuracy              = {accuracy}",
        f"mean_improvement      = {mean_improvement}",
        f"improvement_rate      = {improvement_rate} per second",
        f"time_to_level         = {time_to_level}"
    ])
    print(message)


Student #0
lr                    = 4.363073185791546e-06
batch_size            = 1024
example_length        = 33
n                     = 209
time                  = 27s
time_of_last_baseline = 0s
steps per second      = 7.7138483093247325
mean_baseline_grade   = 0.8517923010023016
mean_grade            = 0.8521360069442023
mean_predicted_grade  = 0.8511586687817243
accuracy              = 0.9988530725676259
mean_improvement      = 0.00034370594190069603
improvement_rate      = 1.268562439634234e-05 per second
time_to_level         = 788.2938740392874


## Saving and parameter histograms

In [18]:
# Serialize the whole model object (not just a state_dict) to the same file
# name that the load cell at the top of the notebook reads.
torch.save(student.model, f='MyLM_32_256_32_1_8192_GELU_0_256.pt')

In [None]:
# One density histogram per named parameter, collected in `histograms`.
histograms = []
for (idx, (pn, p)) in enumerate(student.model.named_parameters()):
    with torch.no_grad():
        # Standard deviation of the parameter's entries.
        print(idx, pn, torch.sqrt(torch.var(p)).item())
        values = p.detach().cpu().numpy()
        # Square-root rule for the bin count; at least one bin so that an
        # empty parameter does not request bins=0.
        n_bins = max(1, int(sqrt(values.size)))
        Y, X = np.histogram(values, bins=n_bins, density=True)
        print(X.shape, Y.shape)
        # np.histogram returns n_bins + 1 bin edges but only n_bins densities.
        # Plot each density at its bin centre so x and y have equal length.
        centers = 0.5 * (X[:-1] + X[1:])
        histograms.append(Plot(**{f"hist-{idx}": (centers.tolist(), Y.tolist())}))

In [None]:
# Display the plot stored at index 3 of `histograms`.
histograms[3]

## Grade/loss histogram

In [None]:
# Density histograms of the grades and the baseline grades over [0.8, 1.0].
Y, X = np.histogram(student.grades, bins=256, range=(.8,1.0), density=True)
V, U = np.histogram(student.baseline_grades, bins=256, range=(.8,1.0), density=True)
# np.histogram returns 257 bin edges for 256 densities. Plot each density at
# its bin centre so that x and y have the same length.
X_centers = 0.5 * (X[:-1] + X[1:])
U_centers = 0.5 * (U[:-1] + U[1:])
# Plain lists, as in the per-parameter histogram cell.
Plot(**{"grade-hist": (X_centers.tolist(), Y.tolist()), "baseline": (U_centers.tolist(), V.tolist())})

In [17]:
# Display the model's n_ctx, d_model, d_hidden and n_layers attributes (the
# same names appear as MyLM constructor arguments earlier in the notebook).
model.n_ctx, model.d_model, model.d_hidden, model.n_layers

(32, 32, 8192, 1)