In [None]:
# Auto-reload imported modules before each cell executes, so edits to the
# local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Read the whole corpus; "utf-8-sig" strips a leading byte-order mark if present.
with open("./wizard-of-oz.txt", encoding="utf-8-sig") as corpus_file:
    txt = corpus_file.read()

# Character-level vocabulary: every distinct character of the corpus, sorted so
# that token ids are stable from run to run.
VOCAB = sorted(set(txt))
VOCAB_SIZE = len(VOCAB)
decode_dict = dict(enumerate(VOCAB))                            # token id -> character
encode_dict = {char: idx for idx, char in decode_dict.items()}  # character -> token id


def tok_encode(text):
    """Return the list of token ids for `text`; raises KeyError for characters outside VOCAB."""
    return [encode_dict[char] for char in text]


def tok_decode(tok_indices):
    """Return the string obtained by concatenating the characters of the given token ids."""
    return "".join(decode_dict[tok] for tok in tok_indices)


# Round-trip sanity check: decoding the encoded corpus must reproduce it exactly.
txt == tok_decode(tok_encode(txt))

In [None]:
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Fraction of the corpus (taken from the start) used for training; the
# remaining tail is held out for validation.
TRAIN_FRACTION = 0.9

# Encode the whole corpus once as int64 token ids, created directly on the
# target device. `torch.tensor(..., dtype=torch.long)` replaces the legacy
# `torch.LongTensor` constructor and matches how the prompts are built in the
# generation cells further down.
data = torch.tensor(tok_encode(txt), dtype=torch.long, device=DEVICE)
split = int(len(data) * TRAIN_FRACTION)
train_data = data[:split]  # tokens before the split point
val_data = data[split:]    # tokens from the split point onwards

# Bigram Language Model


In [None]:
import torch
from torch.utils.data import DataLoader
from src import bigram

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: the bigram model is bound to the name `model_nn_smol`; the
# "Smol Neural Model" section later rebinds that name to a different model.
model_nn_smol = bigram.BigramLanguageModel(vocab_sz=VOCAB_SIZE).to(DEVICE)

num_epochs = 16
batch_size = 64
dataset = bigram.BigramDataset(txt_tensor=train_data, device=DEVICE)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
optimizer = torch.optim.AdamW(model_nn_smol.parameters(), lr=1e-4)

# Learning-curve bookkeeping: one entry per evaluation point.
num_samples, train_losses, val_losses = [], [], []
samples_counter = 0

for num_epoch in range(num_epochs):
    for step, (x, y) in enumerate(dataloader, start=1):
        loss = model_nn_smol.train_batch(x, y, optimizer)
        samples_counter += len(x)

        # Only evaluate every 1000th batch and on the last batch of the epoch.
        if step % 1000 != 0 and step != len(dataloader):
            continue

        # Full-split evaluation on adjacent (current, next) token pairs.
        with torch.no_grad():
            train_loss = model_nn_smol.compute_loss(train_data[:-1], train_data[1:])
            val_loss = model_nn_smol.compute_loss(val_data[:-1], val_data[1:])
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            num_samples.append(samples_counter)

        # Carriage returns rewind the cursor so the progress line is overwritten in place.
        print("\r" * 100 + f"epoch {num_epoch}: {step}/{len(dataloader)}", end="", flush=True)
print()

In [None]:
import pandas as pd
import janitor  # noqa: F401 -- imported for its side effect: registers DataFrame.pivot_longer
from plotnine import aes, geom_line, ggplot, labs

# Reshape to long format: one row per (num_samples, curve). The curve name goes
# into "what"; the loss goes into pyjanitor's default "value" column.
# Without the `janitor` import above, `pivot_longer` does not exist on a
# DataFrame and this cell fails on a fresh kernel.
df = pd.DataFrame.from_dict({
    "num_samples": num_samples,
    "train_losses": train_losses,
    "val_losses": val_losses,
}).pivot_longer(column_names=["train_losses", "val_losses"], names_to="what")

(
    ggplot(df)
    + geom_line(aes(x="num_samples", y="value", color="what"))
    + labs(
        x="training samples seen",
        y="loss",
        color="curve",
        title="Bigram model: training vs. validation loss",
    )
)

In [None]:
# Loss of the bigram model over all adjacent (current, next) token pairs of the training split.
# NOTE(review): unlike the evaluations inside the training loop, this call is not wrapped in
# torch.no_grad() -- confirm whether compute_loss disables autograd itself.
model_nn_smol.compute_loss(train_data[:-1], train_data[1:])

In [None]:
# Softmax over the last dimension of the bigram model's `lut` weights, detached and moved to CPU.
# Presumably row i is the learned next-token distribution after token i -- verify in src/bigram.py.
lut = model_nn_smol.lut.weight.detach().to("cpu").softmax(dim=-1)

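Lesson learned

- `str.count()` does not count overlapping matches (e.g. `"aaa".count("aa")` is 1, not 2), so it undercounts bigrams; count adjacent token pairs instead.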


In [None]:
# Bigram lookup table built by hand: count how often token c2 directly follows
# token c1 in the training split, then normalise every row to probabilities.
prev_tokens = train_data[:-1].cpu()
next_tokens = train_data[1:].cpu()

# Vectorised counting: encode each (prev, next) pair as one flat index and let
# bincount tally them, instead of one tensor update per token in a Python loop.
pair_ids = prev_tokens * VOCAB_SIZE + next_tokens
lut_counting = (
    torch.bincount(pair_ids, minlength=VOCAB_SIZE * VOCAB_SIZE)
    .reshape(VOCAB_SIZE, VOCAB_SIZE)
    .float()
)

# Normalise the raw counts directly (no softmax: these are counts, not logits).
# A token that never occurs as a predecessor in the training split has an
# all-zero row; clamping the divisor keeps that row at zero instead of turning
# it into NaN through 0/0.
row_totals = lut_counting.sum(dim=1, keepdim=True)
lut_counting = lut_counting / row_totals.clamp(min=1)
lut_counting

In [None]:
# Row 0 of the learned table: the softmaxed `lut` weights for token id 0.
lut[0, :]

In [None]:
# Row 0 of the count-based table, for comparison with `lut[0, :]`.
lut_counting[0, :]

In [None]:
import numpy as np

# Baseline sample: 100 token ids drawn uniformly at random from the vocabulary,
# decoded one by one and concatenated into a single string.
idx = np.random.randint(low=0, high=VOCAB_SIZE, size=(100,))
out = "".join(tok_decode([token_id]) for token_id in idx)

out


In [None]:
# Sample from the bigram model, seeded with the single token id 0, and decode the result.
# The second argument (64) is presumably the number of tokens to generate -- verify in src/bigram.py.
tok_indices = model_nn_smol.generate(torch.tensor([0], device=DEVICE), 64)
tok_decode(tok_indices)

In [None]:
# Loss of the bigram model over all adjacent (current, next) token pairs of the validation split.
model_nn_smol.compute_loss(val_data[:-1], val_data[1:])

In [None]:
# Reference value: -log(1/V) = log(V), the negative log-likelihood of a uniform guess
# over the V = VOCAB_SIZE vocabulary entries.
-np.log(1/VOCAB_SIZE)

# Neural Language Model


## Smol Neural Model


In [None]:
from src import neural

# Hyper-parameters of the small neural model; `context_size` is not given, so
# the class default applies.
smol_config = dict(vocab_sz=VOCAB_SIZE, embedding_size=84, lin_size=64)

# From here on `model_nn_smol` refers to this neural model rather than the bigram model.
model_nn_smol = neural.NeuralLanguageModel(**smol_config)
model_nn_smol.to(DEVICE)


In [None]:
from torch.utils.data import DataLoader

num_epochs = 16
batch_size = 64

# Single source of truth for the window length. The dataset and the evaluation
# windows below must all use the model's context size; the previous hard-coded
# "- 3" only worked while that size happened to be 3.
smol_context_size = model_nn_smol.context_size

dataset = neural.NeuralDataset(txt_tensor=train_data, device=DEVICE, context_size=smol_context_size)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
optimizer = torch.optim.AdamW(model_nn_smol.parameters(), lr=1e-5)

# Learning-curve bookkeeping: one entry per evaluation point.
num_samples = []
samples_counter = 0
train_losses = []
val_losses = []

# Evaluation tensors: row i of *_x holds tokens [i, i + context_size) and
# *_y[i] is the token that follows that window.
train_data_x = torch.stack([
    train_data[ii:ii + smol_context_size]
    for ii in range(len(train_data) - smol_context_size)
])
train_data_y = train_data[smol_context_size:]
val_data_x = torch.stack([
    val_data[ii:ii + smol_context_size]
    for ii in range(len(val_data) - smol_context_size)
])
val_data_y = val_data[smol_context_size:]

for num_epoch in range(num_epochs):
    for ii, (x, y) in enumerate(dataloader):
        model_nn_smol.train_batch(x, y, optimizer)
        samples_counter += len(x)

        # Evaluate every 1000 batches and after the last batch of each epoch.
        if (ii+1) % 1000 == 0 or ii + 1 == len(dataloader):
            with torch.no_grad():
                train_loss = model_nn_smol.compute_loss(train_data_x, train_data_y)
                val_loss = model_nn_smol.compute_loss(val_data_x, val_data_y)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                num_samples.append(samples_counter)

            # Carriage returns rewind the cursor so the progress line is overwritten in place.
            print("\r" * 100 + f"epoch {num_epoch}: {ii+1}/{len(dataloader)}", end="", flush=True)
print()

In [None]:
import pandas as pd
import janitor  # noqa: F401 -- imported for its side effect: registers DataFrame.pivot_longer
from plotnine import aes, geom_line, ggplot, labs

# Reshape to long format: one row per (num_samples, curve). The curve name goes
# into "what"; the loss goes into pyjanitor's default "value" column.
df = pd.DataFrame.from_dict({
    "num_samples": num_samples,
    "train_losses": train_losses,
    "val_losses": val_losses,
}).pivot_longer(column_names=["train_losses", "val_losses"], names_to="what")

# Explicit plotnine imports (instead of `import *`) keep the notebook namespace
# clean; labels make the figure readable on its own.
(
    ggplot(df)
    + geom_line(aes(x="num_samples", y="value", color="what"))
    + labs(
        x="training samples seen",
        y="loss",
        color="curve",
        title="Smol neural model: training vs. validation loss",
    )
)

In [None]:
# (train loss, validation loss) of the smol neural model on the windowed evaluation tensors.
# NOTE(review): not wrapped in torch.no_grad(), unlike the evaluations in the training loop -- confirm
# whether compute_loss disables autograd itself.
model_nn_smol.compute_loss(train_data_x, train_data_y), model_nn_smol.compute_loss(val_data_x, val_data_y),


In [None]:
# Generate text with the smol neural model, starting from the encoded prompt "Hi ".
# NOTE(review): the prompt is 3 tokens long, presumably to match the model's context size -- confirm.
# The second argument to generate (1024) is presumably the number of tokens to sample.
prompt=torch.tensor(tok_encode("Hi "), dtype=torch.long, device=DEVICE)
tok = model_nn_smol.generate(prompt, 1024)
print(tok_decode(tok))

## Swole Neural Model


In [None]:
import torch.nn.functional as F
from src import neural

model_nn_swole = neural.NeuralLanguageModel(
    vocab_sz=VOCAB_SIZE,
    embedding_size=384,
    lin_size=512,
    context_size=16,
    nonlin = F.relu,
)
model_nn_swole.to(DEVICE)


def _context_windows(tokens, context_size):
    """Return (x, y): every `context_size`-long window of `tokens` and the token following each window."""
    windows = torch.stack([
        tokens[start:start + context_size]
        for start in range(len(tokens) - context_size)
    ])
    return windows, tokens[context_size:]


# Evaluate on windows built with *this* model's context size. On a top-to-bottom
# run, `train_data_x` / `val_data_x` still hold the windows made for the smol
# model at this point, so reusing them here only worked if the swole training
# cell below had already been executed once.
swole_train_x, swole_train_y = _context_windows(train_data, model_nn_swole.context_size)
swole_val_x, swole_val_y = _context_windows(val_data, model_nn_swole.context_size)

# (parameter count, train loss before training, validation loss before training)
(
    sum(p.numel() for p in model_nn_swole.parameters()),
    model_nn_swole.compute_loss(swole_train_x, swole_train_y),
    model_nn_swole.compute_loss(swole_val_x, swole_val_y),
)


In [None]:
from torch.utils.data import DataLoader

num_epochs = 16
batch_size = 64
ctx = model_nn_swole.context_size  # window length shared by the dataset and the evaluation tensors

dataset = neural.NeuralDataset(txt_tensor=train_data, device=DEVICE, context_size=ctx)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
optimizer = torch.optim.AdamW(model_nn_swole.parameters(), lr=1e-7)

# Learning-curve bookkeeping: one entry per evaluation point.
num_samples, train_losses, val_losses = [], [], []
samples_counter = 0

# Evaluation tensors: row i of *_x holds tokens [i, i + ctx) and *_y[i] is the
# token that follows that window.
train_data_x = torch.stack([train_data[start:start + ctx] for start in range(len(train_data) - ctx)])
train_data_y = train_data[ctx:]
val_data_x = torch.stack([val_data[start:start + ctx] for start in range(len(val_data) - ctx)])
val_data_y = val_data[ctx:]

for num_epoch in range(num_epochs):
    for step, (x, y) in enumerate(dataloader, start=1):
        loss = model_nn_swole.train_batch(x, y, optimizer)
        samples_counter += len(x)

        # Only evaluate every 1000th batch and on the last batch of the epoch.
        if step % 1000 != 0 and step != len(dataloader):
            continue

        with torch.no_grad():
            train_loss = model_nn_swole.compute_loss(train_data_x, train_data_y)
            val_loss = model_nn_swole.compute_loss(val_data_x, val_data_y)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            num_samples.append(samples_counter)

        # Carriage returns rewind the cursor so the progress line is overwritten in place.
        print("\r" * 100 + f"epoch {num_epoch}: {step}/{len(dataloader)}, {train_loss=:.2f}, {val_loss=:.2f}", end="", flush=True)
print()


In [None]:
import pandas as pd
import janitor  # noqa: F401 -- imported for its side effect: registers DataFrame.pivot_longer
from plotnine import aes, geom_line, ggplot, labs

# Reshape to long format: one row per (num_samples, curve). The curve name goes
# into "what"; the loss goes into pyjanitor's default "value" column.
df = pd.DataFrame.from_dict({
    "num_samples": num_samples,
    "train_losses": train_losses,
    "val_losses": val_losses,
}).pivot_longer(column_names=["train_losses", "val_losses"], names_to="what")

# Explicit plotnine imports (instead of `import *`) keep the notebook namespace
# clean; labels make the figure readable on its own.
(
    ggplot(df)
    + geom_line(aes(x="num_samples", y="value", color="what"))
    + labs(
        x="training samples seen",
        y="loss",
        color="curve",
        title="Swole neural model: training vs. validation loss",
    )
)


In [None]:
# (train loss, validation loss) of the swole neural model on the windowed evaluation tensors.
# NOTE(review): not wrapped in torch.no_grad(), unlike the evaluations in the training loop -- confirm
# whether compute_loss disables autograd itself.
model_nn_swole.compute_loss(train_data_x, train_data_y), model_nn_swole.compute_loss(val_data_x, val_data_y),


In [None]:
# Generate text with the swole model. The prompt string is truncated to the model's
# context size before encoding, so at most `context_size` tokens are passed in.
# The second argument to generate (1024) is presumably the number of tokens to sample.
prompt=torch.tensor(tok_encode("Is this the real life?"[:model_nn_swole.context_size]), dtype=torch.long, device=DEVICE)
tok = model_nn_swole.generate(prompt, 1024)
print(tok_decode(tok))


# Transformer Model


In [None]:
from src import transformer

In [None]:
# Build the transformer together with its training setup from the raw corpus text.
# The grouping of the arguments below is inferred from their names only; see
# src/transformer.py for their exact meaning.
makeTransformer = transformer.MakeTransformer(
    transformerClass=transformer.Transformer,
    textCorpus=txt,
    # model size
    numLayers=5,
    embeddingSize=350,
    headSize=7,
    blockSize=64,
    linScale=3,
    dropout=0.2,
    # optimisation
    maxIters=5_000,
    learningRate=5e-5,
    batchSize=64,
    # evaluation schedule
    evalInterval=500,
    evalIters=200,
)

# Total number of parameters of the constructed model.
sum(param.numel() for param in makeTransformer.model.parameters())

In [None]:
# Train the transformer with the settings passed to MakeTransformer above.
makeTransformer.train_model()

In [None]:
# Sample from the transformer, starting from the prompt ". ".
# The second argument (500) is presumably the amount of text to generate -- verify in src/transformer.py.
print(makeTransformer.model.generate(". ", 500))