# Logs
## No weight sharing + direct
- `embedding_dim`: 32
- `window_size`: 4
- Train Metrics: Train Loss: 5.4947, Train Entropy: 7.9272, Train Perplexity: 243.41
- Val Metrics: Val Loss: 6.4856, Val Entropy: 9.3567, Val Perplexity: 655.61
- Test Metrics: Test Loss: 6.6741, Test Entropy: 9.6287, Test Perplexity: 791.63
- Tiny Shakespeare Metrics: Test Loss: 7.4938, Test Entropy: 10.8113, Test Perplexity: 1796.85

## Weight sharing + direct
- `embedding_dim`: 32
- `window_size`: 4
- Train Metrics: Train Loss: 5.5120, Train Entropy: 7.9521, Train Perplexity: 247.64
- Val Metrics: Val Loss: 6.6535, Val Entropy: 9.5990, Val Perplexity: 775.52
- Test Metrics: Test Loss: 6.8656, Test Entropy: 9.9050, Test Perplexity: 958.76
- Tiny Shakespeare Metrics: Test Loss: 7.6186, Test Entropy: 10.9913, Test Perplexity: 2035.64

## Weight sharing + direct (best model)
- `embedding_dim`: 128
- `window_size`: 16
- Train Metrics: Train Loss: 4.7928, Train Entropy: 6.9146, Train Perplexity: 120.64
- Val Metrics: Val Loss: 6.6115, Val Entropy: 9.5384, Val Perplexity: 743.59
- Test Metrics: Test Loss: 6.8086, Test Entropy: 9.8228, Test Perplexity: 905.62
- Tiny Shakespeare Metrics: Test Loss: 7.4555, Test Entropy: 10.7560, Test Perplexity: 1729.38

# Implement Neural Probabilistic LM (Bengio et. al 2003)

Implement the classic Neural LM paper using torch trained on 800K FineWeb Edu Tokens. We will implement more things like direct networks, weight tying to this model and report Per token NLL, Entropy and Perplexity on our test data

In [2]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import tiktoken
from sklearn.model_selection import train_test_split
from collections import Counter
import heapq
import random
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
%matplotlib inline
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f8fe89c6b50>

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# 1. Read Data

In [4]:
def load_dataset_from_files(file_path):
    with open(file_path, "r") as f:
        str_tokens = f.read().splitlines()
        tokens = [int(token) for token in str_tokens]

    return tokens

In [5]:
train_tokens = load_dataset_from_files("train_tokens.txt")
val_tokens = load_dataset_from_files("val_tokens.txt")
test_tokens = load_dataset_from_files("test_tokens.txt")
ts_tokens = load_dataset_from_files("ts_tokens.txt")

In [6]:
len(train_tokens), len(test_tokens), len(val_tokens), len(ts_tokens)

(800258, 100033, 100032, 338025)

# 2. Helper Functions

In [7]:
def prepare_dataset(tokens, context_window_size):
    x, y = [], []

    for i in range(len(tokens) - context_window_size):
        x.append(tokens[i : i + context_window_size])
        y.append(tokens[i + context_window_size])

    x = torch.LongTensor(x)
    y = torch.LongTensor(y)

    return x, y

In [8]:
def get_metrics(model, tokens, context_window_size):
    x_tensor, y_tensor = prepare_dataset(tokens, context_window_size)
    dataset = TensorDataset(x_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=4096, shuffle=True, drop_last=False)
    criterion = nn.CrossEntropyLoss()

    # get initial metrics
    model.eval()
    tmp_loss = 0.0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device) # (B, T), (B, )
        logits = model(x)
        loss = criterion(logits, y)
        tmp_loss += loss.item() * (x.shape[0] / x_tensor.shape[0])

    perplexity = float(np.exp(tmp_loss))
    entropy = float(np.log2(perplexity))

    return tmp_loss, entropy, perplexity

In [9]:
def train(model, train_tokens, val_tokens, batch_size, num_epochs, lr, context_window_size):
    x_train, y_train = prepare_dataset(train_tokens, context_window_size)
    x_val, y_val = prepare_dataset(val_tokens, context_window_size)

    train_dataset = TensorDataset(x_train, y_train)
    val_dataset = TensorDataset(x_val, y_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=False)

    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    metrics = {"NLL": [], "Entropy": [], "Perplexity": []}

    # get initial metrics
    model.eval()

    train_loss = 0.0
    for x, y in train_dataloader:
        x, y = x.to(device), y.to(device) # (B, T), (B, )
        logits = model(x)
        loss = criterion(logits, y)
        train_loss += loss.item() * (x.shape[0] / x_train.shape[0])

    train_perplexity = float(np.exp(train_loss))
    train_entropy = float(np.log2(train_perplexity))

    # eval
    model.eval()
    val_loss = 0.0
    for x, y in val_dataloader:
        x, y = x.to(device), y.to(device) # (B, T), (B, )
        logits = model(x)
        loss = criterion(logits, y)
        val_loss += loss.item() * (x.shape[0] / x_val.shape[0])

    val_perplexity = float(np.exp(val_loss))
    val_entropy = float(np.log2(val_perplexity))

    metrics["NLL"].append((train_loss, val_loss))
    metrics["Entropy"].append((train_entropy, val_entropy))
    metrics["Perplexity"].append((train_perplexity, val_perplexity))

    print(f"Start of training: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


    # now start training

    for epoch in range(num_epochs):
        # train loop
        model.train()
        train_loss = 0.0
        for x, y in train_dataloader:
            x, y = x.to(device), y.to(device) # (B, T), (B, )
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * (x.shape[0] / x_train.shape[0])

        train_perplexity = float(np.exp(train_loss))
        train_entropy = float(np.log2(train_perplexity))

        # eval
        model.eval()
        val_loss = 0.0
        for x, y in val_dataloader:
            x, y = x.to(device), y.to(device) # (B, T), (B, )
            logits = model(x)
            loss = criterion(logits, y)
            val_loss += loss.item() * (x.shape[0] / x_val.shape[0])

        val_perplexity = float(np.exp(val_loss))
        val_entropy = float(np.log2(val_perplexity))

        metrics["NLL"].append((train_loss, val_loss))
        metrics["Entropy"].append((train_entropy, val_entropy))
        metrics["Perplexity"].append((train_perplexity, val_perplexity))

        print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    return metrics

# 3. Write Model

## 3.1 Standard Model descibed in paper

No weight tying yet, will add options to have direct connections.

**Paper formula**: $y = b + Wx + U \text{tanh}(d + Hx)$, here $y$ are the logits.

In [74]:
class NeuralLM(nn.Module):
    def __init__(self, vocab_size, context_window_size, embedding_dim=32, direct=False):
        super().__init__()
        self.vocab_size = vocab_size
        self.context_window_size = context_window_size
        self.direct = direct

        # model params
        self.C = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * context_window_size, embedding_dim)
        self.linear2 = nn.Linear(embedding_dim, vocab_size)
        self.W = None

        if direct:
            self.W = nn.Parameter(torch.zeros(embedding_dim * context_window_size, vocab_size))

        # initialize by Xaview init
        nn.init.xavier_uniform_(self.C.weight)
        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.zeros_(self.linear1.bias)
        nn.init.xavier_uniform_(self.linear2.weight)
        nn.init.zeros_(self.linear2.bias)
        if direct:
            nn.init.xavier_uniform_(self.W.data)

    def forward(self, x):
        # x shape: (B, context_window)
        embeddings = self.C(x) # (B, T, C)
        B, T, C = embeddings.shape
        model_inp = embeddings.view(B, T * C)
        hidden = self.linear1(model_inp) # (B, C)
        hidden = torch.tanh(hidden)
        logits = self.linear2(hidden) # (B, V)
        if self.direct:
            logits += model_inp @ self.W # (B, T*C) @ (T*C, V)

        return logits

In [9]:
vocab_size = max(train_tokens) + 1
vocab_size

50257

### Exp.1: Small model and window size (no direct)

- `embedding_dim`: 32
- `window_size`: 4

In [10]:
embedding_dim = 32
context_window_size = 4

In [15]:
model = NeuralLM(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim)

In [16]:
model.to(device)

NeuralLM(
  (C): Embedding(50257, 32)
  (linear1): Linear(in_features=128, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=50257, bias=True)
)

In [17]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 3270833


In [18]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=20, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 10.8249, Val Loss: 10.8249
Epoch 1: Train Loss: 8.6384, Val Loss: 7.6631
Epoch 2: Train Loss: 7.4912, Val Loss: 7.6584
Epoch 3: Train Loss: 7.4666, Val Loss: 7.6405
Epoch 4: Train Loss: 7.4077, Val Loss: 7.5621
Epoch 5: Train Loss: 7.3126, Val Loss: 7.4816
Epoch 6: Train Loss: 7.1956, Val Loss: 7.3901
Epoch 7: Train Loss: 7.0675, Val Loss: 7.2873
Epoch 8: Train Loss: 6.9275, Val Loss: 7.1791
Epoch 9: Train Loss: 6.7886, Val Loss: 7.0793
Epoch 10: Train Loss: 6.6435, Val Loss: 6.9818
Epoch 11: Train Loss: 6.5038, Val Loss: 6.9001
Epoch 12: Train Loss: 6.3709, Val Loss: 6.8273
Epoch 13: Train Loss: 6.2514, Val Loss: 6.7712
Epoch 14: Train Loss: 6.1477, Val Loss: 6.7324
Epoch 15: Train Loss: 6.0582, Val Loss: 6.7057
Epoch 16: Train Loss: 5.9794, Val Loss: 6.6854
Epoch 17: Train Loss: 5.9077, Val Loss: 6.6740
Epoch 18: Train Loss: 5.8416, Val Loss: 6.6658
Epoch 19: Train Loss: 5.7797, Val Loss: 6.6609
Epoch 20: Train Loss: 5.7215, Val Loss: 6.6586


In [22]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.7215, Train Entropy: 8.2544, Train Perplexity: 305.37
Val Metrics: Val Loss: 6.6586, Val Entropy: 9.6063, Val Perplexity: 779.44


### Exp.2: Larger model and window size (no direct)

- `embedding_dim`: 64
- `window_size`: 8

In [23]:
embedding_dim = 64
context_window_size = 8

In [33]:
model = NeuralLM(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim)

In [34]:
model.to(device)

NeuralLM(
  (C): Embedding(50257, 64)
  (linear1): Linear(in_features=512, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=50257, bias=True)
)

In [35]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 6515985


In [37]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=5, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 5.8336, Val Loss: 6.6895
Epoch 1: Train Loss: 5.8827, Val Loss: 6.7172
Epoch 2: Train Loss: 5.7449, Val Loss: 6.7143
Epoch 3: Train Loss: 5.6321, Val Loss: 6.7139
Epoch 4: Train Loss: 5.5250, Val Loss: 6.7218
Epoch 5: Train Loss: 5.4229, Val Loss: 6.7390


In [38]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.4229, Train Entropy: 7.8236, Train Perplexity: 226.54
Val Metrics: Val Loss: 6.7390, Val Entropy: 9.7223, Val Perplexity: 844.69


### Exp.3: Largest model and window size (no direct)

- `embedding_dim`: 128
- `window_size`: 16

In [39]:
embedding_dim = 128
context_window_size = 16

In [40]:
model = NeuralLM(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim)

In [41]:
model.to(device)

NeuralLM(
  (C): Embedding(50257, 128)
  (linear1): Linear(in_features=2048, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=50257, bias=True)
)

In [42]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 13178321


In [43]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=10, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 10.8249, Val Loss: 10.8249
Epoch 1: Train Loss: 7.9763, Val Loss: 7.6983
Epoch 2: Train Loss: 7.4525, Val Loss: 7.4868
Epoch 3: Train Loss: 7.1273, Val Loss: 7.2155
Epoch 4: Train Loss: 6.7672, Val Loss: 6.9607
Epoch 5: Train Loss: 6.4054, Val Loss: 6.7589
Epoch 6: Train Loss: 6.0837, Val Loss: 6.6781
Epoch 7: Train Loss: 5.8200, Val Loss: 6.6634
Epoch 8: Train Loss: 5.5889, Val Loss: 6.6906
Epoch 9: Train Loss: 5.3728, Val Loss: 6.7393
Epoch 10: Train Loss: 5.1656, Val Loss: 6.8131


In [45]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.1656, Train Entropy: 7.4524, Train Perplexity: 175.15
Val Metrics: Val Loss: 6.8131, Val Entropy: 9.8292, Val Perplexity: 909.64


### Exp.4: Small model and window size with direct

- `embedding_dim`: 32
- `window_size`: 4

In [46]:
embedding_dim = 32
context_window_size = 4

In [86]:
model = NeuralLM(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [87]:
model.to(device)

NeuralLM(
  (C): Embedding(50257, 32)
  (linear1): Linear(in_features=128, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=50257, bias=True)
)

In [88]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 9703729


In [89]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=10, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 10.8249, Val Loss: 10.8249
Epoch 1: Train Loss: 8.5126, Val Loss: 7.6690
Epoch 2: Train Loss: 7.4406, Val Loss: 7.5469
Epoch 3: Train Loss: 7.1953, Val Loss: 7.2398
Epoch 4: Train Loss: 6.8058, Val Loss: 6.9385
Epoch 5: Train Loss: 6.4589, Val Loss: 6.7549
Epoch 6: Train Loss: 6.1971, Val Loss: 6.6487
Epoch 7: Train Loss: 5.9868, Val Loss: 6.5780
Epoch 8: Train Loss: 5.8043, Val Loss: 6.5305
Epoch 9: Train Loss: 5.6416, Val Loss: 6.4999
Epoch 10: Train Loss: 5.4947, Val Loss: 6.4856


In [91]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.4947, Train Entropy: 7.9272, Train Perplexity: 243.41
Val Metrics: Val Loss: 6.4856, Val Entropy: 9.3567, Val Perplexity: 655.61


### Exp.5: Larger model and window size with direct

- `embedding_dim`: 64
- `window_size`: 8

In [97]:
embedding_dim = 64
context_window_size = 8

In [98]:
model = NeuralLM(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [99]:
model.to(device)

NeuralLM(
  (C): Embedding(50257, 64)
  (linear1): Linear(in_features=512, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=50257, bias=True)
)

In [100]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 32247569


In [101]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=5, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 10.8249, Val Loss: 10.8249
Epoch 1: Train Loss: 8.1273, Val Loss: 7.6341
Epoch 2: Train Loss: 7.2408, Val Loss: 7.1756
Epoch 3: Train Loss: 6.6422, Val Loss: 6.7916
Epoch 4: Train Loss: 6.1152, Val Loss: 6.5969
Epoch 5: Train Loss: 5.6824, Val Loss: 6.5117


In [102]:
model.W.grad.shape

torch.Size([512, 50257])

In [104]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.6824, Train Entropy: 8.1979, Train Perplexity: 293.64
Val Metrics: Val Loss: 6.5117, Val Entropy: 9.3944, Val Perplexity: 672.98


### Exp.6: Largest model and window size with direct

- `embedding_dim`: 128
- `window_size`: 16

In [114]:
embedding_dim = 128
context_window_size = 16

In [115]:
model = NeuralLM(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [116]:
model.to(device)

NeuralLM(
  (C): Embedding(50257, 128)
  (linear1): Linear(in_features=2048, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=50257, bias=True)
)

In [117]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 116104657


In [119]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=5, lr=2e-4, context_window_size=context_window_size)

Start of training: Train Loss: 5.6615, Val Loss: 6.6814
Epoch 1: Train Loss: 5.6508, Val Loss: 6.6730
Epoch 2: Train Loss: 5.4276, Val Loss: 6.6537
Epoch 3: Train Loss: 5.2219, Val Loss: 6.6488
Epoch 4: Train Loss: 5.0183, Val Loss: 6.6556
Epoch 5: Train Loss: 4.8171, Val Loss: 6.6736


In [120]:
model.W.grad.shape

torch.Size([2048, 50257])

In [121]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 4.8171, Train Entropy: 6.9496, Train Perplexity: 123.61
Val Metrics: Val Loss: 6.6736, Val Entropy: 9.6279, Val Perplexity: 791.20


### Final: Best Model Test and Tiny Shakespeare metrics

In [122]:
embedding_dim = 32
context_window_size = 4

In [123]:
model = NeuralLM(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [124]:
model.to(device)

NeuralLM(
  (C): Embedding(50257, 32)
  (linear1): Linear(in_features=128, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=50257, bias=True)
)

In [125]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 9703729


In [126]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=10, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 10.8249, Val Loss: 10.8249
Epoch 1: Train Loss: 8.5179, Val Loss: 7.6688
Epoch 2: Train Loss: 7.4371, Val Loss: 7.5399
Epoch 3: Train Loss: 7.1770, Val Loss: 7.2171
Epoch 4: Train Loss: 6.7887, Val Loss: 6.9337
Epoch 5: Train Loss: 6.4530, Val Loss: 6.7488
Epoch 6: Train Loss: 6.1891, Val Loss: 6.6372
Epoch 7: Train Loss: 5.9773, Val Loss: 6.5692
Epoch 8: Train Loss: 5.7960, Val Loss: 6.5225
Epoch 9: Train Loss: 5.6355, Val Loss: 6.4957
Epoch 10: Train Loss: 5.4919, Val Loss: 6.4812


In [133]:
train_loss, train_entropy, train_perplexity = get_metrics(model, train_tokens, context_window_size)
print(f'Train Metrics: Train Loss: {train_loss:.4f}, Train Entropy: {train_entropy:.4f}, Train Perplexity: {train_perplexity:.2f}')

Train Metrics: Train Loss: 5.3550, Train Entropy: 7.7256, Train Perplexity: 211.66


In [134]:
val_loss, val_entropy, val_perplexity = get_metrics(model, val_tokens, context_window_size)
print(f'Val Metrics: Val Loss: {val_loss:.4f}, Val Entropy: {val_entropy:.4f}, Val Perplexity: {val_perplexity:.2f}')

Val Metrics: Val Loss: 6.4812, Val Entropy: 9.3504, Val Perplexity: 652.73


In [135]:
test_loss, test_entropy, test_perplexity = get_metrics(model, test_tokens, context_window_size)
print(f'Test Metrics: Test Loss: {test_loss:.4f}, Test Entropy: {test_entropy:.4f}, Test Perplexity: {test_perplexity:.2f}')

Test Metrics: Test Loss: 6.6741, Test Entropy: 9.6287, Test Perplexity: 791.63


In [136]:
ts_loss, ts_entropy, ts_perplexity = get_metrics(model, ts_tokens, context_window_size)
print(f'Tiny Shakespeare Metrics: Test Loss: {ts_loss:.4f}, Test Entropy: {ts_entropy:.4f}, Test Perplexity: {ts_perplexity:.2f}')

Tiny Shakespeare Metrics: Test Loss: 7.4938, Test Entropy: 10.8113, Test Perplexity: 1796.85


## 3.2 Weight tying

Implement weight tying where output layer embeddings and input layer embedding matrix are forecefully tied

**Paper formula**: $y = b + Wx + U \text{tanh}(d + Hx)$, here $y$ are the logits. 

In weight tying, we **force** $C = U$ where $C$ is input embedding matrix and $U$ is the output matrix

In [10]:
class NeuralLMWeightTied(nn.Module):
    def __init__(self, vocab_size, context_window_size, embedding_dim=32, direct=False):
        super().__init__()
        self.vocab_size = vocab_size
        self.context_window_size = context_window_size
        self.direct = direct

        # model params
        self.C = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * context_window_size, embedding_dim)
        self.linear2 = nn.Linear(embedding_dim, vocab_size)
        self.W = None

        if direct:
            self.W = nn.Parameter(torch.zeros(embedding_dim * context_window_size, vocab_size))

        # initialize by Xaview init
        nn.init.xavier_uniform_(self.C.weight)
        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.zeros_(self.linear1.bias)
        # force weight tying
        self.C.weight = self.linear2.weight
        nn.init.zeros_(self.linear2.bias)
        if direct:
            nn.init.xavier_uniform_(self.W.data)


    def forward(self, x):
        # x shape: (B, context_window)
        embeddings = self.C(x) # (B, T, C)
        B, T, C = embeddings.shape
        model_inp = embeddings.view(B, T * C)
        hidden = self.linear1(model_inp) # (B, C)
        hidden = torch.tanh(hidden)
        logits = self.linear2(hidden) # (B, V)
        if self.direct:
            logits += model_inp @ self.W # (B, T*C) @ (T*C, V)

        return logits

In [11]:
vocab_size = max(train_tokens) + 1
vocab_size

50257

### Exp.1: Small model and window size with direct

- `embedding_dim`: 32
- `window_size`: 4

In [38]:
embedding_dim = 32
context_window_size = 4

In [39]:
model = NeuralLMWeightTied(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [40]:
model.to(device)

NeuralLMWeightTied(
  (C): Embedding(50257, 32)
  (linear1): Linear(in_features=128, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=50257, bias=True)
)

In [41]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 8095505


In [42]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=10, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 10.8258, Val Loss: 10.8256
Epoch 1: Train Loss: 8.6739, Val Loss: 7.7258
Epoch 2: Train Loss: 7.3059, Val Loss: 7.4537
Epoch 3: Train Loss: 6.9948, Val Loss: 7.2189
Epoch 4: Train Loss: 6.7170, Val Loss: 7.0339
Epoch 5: Train Loss: 6.4672, Val Loss: 6.8960
Epoch 6: Train Loss: 6.2485, Val Loss: 6.7964
Epoch 7: Train Loss: 6.0591, Val Loss: 6.7305
Epoch 8: Train Loss: 5.8936, Val Loss: 6.6876
Epoch 9: Train Loss: 5.7464, Val Loss: 6.6624
Epoch 10: Train Loss: 5.6138, Val Loss: 6.6485


In [45]:
(model.C.weight.grad == model.linear2.weight.grad).all()

tensor(True, device='cuda:0')

In [46]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.6138, Train Entropy: 8.0990, Train Perplexity: 274.18
Val Metrics: Val Loss: 6.6485, Val Entropy: 9.5918, Val Perplexity: 771.64


### Exp.2: Larger model and window size with direct

- `embedding_dim`: 64
- `window_size`: 8

In [75]:
embedding_dim = 64
context_window_size = 8

In [76]:
model = NeuralLMWeightTied(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [77]:
model.to(device)

NeuralLMWeightTied(
  (C): Embedding(50257, 64)
  (linear1): Linear(in_features=512, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=50257, bias=True)
)

In [78]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 29031121


In [79]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=6, lr=1e-3, context_window_size=context_window_size)

Start of training: Train Loss: 10.8268, Val Loss: 10.8268
Epoch 1: Train Loss: 8.1433, Val Loss: 7.5364
Epoch 2: Train Loss: 7.0274, Val Loss: 7.1692
Epoch 3: Train Loss: 6.5482, Val Loss: 6.8930
Epoch 4: Train Loss: 6.1091, Val Loss: 6.7206
Epoch 5: Train Loss: 5.7242, Val Loss: 6.6360
Epoch 6: Train Loss: 5.3760, Val Loss: 6.6088


In [80]:
(model.C.weight.grad == model.linear2.weight.grad).all()

tensor(True, device='cuda:0')

In [81]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.3760, Train Entropy: 7.7559, Train Perplexity: 216.16
Val Metrics: Val Loss: 6.6088, Val Entropy: 9.5344, Val Perplexity: 741.57


### Exp.3: Largest model and window size with direct

- `embedding_dim`: 128
- `window_size`: 16

In [17]:
embedding_dim = 128
context_window_size = 16

In [18]:
model = NeuralLMWeightTied(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [19]:
model.to(device)

NeuralLMWeightTied(
  (C): Embedding(50257, 128)
  (linear1): Linear(in_features=2048, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=50257, bias=True)
)

In [20]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 109671761


In [24]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=2, lr=5e-4, context_window_size=context_window_size)

Start of training: Train Loss: 4.7593, Val Loss: 6.5907
Epoch 1: Train Loss: 4.7574, Val Loss: 6.6580
Epoch 2: Train Loss: 4.3320, Val Loss: 6.7204


In [22]:
(model.C.weight.grad == model.linear2.weight.grad).all()

tensor(True, device='cuda:0')

In [23]:
print(f'Train Metrics: Train Loss: {metrics1["NLL"][-1][0]:.4f}, Train Entropy: {metrics1["Entropy"][-1][0]:.4f}, Train Perplexity: {metrics1["Perplexity"][-1][0]:.2f}')
print(f'Val Metrics: Val Loss: {metrics1["NLL"][-1][1]:.4f}, Val Entropy: {metrics1["Entropy"][-1][1]:.4f}, Val Perplexity: {metrics1["Perplexity"][-1][1]:.2f}')

Train Metrics: Train Loss: 5.1537, Train Entropy: 7.4353, Train Perplexity: 173.08
Val Metrics: Val Loss: 6.5907, Val Entropy: 9.5084, Val Perplexity: 728.28


### Final: Best Model Test and Tiny Shakespeare metrics

In [43]:
embedding_dim = 128
context_window_size = 16

In [44]:
model = NeuralLMWeightTied(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, direct=True)

In [45]:
model.to(device)

NeuralLMWeightTied(
  (C): Embedding(50257, 128)
  (linear1): Linear(in_features=2048, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=50257, bias=True)
)

In [46]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 109671761


In [47]:
metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=6, lr=5e-4, context_window_size=context_window_size)

Start of training: Train Loss: 10.8253, Val Loss: 10.8257
Epoch 1: Train Loss: 8.1234, Val Loss: 7.5995
Epoch 2: Train Loss: 7.1193, Val Loss: 7.2566
Epoch 3: Train Loss: 6.6202, Val Loss: 6.9626
Epoch 4: Train Loss: 6.1236, Val Loss: 6.7611
Epoch 5: Train Loss: 5.6507, Val Loss: 6.6532
Epoch 6: Train Loss: 5.1878, Val Loss: 6.6115


In [48]:
(model.C.weight.grad == model.linear2.weight.grad).all()

tensor(True, device='cuda:0')

In [49]:
train_loss, train_entropy, train_perplexity = get_metrics(model, train_tokens, context_window_size)
print(f'Train Metrics: Train Loss: {train_loss:.4f}, Train Entropy: {train_entropy:.4f}, Train Perplexity: {train_perplexity:.2f}')

Train Metrics: Train Loss: 4.7928, Train Entropy: 6.9146, Train Perplexity: 120.64


In [50]:
val_loss, val_entropy, val_perplexity = get_metrics(model, val_tokens, context_window_size)
print(f'Val Metrics: Val Loss: {val_loss:.4f}, Val Entropy: {val_entropy:.4f}, Val Perplexity: {val_perplexity:.2f}')

Val Metrics: Val Loss: 6.6115, Val Entropy: 9.5384, Val Perplexity: 743.59


In [51]:
test_loss, test_entropy, test_perplexity = get_metrics(model, test_tokens, context_window_size)
print(f'Test Metrics: Test Loss: {test_loss:.4f}, Test Entropy: {test_entropy:.4f}, Test Perplexity: {test_perplexity:.2f}')

Test Metrics: Test Loss: 6.8086, Test Entropy: 9.8228, Test Perplexity: 905.62


In [52]:
ts_loss, ts_entropy, ts_perplexity = get_metrics(model, ts_tokens, context_window_size)
print(f'Tiny Shakespeare Metrics: Test Loss: {ts_loss:.4f}, Test Entropy: {ts_entropy:.4f}, Test Perplexity: {ts_perplexity:.2f}')

Tiny Shakespeare Metrics: Test Loss: 7.4555, Test Entropy: 10.7560, Test Perplexity: 1729.38


# Fin: Generate text samples from model

In [100]:
gpt2_tokenizer = tiktoken.get_encoding("gpt2")

In [91]:
def generate_text(model, context_window_size, seq_len=1000, num_iters=5):
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    model.eval()
    pad_id = 198                              # newline 'Ċ'
    tokens = torch.full((num_iters, context_window_size),
                        pad_id,
                        dtype=torch.long)

    for i in range(seq_len):
        inp_tokens = tokens[:, -context_window_size:] # (B, T)
        inp_tokens = inp_tokens.to(device)
        logits = model(inp_tokens).detach().cpu() # (B, V)
        probs = F.softmax(logits, dim=1) # (B, V)
        chosen_tokens = torch.multinomial(probs, num_samples=1)
        tokens = torch.cat([tokens, chosen_tokens], dim=1)

    generated = tokens[:, context_window_size:]
    text = gpt2_tokenizer.decode_batch(generated.numpy())
    return text

In [96]:
generated_text = generate_text(model, context_window_size=context_window_size, seq_len=100)

In [97]:
from IPython.display import display, Markdown

In [99]:
display(Markdown(generated_text[3]))

F: A?
 8
May Man �
 slices? What are choosing a huge environment:
Some theorying Pre? Well:
Studies, I don't be
the Cush pathogens with integeritude, and Limited, offering reducing prominence math types by removing theWacies for track scan. 12 schoolAT simulations, test. incorporated Fiction or Alpha suite is if you stand, and provide mainstream products, students, sufficient these level are this to ensure quit.
How You campaign advice embody