# CNNs for Language Modelling

This notebook explores the use of Convolutional Neural Nets (CNNs) for Language Modelling. This extends the Bengio et. al. (2003) paper by adding conv nets. 

**Reference Paper**: [Convolutional Neural Network Language Models](https://aclanthology.org/D16-1123.pdf)

In [1]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import tiktoken
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import _LRScheduler 
from torch.nn.utils import clip_grad_norm_ 
import random
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
%matplotlib inline
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f79782829b0>

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# 1. Read Data

In [3]:
def load_dataset_from_files(file_path):
    with open(file_path, "r") as f:
        str_tokens = f.read().splitlines()
        tokens = [int(token) for token in str_tokens]

    return tokens

In [4]:
train_tokens = load_dataset_from_files("train_tokens.txt")
val_tokens = load_dataset_from_files("val_tokens.txt")
test_tokens = load_dataset_from_files("test_tokens.txt")
ts_tokens = load_dataset_from_files("ts_tokens.txt")

In [5]:
len(train_tokens), len(test_tokens), len(val_tokens), len(ts_tokens)

(800258, 100033, 100032, 338025)

# 2. Helper Functions

In [6]:
gpt2_tokenizer = tiktoken.get_encoding("gpt2")

In [7]:
def prepare_dataset(tokens, context_window_size):
    x, y = [], []

    for i in range(len(tokens) - context_window_size):
        x.append(tokens[i : i + context_window_size])
        y.append(tokens[i + context_window_size])

    x = torch.LongTensor(x)
    y = torch.LongTensor(y)

    return x, y

In [8]:
def get_metrics(model, tokens, context_window_size):
    x_tensor, y_tensor = prepare_dataset(tokens, context_window_size)
    dataset = TensorDataset(x_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=4096, shuffle=True, drop_last=False)
    criterion = nn.CrossEntropyLoss()

    # get initial metrics
    model.eval()
    tmp_loss = 0.0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device) # (B, T), (B, )
        logits = model(x)
        loss = criterion(logits, y)
        tmp_loss += loss.item() * (x.shape[0] / x_tensor.shape[0])

    perplexity = float(np.exp(tmp_loss))
    entropy = float(np.log2(perplexity))

    return tmp_loss, entropy, perplexity

In [9]:
def cosine_scheduler(it, min_lr, max_lr, warmup_steps, max_steps, base_lr):
    if it < warmup_steps:
        lr = max_lr * ((it + 1) / warmup_steps)
    elif it > max_steps:
        lr = min_lr
    else:
        decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
        assert 0 <= decay_ratio <= 1
        coeff = 0.5 * (1 + np.cos(decay_ratio * np.pi)) # starts with 1, ends at 0
        lr = min_lr + coeff * (max_lr - min_lr)

    return lr / base_lr


In [10]:
def train(model, train_tokens, val_tokens, batch_size, num_epochs, lr, context_window_size, base_lr, max_lr, min_lr, warmup_steps, max_steps):
    x_train, y_train = prepare_dataset(train_tokens, context_window_size)
    x_val, y_val = prepare_dataset(val_tokens, context_window_size)

    train_dataset = TensorDataset(x_train, y_train)
    val_dataset = TensorDataset(x_val, y_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=False)

    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: cosine_scheduler(epoch, min_lr=min_lr, max_lr=max_lr, warmup_steps=warmup_steps, max_steps=max_steps, base_lr=base_lr))
    # scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=lr * 0.10)
    criterion = nn.CrossEntropyLoss()

    metrics = {"NLL": [], "Entropy": [], "Perplexity": []}

    # get initial metrics
    model.eval()

    train_loss = 0.0
    for x, y in train_dataloader:
        x, y = x.to(device), y.to(device) # (B, T), (B, )
        logits = model(x)
        loss = criterion(logits, y)
        train_loss += loss.item() * (x.shape[0] / x_train.shape[0])

    train_perplexity = float(np.exp(train_loss))
    train_entropy = float(np.log2(train_perplexity))

    # eval
    model.eval()
    val_loss = 0.0
    for x, y in val_dataloader:
        x, y = x.to(device), y.to(device) # (B, T), (B, )
        logits = model(x)
        loss = criterion(logits, y)
        val_loss += loss.item() * (x.shape[0] / x_val.shape[0])

    val_perplexity = float(np.exp(val_loss))
    val_entropy = float(np.log2(val_perplexity))

    metrics["NLL"].append((train_loss, val_loss))
    metrics["Entropy"].append((train_entropy, val_entropy))
    metrics["Perplexity"].append((train_perplexity, val_perplexity))

    print(f"Start of training: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


    # now start training

    for epoch in range(num_epochs):
        # train loop
        model.train()
        train_loss = 0.0
        for x, y in train_dataloader:
            x, y = x.to(device), y.to(device) # (B, T), (B, )
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            train_loss += loss.item() * (x.shape[0] / x_train.shape[0])

        train_perplexity = float(np.exp(train_loss))
        train_entropy = float(np.log2(train_perplexity))

        # eval
        model.eval()
        val_loss = 0.0
        for x, y in val_dataloader:
            x, y = x.to(device), y.to(device) # (B, T), (B, )
            logits = model(x)
            loss = criterion(logits, y)
            val_loss += loss.item() * (x.shape[0] / x_val.shape[0])

        val_perplexity = float(np.exp(val_loss))
        val_entropy = float(np.log2(val_perplexity))

        metrics["NLL"].append((train_loss, val_loss))
        metrics["Entropy"].append((train_entropy, val_entropy))
        metrics["Perplexity"].append((train_perplexity, val_perplexity))

        print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    return metrics

In [11]:
def get_metrics(model, tokens, context_window_size):
    x_tensor, y_tensor = prepare_dataset(tokens, context_window_size)
    dataset = TensorDataset(x_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=4096, shuffle=True, drop_last=False)
    criterion = nn.CrossEntropyLoss()

    # get initial metrics
    model.eval()
    tmp_loss = 0.0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device) # (B, T), (B, )
        logits = model(x)
        loss = criterion(logits, y)
        tmp_loss += loss.item() * (x.shape[0] / x_tensor.shape[0])

    perplexity = float(np.exp(tmp_loss))
    entropy = float(np.log2(perplexity))

    return tmp_loss, entropy, perplexity

In [12]:
def generate_text(model, context_window_size, seq_len=1000, num_iters=5):
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    model.eval()
    pad_id = 198                              # newline 'Ċ'
    tokens = torch.full((num_iters, context_window_size),
                        pad_id,
                        dtype=torch.long)

    for i in range(seq_len):
        inp_tokens = tokens[:, -context_window_size:] # (B, T)
        inp_tokens = inp_tokens.to(device)
        logits = model(inp_tokens).detach().cpu() # (B, V)
        probs = F.softmax(logits, dim=1) # (B, V)
        chosen_tokens = torch.multinomial(probs, num_samples=1)
        tokens = torch.cat([tokens, chosen_tokens], dim=1)

    generated = tokens[:, context_window_size:]
    text = gpt2_tokenizer.decode_batch(generated.numpy())
    return text

# 3.1: Bengio Paper with Highway Networks

As said in the paper, they enrich the model with highway networks. So we see if this improves our metrics

**Highway Networks**: [Highway Networks](https://arxiv.org/pdf/1505.00387)

In [15]:
class HighwayNetworks(nn.Module):
    def __init__(self, embedding_dim, num_layers):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.transformed_signal_network = nn.ModuleList([
            nn.Linear(embedding_dim, embedding_dim) for _ in range(num_layers)
        ])
        self.transformed_signal_bn = nn.ModuleList([
            nn.BatchNorm1d(embedding_dim) for _ in range(num_layers)
        ])
        self.transform_gate_network = nn.ModuleList([
            nn.Linear(embedding_dim, embedding_dim) for _ in range(num_layers)
        ])
        self.transform_gate_bn = nn.ModuleList([
            nn.BatchNorm1d(embedding_dim) for _ in range(num_layers)
        ])

        for net in self.transformed_signal_network:
            nn.init.xavier_normal_(net.weight)
            nn.init.zeros_(net.bias)
        for net in self.transform_gate_network:
            nn.init.xavier_normal_(net.weight)
            nn.init.zeros_(net.bias)

    def forward(self, x):
        # x dim: (B, C)
        for net1, bn1, net2, bn2 in zip(self.transformed_signal_network, self.transformed_signal_bn, self.transform_gate_network, self.transform_gate_bn):
            H = net1(x)
            H = bn1(x)
            H = F.relu(H) # (B, T*C)
            T = net2(x) 
            T = bn2(x)
            T = F.sigmoid(T) # (B, T*C)
            x = T * H + (1 - T) * x

        return x

In [105]:
class BengioLMHighwayDropout(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_window_size, dropout=0.0, weight_tying=False):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_window_size = context_window_size

        self.embedding_lookup_table = nn.Embedding(vocab_size, embedding_dim)
        self.dropout1 = nn.Dropout(p=dropout)
        self.linear1 = nn.Linear(embedding_dim * context_window_size, embedding_dim)
        self.bn1 = nn.BatchNorm1d(embedding_dim)
        self.highway = HighwayNetworks(embedding_dim, num_layers=1)
        self.dropout2 = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(embedding_dim, vocab_size)

        # init params
        nn.init.xavier_normal_(self.embedding_lookup_table.weight)
        nn.init.xavier_normal_(self.linear1.weight)
        nn.init.zeros_(self.linear1.bias)
        nn.init.xavier_normal_(self.linear2.weight)
        nn.init.zeros_(self.linear2.bias)

        if weight_tying:
            self.embedding_lookup_table.weight = self.linear2.weight

    def forward(self, x):
        # x shape: (B, T)
        embeddings = self.embedding_lookup_table(x) # (B, T, C)
        embeddings = self.dropout1(embeddings)
        B, T, C = embeddings.shape
        embeddings = embeddings.view(B, T * C)

        h = self.linear1(embeddings) # (B, C)
        h = self.bn1(h)
        h = F.relu(h)
        h = self.highway(h) # (B, C)
        h = self.dropout2(h) # (B, C)

        logits = self.linear2(h) # (B, V)

        return logits

In [25]:
vocab_size = max(train_tokens) + 1
vocab_size

50257

## Exp. 1: No Dropout + smaller embedding_dim + no weight tie

- `embedding_dim`: 128
- `window_size`: 16

In [95]:
embedding_dim = 128
context_window_size = 16

In [96]:
model = BengioLMHighwayDropout(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim)

In [97]:
model.to(device)

BengioLMHighwayDropout(
  (embedding_lookup_table): Embedding(50257, 128)
  (dropout1): Dropout(p=0.0, inplace=False)
  (linear1): Linear(in_features=2048, out_features=128, bias=True)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (highway): HighwayNetworks(
    (transformed_signal_network): ModuleList(
      (0): Linear(in_features=128, out_features=128, bias=True)
    )
    (transformed_signal_bn): ModuleList(
      (0): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (transform_gate_network): ModuleList(
      (0): Linear(in_features=128, out_features=128, bias=True)
    )
    (transform_gate_bn): ModuleList(
      (0): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (dropout2): Dropout(p=0.0, inplace=False)
  (linear2): Linear(in_features=128, out_features=50257, bias=True)
)

In [98]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model: {total_params}")

Total parameters in model: 13212113


In [99]:
(len(train_tokens) / 4096) * 5

976.87744140625

In [100]:
base_lr = 1e-3
max_lr = 1e-3
min_lr = max_lr * 0.01
warmup_steps = 50
max_steps = 500

metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=5, lr=base_lr, context_window_size=context_window_size,\
                  base_lr=base_lr, max_lr=max_lr, min_lr=min_lr, warmup_steps=warmup_steps, max_steps=max_steps)

Start of training: Train Loss: 10.8249, Val Loss: 10.8249
Epoch 1: Train Loss: 8.0440, Val Loss: 6.8911
Epoch 2: Train Loss: 6.3218, Val Loss: 6.6711
Epoch 3: Train Loss: 5.9790, Val Loss: 6.6695
Epoch 4: Train Loss: 5.9473, Val Loss: 6.6697
Epoch 5: Train Loss: 5.9330, Val Loss: 6.6699


In [101]:
train_loss, train_entropy, train_perplexity = get_metrics(model, train_tokens, context_window_size)
print(f'Train Metrics: Train Loss: {train_loss:.4f}, Train Entropy: {train_entropy:.4f}, Train Perplexity: {train_perplexity:.2f}')

Train Metrics: Train Loss: 5.9225, Train Entropy: 8.5443, Train Perplexity: 373.33


In [102]:
val_loss, val_entropy, val_perplexity = get_metrics(model, val_tokens, context_window_size)
print(f'Val Metrics: Val Loss: {val_loss:.4f}, Val Entropy: {val_entropy:.4f}, Val Perplexity: {val_perplexity:.2f}')

Val Metrics: Val Loss: 6.6699, Val Entropy: 9.6226, Val Perplexity: 788.29


In [103]:
test_loss, test_entropy, test_perplexity = get_metrics(model, test_tokens, context_window_size)
print(f'Test Metrics: Test Loss: {test_loss:.4f}, Test Entropy: {test_entropy:.4f}, Test Perplexity: {test_perplexity:.2f}')

Test Metrics: Test Loss: 6.8268, Test Entropy: 9.8490, Test Perplexity: 922.22


In [104]:
ts_loss, ts_entropy, ts_perplexity = get_metrics(model, ts_tokens, context_window_size)
print(f'Tiny Shakespeare Metrics: Test Loss: {ts_loss:.4f}, Test Entropy: {ts_entropy:.4f}, Test Perplexity: {ts_perplexity:.2f}')

Tiny Shakespeare Metrics: Test Loss: 7.2134, Test Entropy: 10.4067, Test Perplexity: 1357.47


## Exp. 2: No Dropout + smaller embedding_dim + weight tie

- `embedding_dim`: 128
- `window_size`: 16

In [167]:
embedding_dim = 128
context_window_size = 16

In [168]:
model = BengioLMHighwayDropout(vocab_size=vocab_size, context_window_size=context_window_size, embedding_dim=embedding_dim, weight_tying=True)

In [169]:
model.to(device)

BengioLMHighwayDropout(
  (embedding_lookup_table): Embedding(50257, 128)
  (dropout1): Dropout(p=0.0, inplace=False)
  (linear1): Linear(in_features=2048, out_features=128, bias=True)
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (highway): HighwayNetworks(
    (transformed_signal_network): ModuleList(
      (0): Linear(in_features=128, out_features=128, bias=True)
    )
    (transformed_signal_bn): ModuleList(
      (0): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (transform_gate_network): ModuleList(
      (0): Linear(in_features=128, out_features=128, bias=True)
    )
    (transform_gate_bn): ModuleList(
      (0): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (dropout2): Dropout(p=0.0, inplace=False)
  (linear2): Linear(in_features=128, out_features=50257, bias=True)
)

In [170]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in model: {total_params}")

Total parameters in model: 6779217


In [171]:
base_lr = 2e-3
max_lr = 2e-3
min_lr = max_lr * 0.005
warmup_steps = 100
max_steps = 1000

metrics1 = train(model, train_tokens, val_tokens, batch_size=4096, num_epochs=3, lr=base_lr, context_window_size=context_window_size,\
                  base_lr=base_lr, max_lr=max_lr, min_lr=min_lr, warmup_steps=warmup_steps, max_steps=max_steps)

Start of training: Train Loss: 10.8249, Val Loss: 10.8249
Epoch 1: Train Loss: 8.1074, Val Loss: 6.8830
Epoch 2: Train Loss: 6.2607, Val Loss: 6.5313
Epoch 3: Train Loss: 5.6692, Val Loss: 6.4632


In [172]:
(model.embedding_lookup_table.weight == model.linear2.weight).all()

tensor(True, device='cuda:0')

In [173]:
train_loss, train_entropy, train_perplexity = get_metrics(model, train_tokens, context_window_size)
print(f'Train Metrics: Train Loss: {train_loss:.4f}, Train Entropy: {train_entropy:.4f}, Train Perplexity: {train_perplexity:.2f}')

Train Metrics: Train Loss: 5.2638, Train Entropy: 7.5941, Train Perplexity: 193.22


In [174]:
val_loss, val_entropy, val_perplexity = get_metrics(model, val_tokens, context_window_size)
print(f'Val Metrics: Val Loss: {val_loss:.4f}, Val Entropy: {val_entropy:.4f}, Val Perplexity: {val_perplexity:.2f}')

Val Metrics: Val Loss: 6.4632, Val Entropy: 9.3244, Val Perplexity: 641.10


In [175]:
test_loss, test_entropy, test_perplexity = get_metrics(model, test_tokens, context_window_size)
print(f'Test Metrics: Test Loss: {test_loss:.4f}, Test Entropy: {test_entropy:.4f}, Test Perplexity: {test_perplexity:.2f}')

Test Metrics: Test Loss: 6.6800, Test Entropy: 9.6372, Test Perplexity: 796.30


In [176]:
ts_loss, ts_entropy, ts_perplexity = get_metrics(model, ts_tokens, context_window_size)
print(f'Tiny Shakespeare Metrics: Test Loss: {ts_loss:.4f}, Test Entropy: {ts_entropy:.4f}, Test Perplexity: {ts_perplexity:.2f}')

Tiny Shakespeare Metrics: Test Loss: 7.2875, Test Entropy: 10.5136, Test Perplexity: 1461.91
