In [None]:
# Testing
'''
import torch
from torch import nn
import math

head_sz = 8
embed_sz = 32
token_unique_n = 8
block_sz = 4

head_sz = head_sz
embed_sz = embed_sz
head_n = embed_sz / head_sz
block_sz = block_sz

char_embed = nn.Embedding(token_unique_n, embed_sz)
pos_embed = nn.Embedding(token_unique_n, embed_sz)

q = nn.Linear(embed_sz, head_sz, bias = False)
k = nn.Linear(embed_sz, head_sz, bias = False)
v = nn.Linear(embed_sz, head_sz, bias = False)
softmax = nn.Softmax(dim = 1)
mask = torch.ones((block_sz, block_sz)).triu()

inp = torch.tensor([[2, 7, 0, 1], [1, 1, 1, 3], [7, 0, 0, 1], [5, 5, 0, 1], [0, 0, 2, 6]])

###### Forward

data = torch.stack([torch.stack([char_embed(i[j]) + pos_embed(torch.tensor(j)) for j in range(len(i))]) for i in inp])

query = q(data)
key = k(data)
value = v(data)

attention = softmax(query @ key.transpose(1,2) / math.sqrt(head_sz) * mask.repeat((len(inp), 1, 1))) @ value

'''

a = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
a = torch.tril(a) + torch.triu(torch.ones_like(a) * float('-inf'), diagonal = 1)

print(a)

tensor([[1., -inf, -inf],
        [4., 5., -inf],
        [7., 8., 9.]])


In [None]:
# Testing vanilla Transformer architecture for Autoregressive Language Generation

In [1]:
!pip install portalocker

Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0


In [2]:
import torch
from torchtext import datasets
from torch import nn
import random
import math
import numpy as np
import tqdm

In [3]:
# Architecture hyperparams
special_char = ['\t', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '_', '{', '|', '}', "\n"]

num_chars = 93
train_batch_sz = 32
macro_batch_sz = 8 # For Gradient Accumulation to train with larger batch sizes without overflowing memory
truncate_len = 512

head_sz = 32
head_n = 12
embed_sz = 324
layer_n = 12
fcnn_sz = 1536
# dropout_p = 0.3

cycles = 100000

# learning rate
min_lr = 6e-5
max_lr = 6e-4

# Epochs to train model for (each epoch loops through the corpus once)
epochs = 50

# Convert character to index
def char_index(x):
  if ord(x) < 91 and ord(x) > 64:
    return ord(x) - 65
  if ord(x) < 123 and ord(x) > 96:
    return ord(x) - 71
  return special_char.index(x) + 52

# Filter characters in input data for relevant characters
def keep_char(x):
  return (ord(x) < 91 and ord(x) > 64) or (ord(x) < 123 and ord(x) > 96) or x in special_char

# Convert index to character
def index_char(ind):
  if ind < 26:
    return chr(ind + 65)
  if ind < 52:
    return chr(ind + 71)
  return special_char[ind - 52]

In [4]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f'Using Device {device}')

Using Device cuda


In [5]:
class SelfAttentionHead(nn.Module):
  def __init__(self, head_sz, token_unique_n, embed_sz):
    super().__init__()
    self.head_sz = head_sz
    self.embed_sz = embed_sz

    self.q = nn.Linear(embed_sz, head_sz, bias = False)
    self.k = nn.Linear(embed_sz, head_sz, bias = False)
    self.v = nn.Linear(embed_sz, head_sz, bias = False)

  def forward(self, inp):
    query = self.q(inp)
    key = self.k(inp)
    value = self.v(inp)

    attention = query @ key.transpose(1,2) / math.sqrt(self.head_sz)
    attention = torch.tril(attention) + torch.triu(torch.ones_like(attention) * float('-inf'), diagonal = 1)
    attention = nn.functional.softmax(attention, dim = -1) @ value
    return attention

class MultiHeadSelfAttention(nn.Module):
  def __init__(self, head_sz, head_n, token_unique_n, embed_sz):
    super().__init__()
    self.head_sz = head_sz
    self.embed_sz = embed_sz
    self.head_n = head_n

    self.heads = nn.ModuleList([SelfAttentionHead(head_sz, token_unique_n, embed_sz).to(device) for _ in range(int(head_n))])
    self.out = nn.Linear(int(self.head_n) * head_sz, embed_sz)

  def forward(self, inp):
    return self.out(torch.cat([head(inp) for head in self.heads], dim = -1))

class TransformerBlock(nn.Module):
  def __init__(self, head_sz, head_n, token_unique_n, embed_sz, fcnn_sz):
    super().__init__()
    self.head_sz = head_sz
    self.embed_sz = embed_sz
    self.head_n = head_n

    self.attention = MultiHeadSelfAttention(head_sz, head_n, token_unique_n, embed_sz).to(device)
    self.layernorm1 = nn.LayerNorm(embed_sz)
    self.fc = nn.Sequential(
        nn.Linear(embed_sz, fcnn_sz),
        nn.GELU(),
        nn.Linear(fcnn_sz, embed_sz)
    )
    self.layernorm2 = nn.LayerNorm(embed_sz)

  def forward(self, inp):
    inp = inp + self.attention(self.layernorm1(inp))
    inp = inp + self.fc(self.layernorm2(inp))

    return inp

class TransformerModel(nn.Module):
  def __init__(self, head_sz, head_n, token_unique_n, block_sz, embed_sz, layer_n, fcnn_sz):
    super().__init__()
    self.char_embed = nn.Embedding(token_unique_n, embed_sz)
    self.pos_embed = nn.Embedding(block_sz, embed_sz)

    self.model = nn.Sequential(
      *[TransformerBlock(head_sz, head_n, token_unique_n, embed_sz, fcnn_sz).to(device) for _ in range(layer_n)]
    )

    self.out_layernorm = nn.LayerNorm(embed_sz),
    self.out = nn.Linear(embed_sz, token_unique_n, bias = False)

    self.char_embed.weight = self.out.weight # Weight tie the vocabulary embedding weights and the output projection weights

  def forward(self, inp):
    data = torch.stack([torch.stack([self.char_embed(i[j]) + self.pos_embed(torch.tensor(j).to(device)) for j in range(len(i))]) for i in inp])
    data = self.model(data)
    return self.out(data)


In [None]:
# Model Initialization
def init_weights(m):
  if type(m) is nn.Linear:
    nn.init.xavier_normal_(m.weight)
    if m.bias is not None:
      torch.nn.init.zeros_(m.bias)

  if type(m) is nn.Embedding:
    nn.init.xavier_normal_(m.weight)


def construct_param_groups(m):
  if type(m) is nn.Linear and m.out_features != num_chars:
    weight_decay.append(m.weight)
    if m.bias is not None:
      no_weight_decay.append(m.bias)
  if type(m) is nn.Embedding:
    no_weight_decay.append(m.weight)
  if type(m) is nn.LayerNorm:
    no_weight_decay.append(m.weight)

weight_decay = []
no_weight_decay = []

optim_groups = [
    {"params": weight_decay, "weight_decay": 0.1},
    {"params": no_weight_decay, "weight_decay": 0}
]

model = TransformerModel(head_sz, head_n, num_chars, truncate_len, embed_sz, layer_n, fcnn_sz).to(device)
model.apply(init_weights)
model.apply(construct_param_groups)
opt = torch.optim.AdamW(optim_groups, lr = 3e-5, betas = (0.9, 0.95))

print(f'Parameter Number: {sum(p.numel() for p in model.parameters())}')

In [6]:
# Load model from gdrive
from google.colab import drive
drive.mount('/content/gdrive')

def construct_param_groups(m):
  if type(m) is nn.Linear and m.out_features != num_chars:
    weight_decay.append(m.weight)
    if m.bias is not None:
      no_weight_decay.append(m.bias)
  if type(m) is nn.Embedding:
    no_weight_decay.append(m.weight)
  if type(m) is nn.LayerNorm:
    no_weight_decay.append(m.weight)

weight_decay = []
no_weight_decay = []

optim_groups = [
    {"params": weight_decay, "weight_decay": 0.1},
    {"params": no_weight_decay, "weight_decay": 0}
]

model = TransformerModel(head_sz, head_n, num_chars, truncate_len, embed_sz, layer_n, fcnn_sz).to(device)
model.apply(construct_param_groups)
opt = torch.optim.AdamW(optim_groups, lr = 2e-6, betas = (0.9, 0.95))
model.load_state_dict(torch.load("/content/gdrive/My Drive/TransformerLM/state.pth"))
opt.load_state_dict(torch.load("/content/gdrive/My Drive/TransformerLM/opt.pth"))

print(f'Parameter Number: {sum(p.numel() for p in model.parameters())}')

Mounted at /content/gdrive
Parameter Number: 18153684


In [7]:
# Training data (EnWik9 database in torchtext)
training_dataloader = iter(datasets.EnWik9(root = "data"))

# Aggregate Data into tensor
data = ""
line_n = 0

with tqdm.tqdm(total = 3000000) as t:
  for x in training_dataloader:
    if line_n == 3000000:
      break
    for char in x:
      if keep_char(char):
        data += char

    data += "\n"

    line_n += 1
    t.update(1)

100%|██████████| 3000000/3000000 [07:10<00:00, 6974.02it/s]


In [8]:
# Generate sample
def sample(sz):
  idx = random.randint(0, len(data) - sz - 2)
  # idx = 0
  return [char_index(data[i]) for i in range(idx, idx + sz + 1)]

# Generate batched sample
def get_batch(batch_sz):
  return [sample(truncate_len) for _ in range(batch_sz)]

def prompt_sample(prompt):
  return [char_index(i) for i in prompt]

In [14]:
# Sampling hyperparameters
sample_model = True
sample_from_prompt = False
prompt = "The most interesting "

sample_topk = 5
sample_freq = 50

# Number of characters to sample from model each testing cycle
sample_length = 1024
sample_context = 64
sample_temp = 0.7

# Saving frequency
save_freq = 25

def train_cycle():
  model.zero_grad()

  for train_cycle in range(cycles):
    if train_cycle % sample_freq == 0 and sample_model:
      test_cycle()

    model.train()

    print(f'Train Cycle {train_cycle}')
    loss = 0
    accumulated_loss = 0

    for _ in range(macro_batch_sz):
      data = np.array(get_batch(train_batch_sz))
      expected = torch.tensor(data[:, 1:]).to(device)
      data = torch.tensor(data[:, :-1]).to(device)
      output = model(data)

      loss = nn.functional.cross_entropy(torch.reshape(output, (-1, num_chars)), torch.reshape(expected, (-1, ))) / macro_batch_sz
      accumulated_loss += loss
      loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()
    model.zero_grad()

    print(f'Iteration Loss: {round(accumulated_loss.item(), 3)}')

    if train_cycle % save_freq == save_freq - 1:
      save_model()

def test_cycle():
  print('================================================== Test Cycle ==================================================')
  model.eval()

  # Generate a sample to serve as context for the model
  if sample_from_prompt:
    context = [prompt_sample(prompt)]
  else:
    context = [sample(sample_context)]

  for i in context[0]:
    print(index_char(i), end = '')

  for i in range(sample_length):
    output_distribution = model(torch.tensor(context).to(device))[0][-1]
    top_chars = torch.topk(output_distribution, sample_topk)
    sampled_char = top_chars[1][list(torch.utils.data.WeightedRandomSampler(nn.functional.softmax(top_chars[0] * sample_temp, dim = 0), 1))[0]].item()

    context[0].append(sampled_char)
    if len(context[0]) > truncate_len:
      context[0] = context[0][1:]
    print(index_char(sampled_char), end = '')

  print()
  print('================================================================================================================')

def save_model():
  PATH = "/content/gdrive/My Drive/TransformerLM/state.pth"
  torch.save(model.state_dict(), PATH)
  PATH = "/content/gdrive/My Drive/TransformerLM/opt.pth"
  torch.save(opt.state_dict(), PATH)

In [10]:
for g in opt.param_groups:
  g['lr'] = 3e-5

print(opt)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 3e-05
    maximize: False
    weight_decay: 0.1

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 3e-05
    maximize: False
    weight_decay: 0
)


In [None]:
# Training cycles
for epoch in range(epochs):
  print(f"Epoch {epoch}")
  train_cycle()

Epoch 0
 the maker welds a thin rod to the end of the blade at the crossgored ones of the side, which would never through the parents of allight at time.

The third is another offee of a [[policy]] only invalid body, while these criticizer and the [[southwart]] to true the counservational asks, warmly the televised one. Intended, in this, as they also well trying in the penant to pen to the south ones, withdrings were tryings. Truck is survival. In people a spen with someone's truck assumed. The peninuality cruce, this would shoot be reply overlow the city at their, as to still of.

Trucking a single asks with polemest took as a cashing offershiving personnallization in [[1945 in the Control|Control]] issued in [[1949]], though would be a prophet on in the society, and the country. Television islet, the penanth spenten of it, we consided the so

In [None]:
|from google.colab import drive
drive.mount('/content/gdrive')

PATH = "/content/gdrive/My Drive/TransformerLM/state.pth"
torch.save(model.state_dict(), PATH)
PATH = "/content/gdrive/My Drive/TransformerLM/opt.pth"
torch.save(opt.state_dict(), PATH)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
