In [1]:
# Read the text file
with open('dataset/merged-transcriptions.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# split text lines into random 90% train and 10% val splits
text_lines = text.split('\n')
import numpy as np
np.random.seed(0)
trn_inds = np.sort(np.random.choice(len(text_lines), int(0.9*len(text_lines)), replace=False))
val_inds = np.setdiff1d(np.arange(len(text_lines)), trn_inds)
trn_text = '\n'.join([text_lines[i] for i in trn_inds])
val_text = '\n'.join([text_lines[i] for i in val_inds])

# Preprocess the text to make spaces and newlines explicit
import re
def preprocess(txt):
    return re.sub(r' +', ' [SPACE] ', txt).replace('\n', ' [NEWLINE] ')

trn_text = preprocess(trn_text)
val_text = preprocess(val_text)

In [2]:
# Train a BPE tokenizer on the given text
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[SPACE]", "[NEWLINE]"], vocab_size=2000)
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator([text], trainer=trainer)
tokenizer.save("tokenizer.json")
VOCAB_SIZE = tokenizer.get_vocab_size()






In [3]:
# visualize sample encode and decode operations
encode = lambda s: tokenizer.encode(preprocess(s)).ids # encoder: take a string, output a list of integers
decode = lambda l: tokenizer.decode(l, skip_special_tokens=False).replace(' ', '').replace('[SPACE]', ' ').replace('[NEWLINE]', '\n') # decoder: take a list of integers, output a string

print(encode("hii there\nwassup"))
print(decode(encode("hii there\nwassup")))

[89, 62, 5, 287, 6, 76, 295, 121]
hii there
wassup


In [4]:
# encode all of the text data
import torch
trn_data = torch.tensor(tokenizer.encode(trn_text).ids)
val_data = torch.tensor(tokenizer.encode(val_text).ids)
print(trn_data.shape, trn_data.dtype)
print(decode(trn_data[:100].tolist()))

torch.Size([430795]) torch.int64
WEBVTT

1
00:05:40.410 --> 00:05:42.240
Inderjit Dhillon: Hey? Good afternoon.

2
00:06:02.520 --> 00:06:04.500
Inderjit Dhillon: Just setting my ipad up

00:06:45.490 --> 00:06:47.000


In [5]:
# Visualizing how the training data looks like
block_size = 8
trn_data[:block_size+1]

x = trn_data[:block_size]
y = trn_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([50]) the target: 32
when input is tensor([50, 32]) the target: 29
when input is tensor([50, 32, 29]) the target: 49
when input is tensor([50, 32, 29, 49]) the target: 47
when input is tensor([50, 32, 29, 49, 47]) the target: 47
when input is tensor([50, 32, 29, 49, 47, 47]) the target: 6
when input is tensor([50, 32, 29, 49, 47, 47,  6]) the target: 6
when input is tensor([50, 32, 29, 49, 47, 47,  6,  6]) the target: 14


In [6]:
torch.manual_seed(1337)
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = self.data[idx:idx+self.block_size]
        y = self.data[idx+1:idx+self.block_size+1]
        return x, y
    
BATCH_SIZE = 128 # how many sequences to process in parallel
BLOCK_SIZE = 32 # how many tokens to consider at once

trn_dataset = Dataset(trn_data, BLOCK_SIZE)
val_dataset = Dataset(val_data, BLOCK_SIZE)

trn_loader = torch.utils.data.DataLoader(trn_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [7]:
import torch, math
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

In [8]:
# Multi Head Attention Implementation
class MultiHeadAttentionOp(nn.Module):
    def __init__(self, dim, num_heads, causal=True):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.causal = causal

    def forward(self, q, k, v):
        # q, k, v: [batch_size, seq_len, dim]
        batch_size, seq_len, dim = q.shape
        assert dim == self.dim
        assert dim % self.num_heads == 0
        head_dim = dim // self.num_heads

        # split the dim into multiple heads
        q = q.view(batch_size, seq_len, self.num_heads, head_dim)
        k = k.view(batch_size, seq_len, self.num_heads, head_dim)
        v = v.view(batch_size, seq_len, self.num_heads, head_dim)

        # compute the scaled dot product attention
        q = q / torch.sqrt(torch.tensor(dim, dtype=torch.float))
        scores = torch.einsum('bqhd,bkhd->bhqk', q, k)

        if self.causal:
            # mask out the future tokens
            mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
            scores.masked_fill_(mask, float('-inf'))

        attn = F.softmax(scores, dim=-1)
        out = torch.einsum('bhqk,bkhd->bqhd', attn, v)
        return out.reshape(batch_size, seq_len, dim)

class Attention(nn.Module):
    def __init__(self, dim, num_heads):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.Wq = nn.Linear(dim, dim)
        self.Wk = nn.Linear(dim, dim)
        self.Wv = nn.Linear(dim, dim)
        self.Wo = nn.Linear(dim, dim)
        self.attn = MultiHeadAttentionOp(dim, num_heads, causal=True)

    def forward(self, q, k, v):
        # x: [batch_size, seq_len, dim]
        out = self.attn(self.Wq(q), self.Wk(k), self.Wv(v))
        out = self.Wo(out)
        return out

In [9]:
# Transformer Layer
class Layer(nn.Module):
    def __init__(self, emb_dim, hidden_dim, num_heads, dropout):
        super().__init__()
        self.attention = Attention(emb_dim, num_heads)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.ff = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, emb_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x + self.dropout(self.attention(x, x, x))
        x = self.norm1(x)
        x = x + self.dropout(self.ff(x))
        x = self.norm2(x)
        return x

In [10]:
class Transformer(nn.Module):
    def __init__(self, vocab_dim, emb_dim, hidden_dim, n_layers, num_heads, dropout):
        super(Transformer, self).__init__()
        self.config = {
            'vocab_dim': vocab_dim,
            'emb_dim': emb_dim,
            'hidden_dim': hidden_dim,
            'n_layers': n_layers,
            'num_heads': num_heads,
            'dropout': dropout
        }
        # initialize the embedding layers
        self.vocab_emb = nn.Embedding(vocab_dim, emb_dim)
        # initialize the positional embeddings
        self.pos_emb = nn.Embedding(100, emb_dim)
        # initialize the transformer layers
        self.layers = nn.Sequential(*[Layer(emb_dim, hidden_dim, num_heads, dropout) for _ in range(n_layers)])
        # initialize the language model head
        self.lm_head = nn.Linear(emb_dim, vocab_dim)

    def forward(self, x, y):
        ### YOUR CODE HERE ###
        # x: input sequence, tensor of shape [batch_size, seq_len]
        # y: target sequence, tensor of shape [batch_size, seq_len] or None
        # Returns: tensor of shape [batch_size, seq_len, vocab_dim] if y is None otherwise floating loss value
        
        # embed the input sequence
        # add position embeddings to the input sequence
        # pass the input sequence through the transformer layers
        # pass the output of the transformer layers through the language model head
        # if y is None return the logits, otherwise return the cross entropy loss value
        embedding = self.vocab_emb(x)
        pos = torch.arange(x.size(1)).repeat(x.size(0), 1)
        embedding += self.pos_emb(pos)
        logits = self.lm_head(self.layers(embedding))
        if (y is not None):
            loss = nn.CrossEntropyLoss()
            return loss(logits.view(-1, logits.size(-1)), y.view(-1))
        else:
            return logits
            
        

def generate(net, max_new_tokens=2000):
    net.eval()
    # start with zero token
    context = torch.tensor([[0]])
    # generate new tokens upto max_new_tokens
    for _ in tqdm(range(max_new_tokens)):
        # compute block_context i.e. the last BLOCK_SIZE tokens
        block_context = context[:, -BLOCK_SIZE:]
        # compute logits, probabilities for the next token
        logits = net(block_context, None)
        probs = F.softmax(logits[:, -1, :], dim=-1)
        # sample the next token from the probabilities
        new_token = torch.multinomial(probs, 1)
        # append the new token to the context
        context = torch.cat([context, new_token], dim=-1)
    return context

In [11]:
EMB_DIM = 64
HIDDEN_DIM = EMB_DIM * 4
N_LAYERS = 4
NUM_HEADS = 4
DROPOUT = 0

net = Transformer(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, N_LAYERS, NUM_HEADS, DROPOUT)

In [12]:
# print the number of parameters in the model
print(sum(p.numel() for p in net.parameters())/1e6, 'M parameters', sep='')

0.464336M parameters


In [132]:
# Initial sample from the model
out = generate(net)
print(decode(out[0].tolist()))

100%|██████████| 2000/2000 [00:08<00:00, 222.56it/s]

[UNK]interesting219fullyquiteUensiveonecleslandisboardfinesimilarguagesdalgorithmthankgradwithinoptimalnearctually%wordsli630ain729860takingparametersclassifconnectionsmoving500oreranZengqiChitrank28spartechniqufollowingprocesswritterialstartedche107culTheseteespeciallynaturalChearsubideoutsedbeingoperation230aterpli509179mareff20PtorchricratThankMa216cent62chitrank248inputeithertamatotmethodstechniquhopeassoirq64averagelogone258asingerebasicallybecomecomingimalickprocessproduct189699beotsdglowhyentialcingiddoesndonsequmeperceptoncelinkswhetherdimensGaocalculstillprimalAllackizinghalf13ativeir480encoderdividedrorwrittendownseparatingvelodecreaseasecomptech369460giancedteaproductsectalgorithconvolutionsolveivtilleigenvideo01gammaward920dimensionalohick360relfunctionvisualclusteringckitrankLetishvel110reallyser237DelstepsWill529racyLsddetailslargebecomesguagetheyitiontodayitionimages289wrgrapairtensorspical249token179whole319wordsJusttrainGaoimalpuslow5Liupast770oncectionMyralullers[UNK]




In [29]:
# Training parameters
LR = 3e-3
NUM_EPOCHS = 1
MAX_STEPS = 10000 # max number of steps to train for, change this to smaller value for debugging

optim = torch.optim.AdamW(net.parameters(), lr=LR, weight_decay=0.01)

In [30]:
def train(net, optim, trn_loader):
    global_step = 0
    training_loss = 0
    for epoch in range(NUM_EPOCHS):
        t = tqdm(trn_loader, desc='Epoch: ?, Loss: ?')
        running_loss = 0.0
        for i, (xb, yb) in enumerate(t):
            net.train()
            loss = net(xb, yb)
            loss.backward()
            optim.step()
            optim.zero_grad()
            running_loss += loss.item()
            global_step += 1
            t.set_description(f'Epoch: {epoch}/{NUM_EPOCHS}, Loss: {running_loss/(i+1):.3f}')
            if global_step == MAX_STEPS:
                break
        training_loss = running_loss/len(trn_loader)
    print(f'Training loss: {training_loss:.3f}')
    return training_loss

training_loss = train(net, optim, trn_loader)

Epoch: 0/1, Loss: 1.737: 100%|██████████| 3366/3366 [14:32<00:00,  3.86it/s]

Training loss: 1.737





In [32]:
# calulate validation loss
def eval(net, val_loader):
    validation_loss = 0
    net.eval()
    with torch.no_grad():
        val_loss = 0.0
        for xb, yb in tqdm(val_loader):
            loss = net(xb, yb)
            val_loss += loss.item()
        validation_loss = val_loss/len(val_loader)
        print(f'Validation loss: {validation_loss:.3f}')
    return validation_loss

validation_loss = eval(net, val_loader)

100%|██████████| 381/381 [00:31<00:00, 12.23it/s]

Validation loss: 2.111





In [33]:
out = generate(net)
print(decode(out[0].tolist()))

100%|██████████| 2000/2000 [00:10<00:00, 189.74it/s]

[UNK]ity lood.

596
01:09:17.720 --> 01:09:23.479
Inderjit Dhillon: optimality. Remember that that we used? The amount of work is unlabeled on April fit into Alexity.

637
01:07:02.780 --> 01:07:04.640
Nilesh Gupta: So and one.

619
01:07:06.450 --> 01:07:17.570
Nilesh Gupta: But there are other sets that we learned in it could actually have a task that network detail is being well. And now that I will
584
Inderjit Dhillon: I conditive

561
00:56:56.089
Inderjit Dhillon: it's able to solve this or convolutional neural networks where somewhat really terc one.

362
00:36:55.220 --> 00:36:12.650
Inderjit Dhillon: Right, but it got it in machine learning and by quite a very computation, value is

00:05:02.610 --> 00:05:07.460
Inderjit Dhillon: being well, we and inequalities on.

77
00:05:07.730 --> 00:05:12.980
Nilesh Gupta: and computing the residual
299
00:05:14.690 --> 00:05:40.830
Nilesh Gupta: like what is computation one machine learning as work so like, let me do a trouble, like it




In addition to implementing the forward pass of the transformer, you will also need to tweak different hyperparameters of the model, such as the number of heads, the number of layers, the embedding dimension, and the block size. You will then need to report the training and validation loss with different hyperparameters. You can try 3 different values for each hyperparameter (one below the default value and one above) independently and report the results in a table.

In [46]:
def table(param_name, samples, training_losses, validation_losses):
    print ("{:<20} {:<20} {:<20}".format(param_name,'Training Loss','Validation Loss'))
    for i in range(len(samples)):
        print ("{:<20} {:<20} {:<20}".format(samples[i], training_losses[i], validation_losses[i]))

heads = [2, 4, 8]
layers = [2, 4, 8]
emb_dims = [32, 64, 128]
block_sizes = [16, 32, 64]

In [52]:
training_losses = []
validation_losses = []
for num_heads in heads:
    net = Transformer(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, N_LAYERS, num_heads, DROPOUT)
    optim = torch.optim.AdamW(net.parameters(), lr=LR, weight_decay=0.01)
    training_losses.append(train(net, optim, trn_loader))
    validation_losses.append(eval(net, val_loader))
table("Number of Heads", heads, training_losses, validation_losses)

Epoch: 0/1, Loss: 1.710: 100%|██████████| 3366/3366 [33:04<00:00,  1.70it/s] 


Training loss: 1.710


100%|██████████| 380/380 [01:13<00:00,  5.20it/s]


Validation loss: 2.148


Epoch: 0/1, Loss: 1.755: 100%|██████████| 3366/3366 [33:38<00:00,  1.67it/s]


Training loss: 1.755


100%|██████████| 380/380 [01:14<00:00,  5.12it/s]


Validation loss: 2.119


Epoch: 0/1, Loss: 1.758: 100%|██████████| 3366/3366 [39:16<00:00,  1.43it/s]


Training loss: 1.758


100%|██████████| 380/380 [01:41<00:00,  3.75it/s]

Validation loss: 2.151
Number of Heads      Training Loss        Validation Loss     
2                    1.7100489361967561   2.147502196462531   
4                    1.7545494494945324   2.1188209467812587  
8                    1.7579337354224847   2.150640647976022   





In [48]:
training_losses = []
validation_losses = []
for num_layers in layers:
    net = Transformer(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, num_layers, NUM_HEADS, DROPOUT)
    optim = torch.optim.AdamW(net.parameters(), lr=LR, weight_decay=0.01)
    training_losses.append(train(net, optim, trn_loader))
    validation_losses.append(eval(net, val_loader))
table("Number of Layers", layers, training_losses, validation_losses)

Epoch: 0/1, Loss: 1.834: 100%|██████████| 3366/3366 [09:23<00:00,  5.97it/s]


Training loss: 1.834


100%|██████████| 381/381 [00:22<00:00, 16.78it/s]


Validation loss: 2.090


Epoch: 0/1, Loss: 1.810: 100%|██████████| 3366/3366 [14:38<00:00,  3.83it/s]


Training loss: 1.810


100%|██████████| 381/381 [00:32<00:00, 11.66it/s]


Validation loss: 2.101


Epoch: 0/1, Loss: 1.804: 100%|██████████| 3366/3366 [22:24<00:00,  2.50it/s]


Training loss: 1.804


100%|██████████| 381/381 [00:55<00:00,  6.82it/s]

Validation loss: 2.111
Number of Layers     Training Loss        Validation Loss     
2                    1.8342900619008182   2.090036714796632   
4                    1.8100076372327367   2.101323988493972   
8                    1.80411289804972     2.1107275304518973  





In [49]:
training_losses = []
validation_losses = []
for emb_dim in emb_dims:
    hidden_dim = 4 * emb_dim
    net = Transformer(VOCAB_SIZE, emb_dim, hidden_dim, N_LAYERS, NUM_HEADS, DROPOUT)
    optim = torch.optim.AdamW(net.parameters(), lr=LR, weight_decay=0.01)
    training_losses.append(train(net, optim, trn_loader))
    validation_losses.append(eval(net, val_loader))
table("Embedding Dimension", emb_dims, training_losses, validation_losses)

Epoch: 0/1, Loss: 2.041: 100%|██████████| 3366/3366 [09:14<00:00,  6.07it/s]


Training loss: 2.041


100%|██████████| 381/381 [00:23<00:00, 16.01it/s]


Validation loss: 2.097


Epoch: 0/1, Loss: 1.816: 100%|██████████| 3366/3366 [13:24<00:00,  4.18it/s]


Training loss: 1.816


100%|██████████| 381/381 [00:34<00:00, 11.15it/s]


Validation loss: 2.099


Epoch: 0/1, Loss: 1.592: 100%|██████████| 3366/3366 [28:36<00:00,  1.96it/s]


Training loss: 1.592


100%|██████████| 381/381 [01:09<00:00,  5.49it/s]

Validation loss: 2.300
Embedding Dimension  Training Loss        Validation Loss     
32                   2.0414910014371253   2.096812045793208   
64                   1.8156625077402881   2.098882383561823   
128                  1.5918939977406181   2.3003213881194746  





In [50]:
training_losses = []
validation_losses = []
for block_size in block_sizes:
    BLOCK_SIZE = block_size
    trn_dataset = Dataset(trn_data, BLOCK_SIZE)
    val_dataset = Dataset(val_data, BLOCK_SIZE)
    trn_loader = torch.utils.data.DataLoader(trn_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    net = Transformer(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, N_LAYERS, NUM_HEADS, DROPOUT)
    optim = torch.optim.AdamW(net.parameters(), lr=LR, weight_decay=0.01)
    training_losses.append(train(net, optim, trn_loader))
    validation_losses.append(eval(net, val_loader))
table("Block Size", block_sizes, training_losses, validation_losses)

Epoch: 0/1, Loss: 1.953: 100%|██████████| 3366/3366 [07:13<00:00,  7.77it/s]


Training loss: 1.953


100%|██████████| 381/381 [00:17<00:00, 21.92it/s]


Validation loss: 2.104


Epoch: 0/1, Loss: 1.809: 100%|██████████| 3366/3366 [14:00<00:00,  4.01it/s]


Training loss: 1.809


100%|██████████| 381/381 [00:35<00:00, 10.67it/s]


Validation loss: 2.094


Epoch: 0/1, Loss: 1.708: 100%|██████████| 3366/3366 [34:27<00:00,  1.63it/s]


Training loss: 1.708


100%|██████████| 380/380 [01:30<00:00,  4.19it/s]

Validation loss: 2.166
Block Size           Training Loss        Validation Loss     
16                   1.9534008368026339   2.1041596927667854  
32                   1.8089005991028522   2.0936161282807197  
64                   1.7075570740660102   2.166194853970879   



