In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import math



In [2]:
class SubstringDataset(Dataset):
    LETTERS = list('cpen')

    def __init__(self, seed, dataset_size, str_len=20):
        super().__init__()
        self.str_len = str_len
        self.dataset_size = dataset_size
        self.rng = np.random.default_rng(seed)
        self.strings, self.labels = self._create_dataset()

    def __getitem__(self, index):
        return self.strings[index], self.labels[index]

    def __len__(self):
        return self.dataset_size

    def _create_dataset(self):
        strings, labels = [], []
        for i in range(self.dataset_size):
            label = i%2
            string = self._generate_random_string(bool(label))
            strings.append(string)
            labels.append(label)
        return strings, labels

    def _generate_random_string(self, has_cpen):
        while True:
            st = ''.join(self.rng.choice(SubstringDataset.LETTERS, size=self.str_len))
            if ('cpen' in st) == has_cpen:
                return st

In [3]:
class Tokenizer():
    def __init__(self) -> None:
        self.vocab = {
            '[CLS]': 0,
            'c': 1,
            'p': 2,
            'e': 3,
            'n': 4,
        }

    def tokenize_string(self, string, add_cls_token=True) -> torch.Tensor:
        """
        Tokenize the input string according to the above vocab

        START BLOCK
        """
        tokenized_string = None
        """
        END BLOCK
        """
        return tokenized_string

    def tokenize_string_batch(self, strings, add_cls_token=True):
        X = []
        for s in strings:
            X.append(self.tokenize_string(s, add_cls_token=add_cls_token))
        return torch.stack(X, dim=0)



In [4]:
class AbsolutePositionalEncoding(nn.Module):
    MAX_LEN = 256
    def __init__(self, d_model):
        super().__init__()
        self.W = nn.Parameter(torch.empty((self.MAX_LEN, d_model)))
        nn.init.normal_(self.W)

    def forward(self, x):
        """
        args:
            x: shape B x N x D
        returns:
            out: shape B x N x D
        START BLOCK
        """
        out = None
        """
        END BLOCK
        """
        return out

class MultiHeadAttention(nn.Module):
    MAX_LEN = 256

    def __init__(self, d_model, n_heads, rpe):
        super().__init__()
        assert d_model % n_heads == 0, "Number of heads must divide number of dimensions"
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_h = d_model // n_heads
        self.rpe = rpe
        self.Wq = nn.ParameterList([nn.Parameter(torch.empty((d_model, self.d_h))) for _ in range(n_heads)])
        self.Wk = nn.ParameterList([nn.Parameter(torch.empty((d_model, self.d_h))) for _ in range(n_heads)])
        self.Wv = nn.ParameterList([nn.Parameter(torch.empty((d_model, self.d_h))) for _ in range(n_heads)])
        self.Wo = nn.Parameter(torch.empty((d_model, d_model)))

        if rpe:
            # -MAX_LEN, -MAX_LEN+1, ..., -1, 0, 1, ..., MAX_LEN-1, MAXLEN
            self.rpe_w = nn.ParameterList([nn.Parameter(torch.empty((2*self.MAX_LEN+1, ))) for _ in range(n_heads)])

        for h in range(self.n_heads):
            nn.init.xavier_normal_(self.Wk[h])
            nn.init.xavier_normal_(self.Wq[h])
            nn.init.xavier_normal_(self.Wv[h])
            if rpe:
                nn.init.normal_(self.rpe_w[h])
        nn.init.xavier_normal_(self.Wo)

    def forward(self, key, query, value):
        """
        args:
            key: shape B x N x D
            query: shape B x N x D
            value: shape B x N x D
        return:
            out: shape B x N x D
        START BLOCK
        """
        out = None
        """
        END BLOCK
        """
        return out


In [5]:
class TransformerLayer(nn.Module):
    def __init__(self, d_model: int, n_heads: int, prenorm: bool, rpe: bool):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.prenorm = prenorm
        self.attention = MultiHeadAttention(d_model, n_heads, rpe=rpe)
        self.fc_W1 = nn.Parameter(torch.empty((d_model, 4*d_model)))
        self.fc_W2 = nn.Parameter(torch.empty((4*d_model, d_model)))
        self.relu = nn.ReLU()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

        nn.init.xavier_normal_(self.fc_W1)
        nn.init.xavier_normal_(self.fc_W2)

    def forward(self, x):
        """
        args:
            x: shape B x N x D
        returns:
            out: shape B x N x D
        START BLOCK
        """
        out = None
        """
        END BLOCK
        """
        return out

In [6]:
class ModelConfig:
    n_layers = 4
    input_dim = 5
    d_model = 256
    n_heads = 4
    prenorm = True
    pos_enc_type = 'ape' # 'ape': Abosolute Pos. Enc., 'rpe': Relative Pos. Enc.
    output_dim = 1 # Binary output: 0: invalid, 1: valid

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            assert hasattr(self, k)
            self.__setattr__(k, v)

class TransformerModel(nn.Module):
    def __init__(self, cfg: ModelConfig):
        super().__init__()
        self.cfg = cfg
        self.enc_W = nn.Parameter(torch.empty((cfg.input_dim, cfg.d_model)))
        if cfg.pos_enc_type == 'ape':
            self.ape = AbsolutePositionalEncoding(d_model=cfg.d_model)
        self.transformer_layers = nn.ModuleList([
            TransformerLayer(d_model=cfg.d_model, n_heads=cfg.n_heads, prenorm=cfg.prenorm, rpe=cfg.pos_enc_type == 'rpe') for _ in range(cfg.n_layers)
        ])
        self.dec_W = nn.Parameter(torch.empty((cfg.d_model, cfg.output_dim)))

        nn.init.xavier_normal_(self.enc_W)
        nn.init.xavier_normal_(self.dec_W)

    def forward(self, x):
        """
        args:
            x: shape B x N x D_in
        returns:
            out: shape B x N x D_out
        START BLOCK
        """
        out = None
        """
        END BLOCK
        """
        return out

In [7]:
from torch.optim import lr_scheduler

class CustomScheduler(lr_scheduler._LRScheduler):
    def __init__(self, optimizer, total_steps, warmup_steps=1000):
        self.total_steps = total_steps
        self.warmup_steps = warmup_steps
        super().__init__(optimizer)

    def get_lr(self):
        """
        Compute the custom scheduler with warmup and cooldown
        Hint: self.last_epoch contains the current step number
        START BLOCK
        """
        mult_factor = 1.0
        """
        END BLOCK
        """
        return [group['initial_lr'] * mult_factor for group in self.optimizer.param_groups]

In [8]:
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class TrainerConfig:
    lr = 0.003
    train_steps = 5000
    batch_size = 256
    evaluate_every = 100
    device = 'cpu'

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            assert hasattr(self, k)
            self.__setattr__(k, v)

class Trainer:
    def __init__(self, model, cfg: TrainerConfig):
        self.cfg = cfg
        self.device = cfg.device
        self.tokenizer = Tokenizer()
        self.model = model.to(self.device)

    def train(self, train_dataset, val_dataset):
        optimizer = optim.Adam(self.model.parameters(), lr=self.cfg.lr)
        scheduler = CustomScheduler(optimizer, self.cfg.train_steps)
        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=self.cfg.batch_size)
        for step in range(self.cfg.train_steps):
            self.model.train()
            batch = next(iter(train_dataloader))
            strings, y = batch
            x = self.tokenizer.tokenize_string_batch(strings)

            optimizer.zero_grad()
            loss, _ = self.compute_batch_loss_acc(x, y)
            loss.backward()
            optimizer.step()
            scheduler.step()
            if step % self.cfg.evaluate_every == 0:
                val_loss, val_acc = self.evaluate_dataset(val_dataset)
                print(f"Step {step}: Train Loss={loss.item()}, Val Loss: {val_loss}, Val Accuracy: {val_acc}")

    def compute_batch_loss_acc(self, x, y):
        """
        Compute the loss and accuracy of the model on batch (x, y)
        args:
            x: B x N x D_in
            y: B
        return:
            loss, accuracy
        START BLOCK
        """
        loss, acc = torch.tensor([1.0]), torch.tensor([0.0])
        """
        END BLOCK
        """
        return loss, acc

    @torch.no_grad()
    def evaluate_dataset(self, dataset):
        self.model.eval()
        dataloader = DataLoader(dataset, shuffle=False, batch_size=self.cfg.batch_size)
        final_loss, final_acc = 0.0, 0.0
        for batch in dataloader:
            strings, y = batch
            x = self.tokenizer.tokenize_string_batch(strings)
            loss, acc = self.compute_batch_loss_acc(x, y)
            final_loss += loss.item() * x.size(0)
            final_acc += acc.item() * x.size(0)
        return final_loss / len(dataset), final_acc / len(dataset)


In [9]:
"""
In case you were not successful in implementing some of the above classes,
you may reimplement them using pytorch available nn Modules here to receive the marks for part 1.8
If your implementation of the previous parts is correct, leave this block empty.
START BLOCK
"""


"""
END BLOCK
"""
def run_transformer():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = TransformerModel(ModelConfig())
    trainer = Trainer(model, TrainerConfig(device=device))
    parantheses_size=16
    print("Creating datasets.")
    train_dataset = SubstringDataset(seed=1, dataset_size=10_000, str_len=parantheses_size)
    val_dataset = SubstringDataset(seed=2, dataset_size=1_000, str_len=parantheses_size)
    test_dataset = SubstringDataset(seed=3, dataset_size=1_000, str_len=parantheses_size)

    print("Training the model.")
    trainer.train(train_dataset, val_dataset)
    test_loss, test_acc = trainer.evaluate_dataset(test_dataset)
    print(f"Final Test Accuracy={test_acc}, Test Loss={test_loss}")

In [10]:
run_transformer()

Creating datasets.
Training the model.


TypeError: expected Tensor as element 0 in argument 0, but got NoneType

# Unit Tests

In [None]:
import random
import numpy as np

def seed_all():
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)

class TransformerUnitTest:
    def __init__(self, gt_vars: dict, verbose=False):
        self.gt_vars = gt_vars
        self.verbose = verbose

    def test_all(self):
        self.test_tokenizer()
        self.test_ape()
        self.test_mha()
        self.test_transformer_layer()
        self.test_transformer_model()
        self.test_scheduler()
        self.test_loss()

    def test_tokenizer(self):
        seed_all()
        self.check_correctness(
            Tokenizer().tokenize_string('ccpeen', add_cls_token=True),
            self.gt_vars['tokenizer_1'],
            "Tokenization with cls class"
        )
        self.check_correctness(
            Tokenizer().tokenize_string('cpppencpen', add_cls_token=False),
            self.gt_vars['tokenizer_2'],
            "Tokenization without cls class"
        )

    def test_ape(self):
        seed_all()
        ape_result = AbsolutePositionalEncoding(128)(torch.randn((8, 12, 128)))
        self.check_correctness(ape_result, self.gt_vars['ape'], "APE")

    def test_mha(self):
        seed_all()
        mha_result = MultiHeadAttention(d_model=128, n_heads=4, rpe=False)(
            torch.randn((8, 12, 128)), torch.randn((8, 12, 128)), torch.randn((8, 12, 128))
        )
        self.check_correctness(
            mha_result,
            self.gt_vars['mha_no_rpe'],
            "Multi-head Attention without RPE"
        )
        mha_result_rpe = MultiHeadAttention(d_model=128, n_heads=8, rpe=True)(
            torch.randn((8, 12, 128)), torch.randn((8, 12, 128)), torch.randn((8, 12, 128))
        )
        self.check_correctness(
            mha_result_rpe,
            self.gt_vars['mha_with_rpe'],
            "Multi-head Attention with RPE"
        )

    def test_transformer_layer(self):
        seed_all()
        for prenorm in [True, False]:
            transformer_layer_result = TransformerLayer(
                d_model=128, n_heads=4, prenorm=prenorm, rpe=False
            )(torch.randn((8, 12, 128)))
            self.check_correctness(
                transformer_layer_result,
                self.gt_vars[f'transformer_layer_prenorm_{prenorm}'],
                f"Transformer Layer Prenorm {prenorm}"
            )

    def test_transformer_model(self):
        seed_all()
        transformer_model_result = TransformerModel(
            ModelConfig(d_model=128, prenorm=True, pos_enc_type='ape')
        )(torch.randn((8, 12, 5)))
        self.check_correctness(
            transformer_model_result,
            self.gt_vars['transformer_model_result'],
            f"Transformer Model"
        )

    def test_scheduler(self):
        model = TransformerModel(ModelConfig())
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scheduler = CustomScheduler(optimizer, 10_000)
        optimizer.step()
        scheduler.step(521)
        self.check_correctness(
            torch.tensor([optimizer.param_groups[0]['lr']]),
            self.gt_vars['scheduler_1'],
            f"Scheduler Warmup"
        )
        scheduler.step(2503)
        self.check_correctness(
            torch.tensor([optimizer.param_groups[0]['lr']]),
            self.gt_vars['scheduler_2'],
            f"Scheduler Cooldown"
        )

    def test_loss(self):
        seed_all()
        model = TransformerModel(ModelConfig())
        trainer = Trainer(model, TrainerConfig(device='cpu'))
        loss_result, _ = trainer.compute_batch_loss_acc(
            torch.randn((8, 12, 5)),
            torch.ones(8).float(),
        )
        self.check_correctness(
            loss_result,
            self.gt_vars['loss'],
            f"Batch Loss"
        )

    def check_correctness(self, out, gt, title):
        try:
            diff = (out - gt).norm()
        except:
            diff = float('inf')
        if diff < 1e-4:
            print(f"[Correct] {title}")
        else:
            print(f"[Wrong] {title}")
            if self.verbose:
                print("-----")
                print("Expected: ")
                print(gt)
                print("Received: ")
                print(out)
                print("-----")


In [None]:
!gdown 1-2-__6AALEfqhfew3sJ2QiCE1-rrFMnQ -q -O unit_tests.pkl
import pickle
with open('unit_tests.pkl', 'rb') as f:
    gt_vars = pickle.load(f)

In [None]:
TransformerUnitTest(gt_vars, verbose=False).test_all()