In [1]:
import sys
from pathlib import Path

sys.path.append("../src")
from tokenizer import get_tokenizer, get_data
from torch.utils.data import Dataset, DataLoader
from CustomDataLoader import CustomDataset


# Resolve the kagglehub cache relative to the current user's home directory
# rather than a machine-specific absolute path, so the notebook is not tied to
# one account. The value is converted back to a plain string because that is
# what the helpers below were given originally.
dataset_path = str(
    Path.home()
    / ".cache/kagglehub/datasets/jigarpanjiyar/english-to-manipuri-dataset/versions/1"
    / "english-nepali.xlsx"
)

# get_tokenizer returns two tokenizers, unpacked here as (English, Nepali).
engtokenizer, neptokenizer = get_tokenizer(dataset_path)

# With split=True, get_data returns two frames, unpacked as (train, test).
df_train, df_test = get_data(dataset_path, split=True)

In [13]:
# Ask each tokenizer for its vocabulary size (English first, then Nepali)
# and display the pair as the cell output.
engvocabsize, nepvocabsize = (
    tokenizer.get_vocab_size() for tokenizer in (engtokenizer, neptokenizer)
)
engvocabsize, nepvocabsize

(30000, 30000)

In [2]:
# Wrap the train and test frames in CustomDataset with identical settings:
# same tokenizers, "eng"/"nep" as the column/language keys, and 256 as the
# final positional argument.
df_train_dataset, df_test_dataset = (
    CustomDataset(frame, engtokenizer, neptokenizer, "eng", "nep", 256)
    for frame in (df_train, df_test)
)

In [6]:
# One shuffled example per batch while the pipeline is being inspected.
df_train_dataloader = DataLoader(
    dataset=df_train_dataset,
    batch_size=1,
    shuffle=True,
)

In [74]:
# Pull a single batch, keep its tensors around for the cells below, and print
# the shape of every field once.
for x in df_train_dataloader:
    enc_input, dec_input = x["encoder_input"], x["decoder_input"]
    enc_mask, dec_mask = x["encoder_mask"], x["decoder_mask"]

    # One shape per line, in the order: encoder input, decoder input, label,
    # encoder mask, decoder mask.
    print(
        *(
            field.shape
            for field in (enc_input, dec_input, x["label"], enc_mask, dec_mask)
        ),
        sep="\n",
    )

    break

torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 256, 256])


In [8]:
# Progress marker: the data-loader cells end here; the model cells follow.
print("Done with data loader working on Model Implementation")

Done with data loader working on Model Implementation


In [14]:
# Notebook-wide settings read by the model cells further down.
# seq_len matches the 256 passed to CustomDataset above.
seq_len, embdim = 256, 512
vocab_size = engvocabsize

In [19]:
import torch
import torch.nn as nn

In [16]:
class InputEmbeddings(nn.Module):
    """Token-id to vector lookup table.

    Args:
        vocab_size: number of rows in the embedding table.
        embdim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embdim):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embdim
        )

    def forward(self, x):
        """Return the embedding vectors for the integer ids in ``x``."""
        embedded = self.embeddings(x)
        return embedded

In [30]:
class PositionalEmbeddings(nn.Module):
    """Fixed sinusoidal positional encodings added to token embeddings.

    A table of shape (1, seq_len, embdim) is precomputed once: even feature
    columns hold sin(pos / 10000^(i / embdim)) and odd columns hold the
    matching cosine, where ``i`` runs over the even column indices. The table
    is stored as a non-trainable parameter named ``pe``.

    Args:
        seq_len: number of positions to precompute.
        embdim: feature width; both even and odd values are supported.
    """

    def __init__(self, seq_len, embdim):
        super().__init__()

        # Column vector of positions 0..seq_len-1, shape (seq_len, 1).
        positions = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
        # Even feature indices 0, 2, 4, ...; shape (ceil(embdim / 2),).
        emb_skip_dim = torch.arange(0, embdim, step=2, dtype=torch.float32)
        # Broadcast (seq_len, 1) / (ceil(embdim / 2),) -> (seq_len, ceil(embdim / 2)).
        z = positions / (10000 ** (emb_skip_dim / embdim))

        # Lookup matrix where each row represents a position: (seq_len, embdim).
        pe = torch.zeros(seq_len, embdim, dtype=torch.float32)
        pe[:, 0::2] = torch.sin(z)
        # For odd embdim there is one fewer odd column than even columns, so
        # trim z to the number of odd slots before filling them.
        pe[:, 1::2] = torch.cos(z[:, : embdim // 2])

        # Add a leading batch axis and register once as a frozen parameter.
        self.pe = nn.Parameter(pe.unsqueeze(0), requires_grad=False)

    def forward(self, x):
        """Add the first T rows of the table to ``x`` of shape (B, T, C)."""
        B, T, C = x.shape
        # During training every sequence has the full length, but during
        # generation the input can be shorter, hence the slice.
        x = x + self.pe[:, :T, :]
        return x

In [32]:
# Build the two embedding stages that will be applied to enc_input / dec_input:
# token lookup first, positional encoding second.
model1 = InputEmbeddings(vocab_size=vocab_size, embdim=embdim)
model2 = PositionalEmbeddings(seq_len=seq_len, embdim=embdim)

In [33]:
# Shape of the encoder token-id batch taken from the data loader.
enc_input.shape
# (B, SEQ_LEN)

torch.Size([1, 256])

In [36]:
# Run the token ids through the embedding lookup and check the result's shape.
x1 = model1(enc_input)
x1.shape
# B, T, C

torch.Size([1, 256, 512])

In [37]:
# Add the positional encodings; the displayed shape is the same as x1's.
x2 = model2(x1)
x2.shape

torch.Size([1, 256, 512])

In [101]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each passed through their own linear layer,
    split into ``num_heads`` heads of width ``embdim // num_heads``, attended
    per head, merged back to ``embdim`` and passed through an output
    projection.

    Args:
        embdim: feature width of the inputs and the output.
        num_heads: number of attention heads; must divide ``embdim``.
    """

    def __init__(self, embdim, num_heads):
        super().__init__()
        # Fail fast, with the actual numbers in the message, before any layer
        # is allocated.
        assert embdim % num_heads == 0, f"{embdim} is not divisible by {num_heads}"
        self.query = nn.Linear(embdim, embdim)
        self.key = nn.Linear(embdim, embdim)
        self.value = nn.Linear(embdim, embdim)
        # After the per-head computation, this is used for projection.
        self.proj = nn.Linear(embdim, embdim)
        self.num_heads = num_heads
        self.head_dim = embdim // num_heads

    @staticmethod
    def attention(q, k, v, mask):
        """Scaled dot-product attention over the last two axes.

        Args:
            q: queries, shape (..., T_q, head_dim).
            k: keys, shape (..., T_k, head_dim).
            v: values, shape (..., T_k, head_dim).
            mask: ``None`` or a tensor broadcastable to (..., T_q, T_k);
                positions where it equals 0 are excluded from attention.

        Returns:
            Tuple ``(output, weights)``: the weighted sum of ``v`` and the
            softmax-normalised attention weights. A query row whose keys are
            all masked yields NaN weights because every score is ``-inf``.
        """
        head_dim = q.shape[-1]

        attention_scores = (q @ k.transpose(-2, -1)) / head_dim ** 0.5
        if mask is not None:
            # masked_fill is out-of-place: the result has to be kept, otherwise
            # the mask has no effect at all.
            attention_scores = attention_scores.masked_fill(mask == 0, float("-inf"))

        attention_scores = attention_scores.softmax(dim=-1)

        return (attention_scores @ v), attention_scores

    def forward(self, q, k, v, mask):
        """Apply multi-head attention.

        Args:
            q: (B, T_q, C) query input.
            k: (B, T_k, C) key input.
            v: (B, T_k, C) value input.
            mask: ``None``, a (B, T_k) mask over key positions, a
                (B, T_q, T_k) mask, or any tensor already broadcastable to
                (B, num_heads, T_q, T_k). Zeros mark positions to ignore.
                A 2-D mask whose first dimension equals B is always read as
                (B, T_k).

        Returns:
            Tensor of shape (B, T_q, C).
        """
        # Take the sizes from the arguments themselves (not from any outer
        # variable); query and key/value lengths may differ in cross-attention.
        B, T_q, C = q.shape
        T_k = k.shape[1]

        # (B, T, C) => (B, T, C)
        q = self.query(q)
        k = self.key(k)
        v = self.value(v)

        # (B, T, C) => (B, NH, T, H)
        q = q.view(B, T_q, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(B, T_k, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(B, T_k, self.num_heads, self.head_dim).transpose(1, 2)

        if mask is not None:
            # Insert the singleton axes explicitly so the batch axis of the
            # mask can never be broadcast against the heads axis.
            if mask.dim() == 2 and mask.shape[0] == B:
                mask = mask[:, None, None, :]  # (B, T_k) => (B, 1, 1, T_k)
            elif mask.dim() == 3:
                mask = mask[:, None, :, :]  # (B, T_q, T_k) => (B, 1, T_q, T_k)

        output, _ = MultiHeadAttention.attention(q, k, v, mask)
        # (B, NH, T_q, H) => (B, T_q, C)
        output = output.transpose(1, 2).contiguous().view(B, T_q, C)
        return self.proj(output)

In [102]:
# Put the encoder input shape next to the encoder mask shape for comparison.
enc_input.shape, enc_mask.shape

(torch.Size([1, 256]), torch.Size([1, 256]))

In [104]:
# Two stand-alone projections, reused by the manual q/k walkthrough below.
a, b = (nn.Linear(embdim, embdim) for _ in range(2))

# Fresh embedding stack -> positional encoding -> one multi-head attention
# pass (8 heads, dec_mask as the mask); the cell output is the final shape.
x = InputEmbeddings(vocab_size, embdim)(enc_input)
x = PositionalEmbeddings(seq_len, embdim)(x)
x = MultiHeadAttention(embdim, 8)(x, x, x, dec_mask)
x.shape

torch.Size([1, 256, 512])

In [90]:
# Project the same activations with the two stand-alone linear layers.
q, k = a(x), b(x)

In [91]:
# Unpack the three dimensions of q for the reshape in the next cell.
B, T, C = q.size()

In [92]:
# Split the feature axis into 8 heads of width 64 and move the heads axis in
# front of the sequence axis, for both q and k.
q, k = (
    projected.view(B, T, 8, 64).transpose(1, 2) for projected in (q, k)
)

In [93]:
# Confirm both tensors have the same per-head layout after the reshape.
q.shape, k.shape

(torch.Size([1, 8, 256, 64]), torch.Size([1, 8, 256, 64]))

In [94]:
# Raw (unscaled, unmasked) attention scores: q times k with its last two axes swapped.
wei = torch.matmul(q, k.transpose(-2, -1))

In [95]:
# Compare the score tensor with the decoder mask: the scores carry one more
# axis (size 8, from the head split) than the mask does.
wei.shape, dec_mask.shape

(torch.Size([1, 8, 256, 256]), torch.Size([1, 256, 256]))

In [96]:
# First ten entries of the first mask row of the first batch element.
dec_mask[0, 0, :10]

tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)

In [105]:
# Re-display the shape of the raw score tensor.
wei.shape

torch.Size([1, 8, 256, 256])