In [2]:
import sys

sys.path.append("../src")
from tokenizer import get_tokenizer, get_data
from torch.utils.data import Dataset, DataLoader
from CustomDataLoader import CustomDataset


dataset_path = (
    "/Users/shusanketbasyal/.cache/kagglehub/datasets/jigarpanjiyar/english-to-manipuri-dataset/versions/1"
    + "//english-nepali.xlsx"
)

engtokenizer, neptokenizer = get_tokenizer(dataset_path)

df_train, df_test = get_data(dataset_path, split=True)

In [3]:
engvocabsize = engtokenizer.get_vocab_size()
nepvocabsize = neptokenizer.get_vocab_size()
engvocabsize, nepvocabsize

(30000, 30000)

In [4]:
# from tokenizer import get_tokenizer, get_data
# from torch.utils.data import Dataset, DataLoader
# from CustomDataLoader import CustomDataset


# dataset_path =  "/Users/shusanketbasyal/.cache/kagglehub/datasets/jigarpanjiyar/english-to-manipuri-dataset/versions/1"+"//english-nepali.xlsx"

# engtokenizer, neptokenizer  = get_tokenizer(dataset_path)

# df_train, df_test = get_data(dataset_path, split=True)

# df_train_dataset = CustomDataset(df_train, engtokenizer, neptokenizer, "eng", "nep", 256)
# df_test_dataset = CustomDataset(df_test, engtokenizer, neptokenizer, "eng", "nep", 256)


# df_train_dataloader = DataLoader(df_train_dataset, batch_size=2, shuffle=True)

In [5]:
df_train_dataset = CustomDataset(
    df_train, engtokenizer, neptokenizer, "eng", "nep", 256
)
df_test_dataset = CustomDataset(df_test, engtokenizer, neptokenizer, "eng", "nep", 256)

In [6]:
df_train_dataloader = DataLoader(df_train_dataset, batch_size=1, shuffle=True)

In [7]:
for x in df_train_dataloader:
    enc_input = x["encoder_input"]
    dec_input = x["decoder_input"]
    enc_mask = x["encoder_mask"]
    dec_mask = x["decoder_mask"]

    print(enc_input.shape)
    print(dec_input.shape)
    print(x["label"].shape)
    print(enc_mask.shape)
    print(dec_mask.shape)

    break

torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 256])
torch.Size([1, 256, 256])


In [8]:
print("Done with data loader working on Model Implementation")

Done with data loader working on Model Implementation


In [9]:
# kinda global variable
seq_len = 256
embdim = 512
vocab_size = engvocabsize

In [11]:
import torch
import torch.nn as nn

In [11]:
class InputEmbeddings(nn.Module):
    def __init__(self, vocab_size, embdim):

        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embdim)

    def forward(self, x):
        return self.embeddings(x)

In [36]:
class PositionalEmbeddings(nn.Module):
    def __init__(self, seq_len, embdim):
        super().__init__()

        # self.pe is the lookup matrix where each row represents a position
        # shape  = (SEQ_LEN, EMBDIM)
        self.pe = torch.zeros(seq_len, embdim, dtype=torch.float32)
        # positions is just the sequence of the position from 0,seq_len, shape => (SEQ_LEN, 1)
        positions = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
        # SHAPE(256)
        emb_skip_dim = torch.arange(0, embdim, step=2, dtype=torch.float32)
        # (seqlen, 1) / (256) => (seqlen, 256)
        z = positions / (10000 ** (emb_skip_dim / embdim))
        # even 256
        self.pe[:, 0::2] = torch.sin(z)
        # odd 256
        self.pe[:, 1::2] = torch.cos(z)

        # shape of self.pe => (SEQ_LEN, EMBDIM)
        # NEED TO ADD BATCH DIM
        # self.pe = self.pe.unsqueeze(0)
        self.pe = nn.Parameter(self.pe, requires_grad=False)

    def forward(self, x):
        B, T, C = x.shape
        print(x.shape, self.pe.shape)
        # for training all will have T but when generating not all will have seq of T
        x = x + self.pe[:T, :]
        # return x

In [37]:
pe = torch.zeros(seq_len, embdim, dtype=torch.float32)
# positions is just the sequence of the position from 0,seq_len, shape => (SEQ_LEN, 1)
positions = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
# SHAPE(256)
emb_skip_dim = torch.arange(0, embdim, step=2, dtype=torch.float32)
# (seqlen, 1) / (256) => (seqlen, 256)
z = positions / (10000 ** (emb_skip_dim / embdim))
# even 256
pe[:, 0::2] = torch.sin(z)
# odd 256
pe[:, 1::2] = torch.cos(z)

In [38]:
model = InputEmbeddings(vocab_size, embdim)
out = model(enc_input)

In [39]:
model = PositionalEmbeddings(seq_len, embdim)
out = model(out)

torch.Size([1, 256, 512]) torch.Size([256, 512])


In [54]:
a = torch.randint(2, 10, (2, 3, 4))
b = torch.randint(2, 10, (5, 4))

In [55]:
a, a.shape
B, T, C = a.shape

In [56]:
b

tensor([[5, 3, 9, 8],
        [3, 5, 5, 6],
        [3, 7, 6, 9],
        [4, 2, 9, 2],
        [9, 7, 3, 3]])

In [57]:
a

tensor([[[4, 2, 9, 9],
         [3, 6, 5, 6],
         [5, 4, 3, 8]],

        [[5, 9, 5, 9],
         [2, 9, 4, 2],
         [6, 5, 8, 4]]])

In [58]:
a + b[:T, :]

tensor([[[ 9,  5, 18, 17],
         [ 6, 11, 10, 12],
         [ 8, 11,  9, 17]],

        [[10, 12, 14, 17],
         [ 5, 14,  9,  8],
         [ 9, 12, 14, 13]]])

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embdim, num_heads):
        super().__init__()
        self.query = nn.Linear(embdim, embdim)
        self.key = nn.Linear(embdim, embdim)
        self.value = nn.Linear(embdim, embdim)
        # after computation, this is used for projection
        self.proj = nn.Linear(embdim, embdim)
        self.num_heads = num_heads
        assert embdim % self.num_heads == 0, "{embdim} is not divisible by {num_heads}"
        self.head_dim = embdim // num_heads

    @staticmethod
    def attention(q, k, v, mask):
        head_dim = q.shape[-1]

        attention_scores = (q @ k.transpose(-2, -1)) / (head_dim) ** (1 / 2)
        if mask is not None:
            attention_scores.masked_fill(mask == 0, float("-inf"))

        attention_scores = attention_scores.softmax(dim=-1)

        return (attention_scores @ v), attention_scores

    def forward(self, q, k, v, mask):
        B, T, C = x.shape
        # (B,T,C) => (B,T,C)
        q = self.query(q)
        k = self.key(k)
        v = self.key(v)

        # reshape
        # (B,T,C) => (B,NH, T, H)
        q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

        output, attentionscores = MultiHeadAttention.attention(q, k, v, mask)
        # shape of output => (B,NH, SEQ, HDIM)
        # TO => (B, SEQ, EMBDIM)
        output = output.transpose(1, 2).contiguous().view(B, T, C)
        return self.proj(output)

In [None]:
class FeedForward(nn.Module):
    def __init__(self, embdim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(embdim, 4 * embdim), nn.ReLU(), nn.Linear(4 * embdim, embdim)
        )

    def forward(self, x):
        return self.model(x)

In [None]:
class LayerNormalization(nn.Module):

    def __init__(self, embdim):
        super().__init__()
        self.embdim = embdim
        self.alpha = torch.ones(embdim)
        self.beta = torch.zeros(embdim)

    def forward(self, x):
        xmean = x.mean(dim=-1, keepdim=True)
        xstd = x.std(dim=-1, keepdim=True)
        return self.alpha * ((x - xmean) / xstd + 1e-6) + self.beta

In [None]:
class EncoderBlock(nn.Module):
    def __init__(self, embdim, num_heads):
        super().__init__()
        self.attn = MultiHeadAttention(embdim, num_heads)
        self.feedfwd = FeedForward(embdim)
        self.layernorm1 = LayerNormalization(embdim)
        self.layernorm2 = LayerNormalization(embdim)

    def forward(self, x, mask):
        x = x + self.attn(x, x, x, mask)
        x = self.layernorm1(x)
        x = x + self.feedfwd(x)
        return self.layernorm2(x)

In [None]:
class DecoderBlock(nn.Module):
    def __init__(self, embdim, num_heads):
        super().__init__()
        self.attn = MultiHeadAttention(embdim, num_heads)
        self.cross_attn = MultiHeadAttention(embdim, num_heads)
        self.feedfwd = FeedForward(embdim)
        self.layernorm1 = LayerNormalization(embdim)
        self.layernorm2 = LayerNormalization(embdim)
        self.layernorm3 = LayerNormalization(embdim)

    def forward(self, encoder_output, x, src_mask, tgt_mask):
        x  = x + self.attn(x,x,x,tgt_mask)
        x  = self.layernorm1(x)
        x = x + self.attn(x,encoder_output, encoder_output, src_mask)
        x = self.layernorm2(x)
        x = x + self.feedfwd(x)
        x = self.layernorm3(x)
        return x 



In [None]:
class Encoder(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return x

In [None]:
class Decoder(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return x

In [None]:
class FinalProjectionLayer(nn.Module):
    def __init__(self, embdim, vocab_size):
        super().__init__()
        self.linear = nn.Linear(embdim, vocab_size)

    def forward(self, x):
        return self.linear(x)

In [1]:
class Transformer(nn.Module):

    def __init__(
        self, encoder, decoder, src_emb, tgt_emb, src_pos, tgt_pos, finalprojectionlayer
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_emb = src_emb
        self.src_pos = src_pos
        self.tgt_emb = tgt_emb
        self.tgt_pos = tgt_pos
        self.finalprojectionlayer = finalprojectionlayer

    def encoder_func(self, x, src_mask):
        x = self.src_emb(x)
        x = self.src_pos(x)
        return self.encoder(x, src_mask)

    def decoder_func(self, encoder_output, x, src_mask, tgt_mask):
        x = self.tgt_emb(x)
        x = self.tgt_pos(x)
        return self.decoder(encoder_output, x, src_mask, tgt_mask)

    def projection(self, x):
        return self.finalprojectionlayer(x)

NameError: name 'nn' is not defined

In [None]:
def build_transformer(
    src_vocab_size,
    src_seq_len,
    tgt_vocab_size,
    tgt_seq_len,
    embdim,
    encoder_depth,
    decoder_depth,
    num_heads,
):
    src_emb = InputEmbeddings(src_vocab_size, embdim)
    src_pos = PositionalEmbeddings(src_seq_len, embdim)

    tgt_emb = InputEmbeddings(tgt_vocab_size, embdim)
    tgt_pos = PositionalEmbeddings(tgt_seq_len, embdim)

    encoder_blocks = []
    for _ in range(encoder_depth):
        encoder_block = EncoderBlock(embdim, num_heads)
        encoder_blocks.append(encoder_block)

    decoder_blocks = []

    for _ in range(decoder_depth):
        decoder_block = DecoderBlock(embdim, num_heads)
        decoder_blocks.append(decoder_block)

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    finalprojectionlayer = FinalProjectionLayer(embdim, tgt_vocab_size)

    transformer = Transformer(
        encoder, decoder, src_emb, tgt_emb, src_pos, tgt_pos, finalprojectionlayer
    )

    return transformer

In [None]:
build_transformer(300, 300, 3000, 300, 300, 2, 1, 10)

In [1]:
enc_input

NameError: name 'enc_input' is not defined

In [8]:
enc_input.shape, enc_mask.shape

(torch.Size([1, 256]), torch.Size([1, 256]))

In [35]:
a = torch.randn(1, 8, 256, 256)

In [36]:
a.masked_fill_(enc_mask == 0, float("-inf"))

tensor([[[[ 8.1662e-02,  2.5967e-01,  2.8760e+00,  ...,        -inf,
                  -inf,        -inf],
          [-1.1152e+00,  1.0214e+00,  2.1114e-01,  ...,        -inf,
                  -inf,        -inf],
          [ 1.4577e+00, -4.3899e-01,  4.3423e-01,  ...,        -inf,
                  -inf,        -inf],
          ...,
          [-1.4072e-01,  4.2788e-01,  1.1357e+00,  ...,        -inf,
                  -inf,        -inf],
          [-1.1005e+00,  4.6406e-01,  8.1126e-01,  ...,        -inf,
                  -inf,        -inf],
          [ 1.0194e+00,  3.5561e-01,  7.4649e-01,  ...,        -inf,
                  -inf,        -inf]],

         [[-5.2770e-01, -7.0048e-01, -1.3999e+00,  ...,        -inf,
                  -inf,        -inf],
          [-1.0384e+00,  1.5051e-01, -1.1157e-01,  ...,        -inf,
                  -inf,        -inf],
          [ 2.5812e+00, -1.1150e+00,  1.2218e+00,  ...,        -inf,
                  -inf,        -inf],
          ...,
     

In [37]:
a[0][0][0][200:]

tensor([-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf])

In [31]:
enc_mask[0][100:]

tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)