In [38]:
import random

def generate_random_dates(n=10000):
    max_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    dataset = set()
    months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

    for _ in range(n):
        y = random.randint(1000, 2025)
        m = random.randint(1, 12)

        if (y % 4 == 0 and y % 100 != 0) or (y % 400 == 0):
            max_days[1] = 29
        else:
            max_days[1] = 28

        d = random.randint(1, max_days[m - 1])
        dataset.add((f"{y:04d}-{m:02d}-{d:02d}", f"{months[m - 1]} {d}, {y}"))

    return list(dataset)


In [4]:
import pandas as pd
dataset = generate_random_dates()
df = pd.DataFrame(dataset, columns=["x", "y"])
df

Unnamed: 0,x,y
0,1676-11-30,"November 30, 1676"
1,1162-02-03,"February 3, 1162"
2,1394-03-23,"March 23, 1394"
3,1616-07-06,"July 6, 1616"
4,1647-05-03,"May 3, 1647"
...,...,...
9849,1577-03-14,"March 14, 1577"
9850,1763-05-23,"May 23, 1763"
9851,1998-08-29,"August 29, 1998"
9852,1087-10-21,"October 21, 1087"


In [5]:
df["y"].apply(lambda r : len(r)).max()

np.int64(18)

In [39]:
class Tokenizer:
    def __init__(self):
        nums = [str(i) for i in range(10)]
        uppers = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
        lowers = [chr(i) for i in range(ord('a'), ord('z') + 1)]
        self.input_max = 10 + 2
        self.output_max = 18 + 2
        self.vocab = nums + uppers + lowers + ["-", ",", " ", "<sos>", "<eos>", "<pad>"]
        self.vocab_size = len(self.vocab)
        self.tokens_to_ids = {str(self.vocab[i]): i for i in range(len(self.vocab))}
        self.ids_to_tokens = {str(i): str(self.vocab[i]) for i in range(len(self.vocab))}

        self.pad_token_id = self.tokens_to_ids["<pad>"]

    def encode(self, sample):
        x, y = sample
        x = ["<sos>"] + list(x) + ["<eos>"]
        y = ["<sos>"] + list(y) + ["<eos>"]
        while len(x) != self.input_max: x.append("<pad>")
        while len(y) != self.output_max: y.append("<pad>")
        res_x = [self.tokens_to_ids[i] for i in x]
        res_y = [self.tokens_to_ids[i] for i in y]

        return (res_x, res_y)

    def decode(self, ids):
        res = [self.ids_to_tokens[str(i)] for i in ids]
        return "".join(res)

In [38]:
tokenizer = Tokenizer()
a = dataset[0]
encoded_x, encoded_y = tokenizer.encode(a)
tokenizer.decode(encoded_x), a[0], encoded_x, "-----", tokenizer.decode(encoded_y), a[1], encoded_y

('<sos>1676-11-30<eos>',
 '1676-11-30',
 [65, 1, 6, 7, 6, 62, 1, 1, 62, 3, 0, 66],
 '-----',
 '<sos>November 30, 1676<eos><pad>',
 'November 30, 1676',
 [65, 23, 50, 57, 40, 48, 37, 40, 53, 64, 3, 0, 63, 64, 1, 6, 7, 6, 66, 67])

In [23]:
tokenizer.vocab_size,tokenizer.tokens_to_ids["z"], a, encoded_x

(68,
 61,
 ('1676-11-30', 'November 30, 1676'),
 [65, 1, 6, 7, 6, 62, 1, 1, 62, 3, 0, 66])

In [27]:
import torch
from torch.utils.data import Dataset

class DateDataset(Dataset):
    def __init__(self,data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_encoded, y_encoded = self.tokenizer.encode(self.data[idx])
        
        x_tensor = torch.tensor(x_encoded, dtype=torch.long)
        y_tensor = torch.tensor(y_encoded, dtype=torch.long)

        return x_tensor, y_tensor
        

In [33]:
from torch.utils.data import DataLoader

date_dataset = DateDataset(dataset, tokenizer)
dataloader = DataLoader(date_dataset, batch_size=5, shuffle=True)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    print(x_batch)
    print(y_batch)
    break

tensor([[65,  1,  8,  3,  2, 62,  0,  5, 62,  1,  7, 66],
        [65,  1,  7,  8,  4, 62,  1,  0, 62,  0,  2, 66],
        [65,  1,  2,  7,  6, 62,  0,  5, 62,  0,  5, 66],
        [65,  1,  3,  0,  7, 62,  1,  0, 62,  2,  3, 66],
        [65,  1,  9,  2,  2, 62,  0,  5, 62,  1,  5, 66]])
tensor([[65, 22, 36, 60, 64,  1,  7, 63, 64,  1,  8,  3,  2, 66, 67, 67, 67, 67,
         67, 67],
        [65, 24, 38, 55, 50, 37, 40, 53, 64,  2, 63, 64,  1,  7,  8,  4, 66, 67,
         67, 67],
        [65, 22, 36, 60, 64,  5, 63, 64,  1,  2,  7,  6, 66, 67, 67, 67, 67, 67,
         67, 67],
        [65, 24, 38, 55, 50, 37, 40, 53, 64,  2,  3, 63, 64,  1,  3,  0,  7, 66,
         67, 67],
        [65, 22, 36, 60, 64,  1,  5, 63, 64,  1,  9,  2,  2, 66, 67, 67, 67, 67,
         67, 67]])


In [40]:
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, token_ids):
        return self.embedding(token_ids)

In [41]:
import torch
import torch.nn as nn
import math
class PositionalEncoder(nn.Module):
    def __init__(self, embed_dim, max_len, n = 10000):
        super().__init__()
        self.n = n
        pe = torch.zeros(max_len, embed_dim)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(self.n) / embed_dim))
                
        pe[:, 0::2] = torch.sin(positions * div_term)
        pe[:, 1::2] = torch.cos(positions * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        pe = self.pe[:seq_len, :]
        pe = pe.unsqueeze(0)
        pe = pe.expand(x.size(0), -1, -1)
        # print(x[0], pe[0])
        return x + pe

In [48]:
import math


class EmbeddingBlock(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len, pad_id, dropout=0.1):
        super().__init__()
        self.token_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_emb = PositionalEncoder(embed_dim, max_len)
        self.dropout = nn.Dropout(dropout)
        self.embed_dim = embed_dim

    def forward(self, token_ids):
        tok_emb = self.token_emb(token_ids) * math.sqrt(self.embed_dim)
        # print("te",tok_emb)
        x = self.pos_emb(tok_emb)
        # print("pe", x)
        return self.dropout(x)


In [115]:
toy = torch.tensor([[65, 0, 62, 65]])
src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)


In [116]:
src_emb = src_embedding_block(toy)


te tensor([[[-1.0169,  3.7117, -3.0031,  0.0366, -2.0964, -4.5249, -5.8190,
           0.1423,  6.3528, -5.9342, -2.7633, -6.5703, -1.8161, -1.1127,
          -3.3698,  1.6234],
         [ 2.5321, -0.1290, -0.9900, -3.7510,  4.6948,  0.8859, -1.5571,
          -2.4139, -8.1228,  1.2832, -0.7425,  1.6549, -2.8551,  2.4007,
           5.5863,  6.4642],
         [-1.2332, -1.0142, -4.4233,  1.6503,  0.4995,  2.1766, -4.7730,
          -1.4999,  3.7029,  0.3568,  4.4813, -2.4508, -3.3141,  8.0293,
          -2.5632, -2.9621],
         [-1.0169,  3.7117, -3.0031,  0.0366, -2.0964, -4.5249, -5.8190,
           0.1423,  6.3528, -5.9342, -2.7633, -6.5703, -1.8161, -1.1127,
          -3.3698,  1.6234]]], grad_fn=<MulBackward0>)
pe tensor([[[-1.0169,  4.7117, -3.0031,  1.0366, -2.0964, -3.5249, -5.8190,
           1.1423,  6.3528, -4.9342, -2.7633, -5.5703, -1.8161, -0.1127,
          -3.3698,  2.6234],
         [ 3.3736,  0.4113, -0.6790, -2.8006,  4.7947,  1.8809, -1.5255,
          -1.4144, -

In [None]:
embed_dim = 16
max_seq_len = 20

src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)
tgt_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader[:5]):
    src_emb = src_embedding_block(x_batch)
    tgt_emb = tgt_embedding_block(y_batch)
    print(src_emb.shape, tgt_emb.shape)
    break

# Attention

In [45]:
import pandas as pd
from torch.utils.data import DataLoader

dataset = generate_random_dates()
df = pd.DataFrame(dataset, columns=["x", "y"])

date_dataset = DateDataset(dataset, tokenizer)
dataloader = DataLoader(date_dataset, batch_size=5, shuffle=True)

In [105]:
embed_dim = 16
max_seq_len = 20

src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)
tgt_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    src_emb = src_embedding_block(x_batch)
    tgt_emb = tgt_embedding_block(y_batch)
    print(src_emb.shape, tgt_emb.shape)
    break

torch.Size([5, 12, 16]) torch.Size([5, 20, 16])


In [106]:
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim

        self.W_q = nn.Linear(embed_dim, embed_dim)
        self.W_k = nn.Linear(embed_dim, embed_dim)
        self.W_v = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        print(x[0])
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)
        print(Q[0])

        scores = Q @ K.transpose(-2, -1) # QK^T / sqrt(d_k)

        print(scores[0])
        scores = F.softmax(Q @ K.transpose(-2, -1) / (self.embed_dim ** 0.5)) # QK^T / sqrt(d_k)
        print(scores[0])
        
    

In [108]:
att_layer = SelfAttention(embed_dim)

x_batch, y_batch = next(iter(dataloader))
src_emb = src_embedding_block(x_batch)
tgt_emb = tgt_embedding_block(y_batch)
att_layer.forward(src_emb)


tensor([[-5.2186e-01, -3.6626e+00,  3.5067e+00,  0.0000e+00, -4.5138e+00,
          9.5425e+00,  4.1403e-01,  2.3382e+00,  8.5562e+00,  0.0000e+00,
         -0.0000e+00,  5.2408e+00,  0.0000e+00,  4.0205e+00, -5.1358e+00,
          3.5088e-01],
        [-2.9718e+00,  6.6420e+00, -2.3001e+00, -1.5172e+00,  3.7740e+00,
          1.8741e+00, -1.7106e+00, -5.2552e+00, -0.0000e+00,  1.5264e+00,
          1.6031e+00,  1.6955e+00, -4.1848e+00, -5.1916e+00,  8.9527e+00,
          7.7908e+00],
        [-3.4834e+00, -2.5078e+00,  1.8389e+00,  1.3833e+00,  2.9731e+00,
          0.0000e+00, -7.0196e+00, -5.3497e-01, -0.0000e+00,  1.2361e-01,
          3.9371e+00,  4.2703e-01, -0.0000e+00,  1.8078e-01,  0.0000e+00,
         -1.5949e+00],
        [ 3.4584e+00, -2.7426e+00,  4.8643e+00,  1.5634e+00,  5.3393e+00,
         -1.9466e+00, -5.1369e+00,  4.0558e+00,  1.2547e+00, -5.4791e-01,
          1.9442e+00, -4.4357e+00, -1.2278e+00, -3.5357e+00, -1.0690e+00,
         -1.8907e+00],
        [-5.3346e+00

  scores = F.softmax(Q @ K.transpose(-2, -1) / (self.embed_dim ** 0.5)) # QK^T / sqrt(d_k)


In [112]:
a = torch.tensor([[4.1520e-01, 9.2172e-03, 3.3761e-01, 7.7127e-02, 9.9264e-01, 2.0378e-03,
         5.1287e-01, 9.8661e-01, 3.1220e-01, 7.5032e-07, 4.1305e-07, 1.4878e-01],
        [1.1322e-02, 1.9390e-01, 1.9214e-05, 1.3463e-05, 1.7323e-09, 3.5755e-01,
         1.8696e-02, 1.7061e-07, 1.6762e-02, 8.2477e-03, 9.9951e-01, 4.2487e-02],
        [9.9716e-01, 7.5766e-03, 4.8501e-03, 9.0242e-03, 2.9205e-06, 1.6198e-04,
         5.2688e-04, 1.2721e-07, 2.1545e-05, 1.4122e-03, 9.9999e-01, 6.3445e-04],
        [1.9669e-01, 2.0403e-04, 4.9065e-03, 1.3530e-05, 1.1502e-02, 2.0921e-03,
         2.9850e-02, 1.5834e-01, 9.3157e-05, 7.0591e-04, 9.7097e-01, 6.2658e-02],
        [5.0417e-04, 4.0546e-01, 7.3268e-05, 1.4528e-03, 9.8513e-03, 1.7056e-01,
         1.4156e-01, 2.2744e-03, 3.1547e-03, 2.1505e-04, 9.8825e-01, 2.0432e-01],
        [5.1036e-02, 5.0841e-02, 5.5043e-04, 2.5777e-03, 2.4590e-02, 7.4514e-02,
         1.1620e-01, 2.2663e-02, 9.1964e-02, 7.1103e-05, 2.3189e-06, 1.2055e-03],
        [1.4029e-03, 7.0205e-03, 1.3617e-04, 9.4526e-04, 1.2528e-07, 5.4257e-01,
         8.2813e-02, 2.3066e-01, 2.2557e-01, 2.0160e-03, 9.9531e-01, 1.5398e-01],
        [2.0098e-03, 4.7694e-01, 2.1897e-03, 7.2547e-03, 1.0958e-05, 3.7231e-01,
         6.5205e-06, 7.9622e-06, 1.7197e-02, 2.3918e-04, 9.9970e-01, 2.5197e-08],
        [1.3795e-01, 2.2282e-02, 4.5764e-04, 7.5546e-03, 7.2052e-01, 4.9537e-02,
         5.8559e-01, 9.8905e-01, 2.8492e-01, 1.1512e-04, 7.4283e-06, 3.4472e-02],
        [9.8395e-03, 2.9270e-03, 1.7909e-04, 2.4486e-04, 7.7027e-10, 3.6916e-01,
         8.4229e-03, 4.3075e-06, 1.9216e-02, 1.4292e-01, 9.9998e-01, 2.9361e-01],
        [7.7648e-01, 1.2907e-02, 4.3021e-01, 9.4967e-02, 9.8623e-04, 5.8526e-01,
         2.3055e-03, 4.6868e-07, 6.0341e-02, 1.2286e-02, 2.0148e-03, 6.7010e-05],
        [2.1241e-02, 6.3361e-02, 5.5370e-01, 1.1581e-04, 1.7084e-03, 3.0007e-03,
         2.6809e-02, 2.9881e-08, 5.1576e-02, 7.4330e-02, 3.5176e-04, 5.7024e-04]])

tensor = torch.tensor([
    [-5.3763, -19.0716, -10.7304, 16.2014, 25.0778, -29.9709, 14.1424, 47.5504, -3.9680, -12.2618, -53.3239, -0.6131],
    [28.7324, -9.4999, -6.8611, -18.2616, -27.7456, -1.1073, -23.4516, -56.3575, -16.1110, -13.3637, 59.1139, 0.8883],
    [33.2356, -12.3105, -2.6957, -10.5091, -21.9326, -11.1053, -16.4659, -49.8313, -20.0018, -13.4830, 47.4658, -1.2959],
    [6.2911, -16.4541, -5.6001, -1.2533, 9.2225, 6.3236, 4.9841, 13.6948, 5.7757, -15.1106, 23.0964, 18.0278],
    [0.9326, -1.4808, -2.6664, -6.2392, -4.2478, 13.2531, -0.6133, -20.2357, -6.1465, -2.5461, 49.7770, 9.5165],
    [-2.5958, -18.1085, -16.3908, 0.9187, 3.2732, 5.0221, -10.9709, -2.9974, 7.5171, -15.0100, -29.5982, -29.8554],
    [16.0027, -18.9854, -7.9330, -8.5611, -2.2667, 11.0441, -6.8223, -4.7966, 8.5525, -19.0153, 32.5196, 7.5894],
    [11.6683, 10.5559, 3.9735, -4.1875, -11.2270, 12.0608, -6.8960, -41.2959, -0.8020, 8.1903, 44.3070, -19.1285],
    [-1.0612, -17.3225, -17.5310, 8.7398, 22.2118, 6.8019, 1.0091, 27.6504, 18.2031, -12.3405, -25.6904, -16.0338],
    [26.6676, -5.4987, -4.8026, -16.8313, -27.4139, 5.4588, -19.6466, -55.2616, -11.5514, -9.9637, 67.6490, 4.1979],
    [5.6592, 0.4981, 0.8265, 1.3560, -6.6801, 18.6764, -8.1335, -19.9559, 8.5394, -1.0782, -2.8182, -6.0825],
    [11.7751, 3.8447, 12.8273, 3.6451, -13.5613, -25.4471, -6.3279, -36.0715, -15.5435, 4.4850, -15.7695, -45.6263]
])


F.softmax(tensor / (16 ** 0.5))

  F.softmax(tensor / (16 ** 0.5))


tensor([[1.7853e-06, 5.8178e-08, 4.6816e-07, 3.9307e-04, 3.6159e-03, 3.8140e-09,
         2.3492e-04, 9.9575e-01, 2.5387e-06, 3.1924e-07, 1.1114e-11, 5.8730e-06],
        [5.0252e-04, 3.5492e-08, 6.8649e-08, 3.9705e-09, 3.7079e-10, 2.8930e-07,
         1.0848e-09, 2.9016e-13, 6.7974e-09, 1.3509e-08, 9.9950e-01, 4.7645e-07],
        [2.7718e-02, 3.1453e-07, 3.4799e-06, 4.9345e-07, 2.8376e-08, 4.2512e-07,
         1.1130e-07, 2.6539e-11, 4.5982e-08, 2.3462e-07, 9.7227e-01, 4.9380e-06],
        [1.0220e-02, 3.4668e-05, 5.2286e-04, 1.5500e-03, 2.1268e-02, 1.0303e-02,
         7.3714e-03, 6.5058e-02, 8.9845e-03, 4.8506e-05, 6.8244e-01, 1.9220e-01],
        [4.9741e-06, 2.7207e-06, 2.0228e-06, 8.2803e-07, 1.3623e-06, 1.0824e-04,
         3.3796e-06, 2.5026e-08, 8.4744e-07, 2.0846e-06, 9.9983e-01, 4.2530e-05],
        [3.5563e-02, 7.3574e-04, 1.1304e-03, 8.5620e-02, 1.5425e-01, 2.3884e-01,
         4.3821e-03, 3.2166e-02, 4.4565e-01, 1.5964e-03, 4.1615e-05, 3.9023e-05],
        [1.5693e-02, 2

In [None]:
# class EncoderLayer(nn.Module):
#     def __init__(self,embed_dim, num_heads, ff_hidden_dim, dropout=0.1):
#         super.__init__()
#         self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
#         self.norm1 = nn.LayerNorm(embed_dim)
#         self.dropout1 = nn.Dropout(dropout)

#         self.ff = nn.Sequential(
#             nn.Linear(embed_dim, ff_hidden_dim),
#             nn.ReLU(),
#             nn.Linear(ff_hidden_dim, embed_dim),
#         )
#         self.norm2 = nn.LayerNorm(embed_dim)
#         self.dropout2 = nn.Dropout(dropout)