In [1]:
import random

def generate_random_dates(n=10000):
    max_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    dataset = set()
    months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

    for _ in range(n):
        y = random.randint(1000, 2025)
        m = random.randint(1, 12)

        if (y % 4 == 0 and y % 100 != 0) or (y % 400 == 0):
            max_days[1] = 29
        else:
            max_days[1] = 28

        d = random.randint(1, max_days[m - 1])
        dataset.add((f"{y:04d}-{m:02d}-{d:02d}", f"{months[m - 1]} {d}, {y}"))

    return list(dataset)


In [2]:
import pandas as pd
dataset = generate_random_dates()
df = pd.DataFrame(dataset, columns=["x", "y"])
df

Unnamed: 0,x,y
0,1032-02-20,"February 20, 1032"
1,1262-10-21,"October 21, 1262"
2,1644-12-20,"December 20, 1644"
3,1216-08-31,"August 31, 1216"
4,1859-09-25,"September 25, 1859"
...,...,...
9875,1065-06-17,"June 17, 1065"
9876,1289-02-01,"February 1, 1289"
9877,1100-03-20,"March 20, 1100"
9878,1373-11-24,"November 24, 1373"


In [5]:
df["y"].apply(lambda r : len(r)).max()

np.int64(18)

In [3]:
class Tokenizer:
    def __init__(self):
        nums = [str(i) for i in range(10)]
        uppers = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
        lowers = [chr(i) for i in range(ord('a'), ord('z') + 1)]
        self.input_max = 10 + 2
        self.output_max = 18 + 2
        self.vocab = nums + uppers + lowers + ["-", ",", " ", "<sos>", "<eos>", "<pad>"]
        self.vocab_size = len(self.vocab)
        self.tokens_to_ids = {str(self.vocab[i]): i for i in range(len(self.vocab))}
        self.ids_to_tokens = {str(i): str(self.vocab[i]) for i in range(len(self.vocab))}

        self.pad_token_id = self.tokens_to_ids["<pad>"]

    def encode(self, sample):
        x, y = sample
        x = ["<sos>"] + list(x) + ["<eos>"]
        y = ["<sos>"] + list(y) + ["<eos>"]
        while len(x) != self.input_max: x.append("<pad>")
        while len(y) != self.output_max: y.append("<pad>")
        res_x = [self.tokens_to_ids[i] for i in x]
        res_y = [self.tokens_to_ids[i] for i in y]

        return (res_x, res_y)

    def decode(self, ids):
        res = [self.ids_to_tokens[str(i)] for i in ids]
        return "".join(res)

In [4]:
tokenizer = Tokenizer()
a = dataset[0]
encoded_x, encoded_y = tokenizer.encode(a)
tokenizer.decode(encoded_x), a[0], encoded_x, "-----", tokenizer.decode(encoded_y), a[1], encoded_y

('<sos>1032-02-20<eos>',
 '1032-02-20',
 [65, 1, 0, 3, 2, 62, 0, 2, 62, 2, 0, 66],
 '-----',
 '<sos>February 20, 1032<eos><pad>',
 'February 20, 1032',
 [65, 15, 40, 37, 53, 56, 36, 53, 60, 64, 2, 0, 63, 64, 1, 0, 3, 2, 66, 67])

In [5]:
tokenizer.vocab_size,tokenizer.tokens_to_ids["z"], a, encoded_x

(68,
 61,
 ('1032-02-20', 'February 20, 1032'),
 [65, 1, 0, 3, 2, 62, 0, 2, 62, 2, 0, 66])

In [6]:
import torch
from torch.utils.data import Dataset

class DateDataset(Dataset):
    def __init__(self,data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_encoded, y_encoded = self.tokenizer.encode(self.data[idx])
        
        x_tensor = torch.tensor(x_encoded, dtype=torch.long)
        y_tensor = torch.tensor(y_encoded, dtype=torch.long)

        return x_tensor, y_tensor
        

In [7]:
from torch.utils.data import DataLoader

date_dataset = DateDataset(dataset, tokenizer)
dataloader = DataLoader(date_dataset, batch_size=5, shuffle=True)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    print(x_batch)
    print(y_batch)
    break

tensor([[65,  2,  0,  0,  4, 62,  0,  4, 62,  1,  1, 66],
        [65,  1,  9,  5,  0, 62,  0,  7, 62,  0,  4, 66],
        [65,  1,  6,  7,  3, 62,  1,  2, 62,  0,  9, 66],
        [65,  1,  4,  3,  3, 62,  1,  2, 62,  1,  7, 66],
        [65,  1,  2,  9,  5, 62,  0,  3, 62,  2,  5, 66]])
tensor([[65, 10, 51, 53, 44, 47, 64,  1,  1, 63, 64,  2,  0,  0,  4, 66, 67, 67,
         67, 67],
        [65, 19, 56, 47, 60, 64,  4, 63, 64,  1,  9,  5,  0, 66, 67, 67, 67, 67,
         67, 67],
        [65, 13, 40, 38, 40, 48, 37, 40, 53, 64,  9, 63, 64,  1,  6,  7,  3, 66,
         67, 67],
        [65, 13, 40, 38, 40, 48, 37, 40, 53, 64,  1,  7, 63, 64,  1,  4,  3,  3,
         66, 67],
        [65, 22, 36, 53, 38, 43, 64,  2,  5, 63, 64,  1,  2,  9,  5, 66, 67, 67,
         67, 67]])


In [8]:
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, token_ids):
        return self.embedding(token_ids)

In [9]:
import torch
import torch.nn as nn
import math
class PositionalEncoder(nn.Module):
    def __init__(self, embed_dim, max_len, n = 10000):
        super().__init__()
        self.n = n
        pe = torch.zeros(max_len, embed_dim)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(self.n) / embed_dim))
                
        pe[:, 0::2] = torch.sin(positions * div_term)
        pe[:, 1::2] = torch.cos(positions * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        pe = self.pe[:seq_len, :]
        pe = pe.unsqueeze(0)
        pe = pe.expand(x.size(0), -1, -1)
        # print(x[0], pe[0])
        return x + pe

In [10]:
import math


class EmbeddingBlock(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len, pad_id, dropout=0.1):
        super().__init__()
        self.token_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_emb = PositionalEncoder(embed_dim, max_len)
        self.dropout = nn.Dropout(dropout)
        self.embed_dim = embed_dim

    def forward(self, token_ids):
        tok_emb = self.token_emb(token_ids) * math.sqrt(self.embed_dim)
        # print("te",tok_emb)
        x = self.pos_emb(tok_emb)
        # print("pe", x)
        return self.dropout(x)


In [115]:
toy = torch.tensor([[65, 0, 62, 65]])
src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)


In [116]:
src_emb = src_embedding_block(toy)


te tensor([[[-1.0169,  3.7117, -3.0031,  0.0366, -2.0964, -4.5249, -5.8190,
           0.1423,  6.3528, -5.9342, -2.7633, -6.5703, -1.8161, -1.1127,
          -3.3698,  1.6234],
         [ 2.5321, -0.1290, -0.9900, -3.7510,  4.6948,  0.8859, -1.5571,
          -2.4139, -8.1228,  1.2832, -0.7425,  1.6549, -2.8551,  2.4007,
           5.5863,  6.4642],
         [-1.2332, -1.0142, -4.4233,  1.6503,  0.4995,  2.1766, -4.7730,
          -1.4999,  3.7029,  0.3568,  4.4813, -2.4508, -3.3141,  8.0293,
          -2.5632, -2.9621],
         [-1.0169,  3.7117, -3.0031,  0.0366, -2.0964, -4.5249, -5.8190,
           0.1423,  6.3528, -5.9342, -2.7633, -6.5703, -1.8161, -1.1127,
          -3.3698,  1.6234]]], grad_fn=<MulBackward0>)
pe tensor([[[-1.0169,  4.7117, -3.0031,  1.0366, -2.0964, -3.5249, -5.8190,
           1.1423,  6.3528, -4.9342, -2.7633, -5.5703, -1.8161, -0.1127,
          -3.3698,  2.6234],
         [ 3.3736,  0.4113, -0.6790, -2.8006,  4.7947,  1.8809, -1.5255,
          -1.4144, -

In [None]:
embed_dim = 16
max_seq_len = 20

src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)
tgt_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader[:5]):
    src_emb = src_embedding_block(x_batch)
    tgt_emb = tgt_embedding_block(y_batch)
    print(src_emb.shape, tgt_emb.shape)
    break

# Attention

In [11]:
import pandas as pd
from torch.utils.data import DataLoader

dataset = generate_random_dates()
df = pd.DataFrame(dataset, columns=["x", "y"])

date_dataset = DateDataset(dataset, tokenizer)
dataloader = DataLoader(date_dataset, batch_size=5, shuffle=True)

In [12]:
embed_dim = 16
max_seq_len = 20

src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)
tgt_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    src_emb = src_embedding_block(x_batch)
    tgt_emb = tgt_embedding_block(y_batch)
    print(src_emb.shape, tgt_emb.shape)
    break

torch.Size([5, 12, 16]) torch.Size([5, 20, 16])


In [13]:
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim

        self.W_q = nn.Linear(embed_dim, embed_dim)
        self.W_k = nn.Linear(embed_dim, embed_dim)
        self.W_v = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        # print(x[0])
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)
        print(Q[0])
        print(K.transpose(-2, -1)[0])
        

        scores = Q @ K.transpose(-2, -1) # QK^T / sqrt(d_k)

        # print(scores[0])
        scores = F.softmax(Q @ K.transpose(-2, -1) / (self.embed_dim ** 0.5)) # QK^T / sqrt(d_k)
        print(scores[0])
        
    

In [14]:
att_layer = SelfAttention(embed_dim)

x_batch, y_batch = next(iter(dataloader))
src_emb = src_embedding_block(x_batch)
tgt_emb = tgt_embedding_block(y_batch)
att_layer.forward(src_emb)


tensor([[ 0.6077, -1.1013,  2.7406, -0.7540,  0.3023,  2.9211,  2.6975,  1.8287,
         -0.9295, -3.0172,  3.9800, -0.2289, -1.1133,  3.7010, -1.7028, -4.0110],
        [ 3.2174, -1.6857,  1.7360,  0.3697,  0.4990,  1.3540,  1.5095, -3.2122,
          1.8186,  0.1438,  0.2113,  0.7156,  1.5320,  3.8769,  0.6799, -3.2036],
        [ 1.1522,  0.8753, -0.4475,  2.6429,  1.7090, -0.9089,  1.1222,  1.4286,
          0.2103,  3.7216, -1.0529, -0.7472, -5.0433, -1.0415,  5.4696,  3.8084],
        [-2.2415,  0.5077, -5.3882,  2.1771, -2.9984, -0.9999, -1.8916,  0.7201,
         -1.6512,  2.2896, -1.5486,  0.1555,  2.3028, -2.5760, -3.7443,  0.6497],
        [ 0.1556,  0.3669, -1.1607,  3.9464,  0.2232,  0.4521,  0.1981,  1.3263,
         -2.4108,  3.6452,  0.7514, -0.3106, -2.8094, -1.3884,  2.5923,  1.2330],
        [ 0.6471, -2.3877, -3.4905, -4.1567,  0.8856,  2.9045,  1.3073, -4.2904,
         -2.8911,  0.8365, -2.4124, -0.9344, -5.5669, -0.1383, -5.1528,  3.6730],
        [ 0.7403, -1.9

  scores = F.softmax(Q @ K.transpose(-2, -1) / (self.embed_dim ** 0.5)) # QK^T / sqrt(d_k)


In [15]:
tensor = torch.tensor([
    [-5.3763, -19.0716, -10.7304, 16.2014, 25.0778, -29.9709, 14.1424, 47.5504, -3.9680, -12.2618, -53.3239, -0.6131],
    [28.7324, -9.4999, -6.8611, -18.2616, -27.7456, -1.1073, -23.4516, -56.3575, -16.1110, -13.3637, 59.1139, 0.8883],
    [33.2356, -12.3105, -2.6957, -10.5091, -21.9326, -11.1053, -16.4659, -49.8313, -20.0018, -13.4830, 47.4658, -1.2959],
    [6.2911, -16.4541, -5.6001, -1.2533, 9.2225, 6.3236, 4.9841, 13.6948, 5.7757, -15.1106, 23.0964, 18.0278],
    [0.9326, -1.4808, -2.6664, -6.2392, -4.2478, 13.2531, -0.6133, -20.2357, -6.1465, -2.5461, 49.7770, 9.5165],
    [-2.5958, -18.1085, -16.3908, 0.9187, 3.2732, 5.0221, -10.9709, -2.9974, 7.5171, -15.0100, -29.5982, -29.8554],
    [16.0027, -18.9854, -7.9330, -8.5611, -2.2667, 11.0441, -6.8223, -4.7966, 8.5525, -19.0153, 32.5196, 7.5894],
    [11.6683, 10.5559, 3.9735, -4.1875, -11.2270, 12.0608, -6.8960, -41.2959, -0.8020, 8.1903, 44.3070, -19.1285],
    [-1.0612, -17.3225, -17.5310, 8.7398, 22.2118, 6.8019, 1.0091, 27.6504, 18.2031, -12.3405, -25.6904, -16.0338],
    [26.6676, -5.4987, -4.8026, -16.8313, -27.4139, 5.4588, -19.6466, -55.2616, -11.5514, -9.9637, 67.6490, 4.1979],
    [5.6592, 0.4981, 0.8265, 1.3560, -6.6801, 18.6764, -8.1335, -19.9559, 8.5394, -1.0782, -2.8182, -6.0825],
    [11.7751, 3.8447, 12.8273, 3.6451, -13.5613, -25.4471, -6.3279, -36.0715, -15.5435, 4.4850, -15.7695, -45.6263]
])

V = torch.tensor([[ 0.0523, -0.3169,  1.0038,  3.8029,  2.8152,  0.5570, -2.9845,  1.6947,
         -2.7567, -2.1093,  2.7524,  0.4722,  1.6745,  1.0933,  3.1662,  3.1196],
        [ 0.3888, -0.2766,  3.2932,  1.1321,  0.7103,  2.4821, -3.0863, -2.4968,
         -4.2470,  0.0247, -1.5927,  1.8743, -1.3891,  4.1220, -0.6374, -0.6978],
        [ 0.1997, -0.4959,  3.5477,  0.9209,  0.8466,  2.6905, -2.9755, -2.7062,
         -4.1371,  0.1154, -1.9207,  1.8367, -1.2775,  4.0197, -0.4618, -0.7433],
        [ 4.3301, -1.1908, -1.9410,  0.7849, -0.0856,  2.9224, -0.0149, -1.3127,
         -3.1154, -3.8022,  0.1690,  4.6527,  3.1757,  0.1667,  1.6034, -1.5607],
        [-1.2611, -3.7500,  4.4421,  0.0624,  0.6921,  2.0781,  3.5267,  1.5235,
         -3.4005, -3.0553, -0.2118,  1.6587, -1.7829,  0.5463,  2.3267,  4.3869],
        [-2.1013,  1.1058, -0.4530, -4.4448, -0.0611, -2.5772, -3.5890, -3.7889,
         -1.7608,  2.1861,  0.4290, -1.4026,  2.0943, -3.5062,  0.1753, -2.6732],
        [ 0.6525, -0.6348,  3.5759,  1.3017, -0.1698,  2.7247, -2.7032, -2.0993,
         -4.3713, -0.3325, -2.0410,  2.5782, -1.6670,  4.3540, -0.9295, -1.0574],
        [-3.9596, -0.5530,  2.6572,  0.2565, -2.0472, -1.4750, -2.1472, -1.0649,
         -3.5263,  1.6453, -2.2381, -0.0380, -1.5689,  0.1930,  2.5177,  0.4268],
        [-1.9230,  1.6798, -2.2871, -6.8718,  3.1228, -3.3532, -5.0108, -5.6930,
          0.4687,  1.3645,  2.1050, -2.3289,  2.7139, -3.6273, -0.4498, -1.8997],
        [-0.2357, -1.2845,  2.3008, -0.4310, -0.5731,  2.0083, -2.0417, -0.6014,
         -2.4623, -2.5855, -1.1960,  2.5856, -0.0853,  2.5082,  0.3033, -0.8056],
        [ 5.9674, -3.6949, -3.9467, -2.4978,  1.6004,  1.8320, -0.7655, -2.8905,
          2.5294, -1.2686, -0.9696,  0.9364,  6.0051, -4.1793,  3.2551, -1.7872],
        [ 1.8519, -4.3785, -5.7870, -2.8644, -8.4280, -2.5224,  3.6461,  0.2399,
         -2.5877, -0.3564,  4.7782, -3.0841, -0.6588, -3.3774, -0.0579,  0.5228]])

qk = F.softmax(tensor / (16 ** 0.5))
att_output = qk @ V

mlp =nn.Sequential(nn.Linear(embed_dim, 16), nn.ReLU(),nn.Linear(16,embed_dim))

mlp(att_output).shape

  qk = F.softmax(tensor / (16 ** 0.5))


torch.Size([12, 16])

In [None]:
# class EncoderLayer(nn.Module):
#     def __init__(self,embed_dim, num_heads, ff_hidden_dim, dropout=0.1):
#         super.__init__()
#         self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
#         self.norm1 = nn.LayerNorm(embed_dim)
#         self.dropout1 = nn.Dropout(dropout)

#         self.ff = nn.Sequential(
#             nn.Linear(embed_dim, ff_hidden_dim),
#             nn.ReLU(),
#             nn.Linear(ff_hidden_dim, embed_dim),
#         )
#         self.norm2 = nn.LayerNorm(embed_dim)
#         self.dropout2 = nn.Dropout(dropout)