In [3]:
import random

def generate_random_dates(n=10000):
    max_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    dataset = set()
    months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

    for _ in range(n):
        y = random.randint(1000, 2025)
        m = random.randint(1, 12)

        if (y % 4 == 0 and y % 100 != 0) or (y % 400 == 0):
            max_days[1] = 29
        else:
            max_days[1] = 28

        d = random.randint(1, max_days[m - 1])
        dataset.add((f"{y:04d}-{m:02d}-{d:02d}", f"{months[m - 1]} {d}, {y}"))

    return list(dataset)


In [4]:
import pandas as pd
dataset = generate_random_dates()
df = pd.DataFrame(dataset, columns=["x", "y"])
df

Unnamed: 0,x,y
0,1676-11-30,"November 30, 1676"
1,1162-02-03,"February 3, 1162"
2,1394-03-23,"March 23, 1394"
3,1616-07-06,"July 6, 1616"
4,1647-05-03,"May 3, 1647"
...,...,...
9849,1577-03-14,"March 14, 1577"
9850,1763-05-23,"May 23, 1763"
9851,1998-08-29,"August 29, 1998"
9852,1087-10-21,"October 21, 1087"


In [5]:
df["y"].apply(lambda r : len(r)).max()

np.int64(18)

In [9]:
class Tokenizer:
    def __init__(self):
        nums = [str(i) for i in range(10)]
        uppers = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
        lowers = [chr(i) for i in range(ord('a'), ord('z') + 1)]
        self.input_max = 10 + 2
        self.output_max = 18 + 2
        self.vocab = nums + uppers + lowers + ["-", ",", " ", "<sos>", "<eos>", "<pad>"]
        self.vocab_size = len(self.vocab)
        self.tokens_to_ids = {str(self.vocab[i]): i for i in range(len(self.vocab))}
        self.ids_to_tokens = {str(i): str(self.vocab[i]) for i in range(len(self.vocab))}

        self.pad_token_id = self.tokens_to_ids["<pad>"]

    def encode(self, sample):
        x, y = sample
        x = ["<sos>"] + list(x) + ["<eos>"]
        y = ["<sos>"] + list(y) + ["<eos>"]
        while len(x) != self.input_max: x.append("<pad>")
        while len(y) != self.output_max: y.append("<pad>")
        res_x = [self.tokens_to_ids[i] for i in x]
        res_y = [self.tokens_to_ids[i] for i in y]

        return (res_x, res_y)

    def decode(self, ids):
        res = [self.ids_to_tokens[str(i)] for i in ids]
        return "".join(res)

In [38]:
tokenizer = Tokenizer()
a = dataset[0]
encoded_x, encoded_y = tokenizer.encode(a)
tokenizer.decode(encoded_x), a[0], encoded_x, "-----", tokenizer.decode(encoded_y), a[1], encoded_y

('<sos>1676-11-30<eos>',
 '1676-11-30',
 [65, 1, 6, 7, 6, 62, 1, 1, 62, 3, 0, 66],
 '-----',
 '<sos>November 30, 1676<eos><pad>',
 'November 30, 1676',
 [65, 23, 50, 57, 40, 48, 37, 40, 53, 64, 3, 0, 63, 64, 1, 6, 7, 6, 66, 67])

In [23]:
tokenizer.vocab_size,tokenizer.tokens_to_ids["z"], a, encoded_x

(68,
 61,
 ('1676-11-30', 'November 30, 1676'),
 [65, 1, 6, 7, 6, 62, 1, 1, 62, 3, 0, 66])

In [12]:
import torch
from torch.utils.data import Dataset

class DateDataset(Dataset):
    def __init__(self,data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_encoded, y_encoded = self.tokenizer.encode(self.data[idx])
        
        x_tensor = torch.tensor(x_encoded, dtype=torch.long)
        y_tensor = torch.tensor(y_encoded, dtype=torch.long)

        return x_tensor, y_tensor
        

In [39]:
from torch.utils.data import DataLoader

date_dataset = DateDataset(dataset, tokenizer)
dataloader = DataLoader(date_dataset, batch_size=5, shuffle=True)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    print(x_batch)
    print(y_batch)
    break

tensor([[65,  1,  8,  4,  9, 62,  0,  2, 62,  2,  0, 66],
        [65,  1,  5,  8,  5, 62,  0,  2, 62,  0,  2, 66],
        [65,  1,  8,  0,  7, 62,  0,  9, 62,  2,  2, 66],
        [65,  1,  0,  9,  3, 62,  0,  9, 62,  0,  6, 66],
        [65,  1,  3,  1,  9, 62,  0,  2, 62,  1,  2, 66]])
tensor([[65, 15, 40, 37, 53, 56, 36, 53, 60, 64,  2,  0, 63, 64,  1,  8,  4,  9,
         66, 67],
        [65, 15, 40, 37, 53, 56, 36, 53, 60, 64,  2, 63, 64,  1,  5,  8,  5, 66,
         67, 67],
        [65, 28, 40, 51, 55, 40, 48, 37, 40, 53, 64,  2,  2, 63, 64,  1,  8,  0,
          7, 66],
        [65, 28, 40, 51, 55, 40, 48, 37, 40, 53, 64,  6, 63, 64,  1,  0,  9,  3,
         66, 67],
        [65, 15, 40, 37, 53, 56, 36, 53, 60, 64,  1,  2, 63, 64,  1,  3,  1,  9,
         66, 67]])


In [40]:
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, token_ids):
        return self.embedding(token_ids)

In [99]:
import torch
import torch.nn as nn
import math
class PositionalEncoder(nn.Module):
    def __init__(self, embed_dim, max_len, n = 10000):
        super().__init__()
        self.n = n
        pe = torch.zeros(max_len, embed_dim)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(self.n) / embed_dim))
                
        pe[:, 0::2] = torch.sin(positions * div_term)
        pe[:, 1::2] = torch.cos(positions * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        pe = self.pe[:seq_len, :]
        pe = pe.unsqueeze(0)
        pe = pe.expand(x.size(0), -1, -1)
        # print(x[0], pe[0])
        return x + pe

In [114]:
import math


class EmbeddingBlock(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len, pad_id, dropout=0.1):
        super().__init__()
        self.token_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_emb = PositionalEncoder(embed_dim, max_len)
        self.dropout = nn.Dropout(dropout)
        self.embed_dim = embed_dim

    def forward(self, token_ids):
        tok_emb = self.token_emb(token_ids) * math.sqrt(self.embed_dim)
        print("te",tok_emb)
        x = self.pos_emb(tok_emb)
        print("pe", x)
        return self.dropout(x)


In [115]:
toy = torch.tensor([[65, 0, 62, 65]])
src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)


In [116]:
src_emb = src_embedding_block(toy)


te tensor([[[-1.0169,  3.7117, -3.0031,  0.0366, -2.0964, -4.5249, -5.8190,
           0.1423,  6.3528, -5.9342, -2.7633, -6.5703, -1.8161, -1.1127,
          -3.3698,  1.6234],
         [ 2.5321, -0.1290, -0.9900, -3.7510,  4.6948,  0.8859, -1.5571,
          -2.4139, -8.1228,  1.2832, -0.7425,  1.6549, -2.8551,  2.4007,
           5.5863,  6.4642],
         [-1.2332, -1.0142, -4.4233,  1.6503,  0.4995,  2.1766, -4.7730,
          -1.4999,  3.7029,  0.3568,  4.4813, -2.4508, -3.3141,  8.0293,
          -2.5632, -2.9621],
         [-1.0169,  3.7117, -3.0031,  0.0366, -2.0964, -4.5249, -5.8190,
           0.1423,  6.3528, -5.9342, -2.7633, -6.5703, -1.8161, -1.1127,
          -3.3698,  1.6234]]], grad_fn=<MulBackward0>)
pe tensor([[[-1.0169,  4.7117, -3.0031,  1.0366, -2.0964, -3.5249, -5.8190,
           1.1423,  6.3528, -4.9342, -2.7633, -5.5703, -1.8161, -0.1127,
          -3.3698,  2.6234],
         [ 3.3736,  0.4113, -0.6790, -2.8006,  4.7947,  1.8809, -1.5255,
          -1.4144, -

In [73]:
embed_dim = 16
max_seq_len = 20

src_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)
tgt_embedding_block = EmbeddingBlock(tokenizer.vocab_size, embed_dim, max_seq_len, pad_id=tokenizer.pad_token_id)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    src_emb = src_embedding_block(x_batch)
    tgt_emb = tgt_embedding_block(y_batch)
    print(src_emb.shape, tgt_emb.shape)
    break

tensor([[[-3.8836e+00, -5.1854e+00, -6.5783e+00, -2.1027e+00, -1.6683e+00,
           1.6579e+00,  3.7345e-01, -5.2522e+00, -3.4867e+00,  2.9524e-01,
           1.1662e+01,  3.9204e+00,  1.1566e-01,  3.2059e+00,  9.8642e-01,
          -6.1893e+00],
         [ 5.9023e+00, -5.5386e-02,  3.2776e+00, -5.1518e+00,  3.4751e+00,
           1.9231e+00,  4.2758e+00, -2.0947e+00,  1.0829e+01,  6.7652e+00,
          -9.1369e+00, -1.0324e+01,  4.5054e+00, -5.1078e+00, -4.8536e+00,
          -7.4253e-01],
         [ 4.2574e+00, -1.0843e-01,  3.7641e+00, -4.0817e+00, -1.9741e+00,
          -1.0880e-01,  6.6001e+00,  9.1309e-01, -5.4415e+00,  7.4351e-01,
          -2.9445e+00,  1.0054e+00,  2.0926e+00,  5.4782e+00, -6.9436e+00,
           2.9030e+00],
         [-1.7147e+00,  4.9592e-01, -1.1170e+00,  4.5824e+00,  6.7641e+00,
          -3.7226e+00,  1.9317e+00, -6.1815e+00, -5.9378e+00, -8.5863e-02,
           5.1999e+00, -1.4590e+00, -2.2803e+00,  2.8240e+00,  3.1727e+00,
           6.5920e+00],
    

In [None]:
# class SelfAttention(nn.Module):
#     super.__init__(self, embed_dim):
#     self.scale = embed_dim ** 0.5

#     def forward(self, Q, K, V, mask = None):
        
    

In [None]:
# class EncoderLayer(nn.Module):
#     def __init__(self,embed_dim, num_heads, ff_hidden_dim, dropout=0.1):
#         super.__init__()
#         self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
#         self.norm1 = nn.LayerNorm(embed_dim)
#         self.dropout1 = nn.Dropout(dropout)

#         self.ff = nn.Sequential(
#             nn.Linear(embed_dim, ff_hidden_dim),
#             nn.ReLU(),
#             nn.Linear(ff_hidden_dim, embed_dim),
#         )
#         self.norm2 = nn.LayerNorm(embed_dim)
#         self.dropout2 = nn.Dropout(dropout)