In [1]:
import random

def generate_random_dates(n=10000):
    max_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    dataset = set()
    months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

    for _ in range(n):
        y = random.randint(1000, 2025)
        m = random.randint(1, 12)

        if (y % 4 == 0 and y % 100 != 0) or (y % 400 == 0):
            max_days[1] = 29
        else:
            max_days[1] = 28

        d = random.randint(1, max_days[m - 1])
        dataset.add((f"{y:04d}-{m:02d}-{d:02d}", f"{months[m - 1]} {d}, {y}"))

    return list(dataset)


In [2]:
import pandas as pd
dataset = generate_random_dates()
df = pd.DataFrame(dataset, columns=["x", "y"])
df

Unnamed: 0,x,y
0,1026-09-15,"September 15, 1026"
1,1075-12-16,"December 16, 1075"
2,1685-04-12,"April 12, 1685"
3,1658-11-22,"November 22, 1658"
4,1681-01-01,"January 1, 1681"
...,...,...
9878,1029-11-19,"November 19, 1029"
9879,1567-08-11,"August 11, 1567"
9880,1299-04-15,"April 15, 1299"
9881,1272-04-30,"April 30, 1272"


In [3]:
df["y"].apply(lambda r : len(r)).max()

np.int64(18)

In [4]:
class Tokenizer:
    def __init__(self):
        nums = [str(i) for i in range(32)]
        uppers = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
        lowers = [chr(i) for i in range(ord('a'), ord('z') + 1)]
        self.input_max = 10 + 2
        self.output_max = 18 + 2
        self.vocab = nums + uppers + lowers + ["-", ",", " ", "<sos>", "<eos>", "<pad>"]
        self.vocab_size = len(self.vocab)
        self.tokens_to_ids = {str(self.vocab[i]): i for i in range(len(self.vocab))}
        self.ids_to_tokens = {str(i): str(self.vocab[i]) for i in range(len(self.vocab))}

    def encode(self, sample):
        x, y = sample
        x = ["<sos>"] + list(x) + ["<eos>"]
        y = ["<sos>"] + list(y) + ["<eos>"]
        while len(x) != self.input_max: x.append("<pad>")
        while len(y) != self.output_max: y.append("<pad>")
        res_x = [self.tokens_to_ids[i] for i in x]
        res_y = [self.tokens_to_ids[i] for i in y]

        return (res_x, res_y)

    def decode(self, ids):
        res = [self.ids_to_tokens[str(i)] for i in ids]
        return "".join(res)

In [5]:
tokenizer = Tokenizer()
a = dataset[0]
encoded_x, encoded_y = tokenizer.encode(a)
tokenizer.decode(encoded_x), a[0]

('<sos>1026-09-15<eos>', '1026-09-15')

In [6]:
encoded_x

[87, 1, 0, 2, 6, 84, 0, 9, 84, 1, 5, 88]

In [7]:
import torch
from torch.utils.data import Dataset

class DateDataset(Dataset):
    def __init__(self,data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_encoded, y_encoded = self.tokenizer.encode(self.data[idx])
        
        x_tensor = torch.tensor(x_encoded, dtype=torch.long)
        y_tensor = torch.tensor(y_encoded, dtype=torch.long)

        return x_tensor, y_tensor
        

In [8]:
from torch.utils.data import DataLoader

date_dataset = DateDataset(dataset, tokenizer)
dataloader = DataLoader(date_dataset, batch_size=5, shuffle=True)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    print(x_batch)
    print(y_batch)
    break

tensor([[87,  1,  4,  3,  2, 84,  1,  2, 84,  1,  4, 88],
        [87,  1,  7,  4,  9, 84,  0,  7, 84,  0,  6, 88],
        [87,  1,  1,  2,  1, 84,  0,  5, 84,  1,  6, 88],
        [87,  1,  7,  4,  8, 84,  0,  2, 84,  2,  9, 88],
        [87,  1,  6,  7,  3, 84,  1,  1, 84,  1,  4, 88]])
tensor([[87, 35, 62, 60, 62, 70, 59, 62, 75, 86,  1,  4, 85, 86,  1,  4,  3,  2,
         88, 89],
        [87, 41, 78, 69, 82, 86,  6, 85, 86,  1,  7,  4,  9, 88, 89, 89, 89, 89,
         89, 89],
        [87, 44, 58, 82, 86,  1,  6, 85, 86,  1,  1,  2,  1, 88, 89, 89, 89, 89,
         89, 89],
        [87, 37, 62, 59, 75, 78, 58, 75, 82, 86,  2,  9, 85, 86,  1,  7,  4,  8,
         88, 89],
        [87, 45, 72, 79, 62, 70, 59, 62, 75, 86,  1,  4, 85, 86,  1,  6,  7,  3,
         88, 89]])


In [9]:
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, token_ids):
        return self.embedding(token_ids)

In [21]:
import torch
import torch.nn as nn
import math
class PositionalEncoder(nn.Module):
    def __init__(self, embed_dim, max_len, n = 10000):
        super().__init__()
        self.n = n
        pe = torch.zeros(max_len, embed_dim)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(self.n) / embed_dim))
                
        pe[:, 0::2] = torch.sin(positions * div_term)
        pe[:, 1::2] = torch.cos(positions * div_term)
        print(pe.shape)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        pe = self.pe[:seq_len, :]
        pe = pe.unsqueeze(0)
        pe = pe.expand(x.size(0), -1, -1)
        return x + pe

In [22]:
embed_dim = 16
max_seq_len = 20
token_embedding_layer = TokenEmbedding(tokenizer.vocab_size, embed_dim)
positional_encoding_layer = PositionalEncoder(embed_dim, max_seq_len)

for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    token_vectors = token_embedding_layer(x_batch)
    print(token_vectors.shape)
    input_vectors = positional_encoding_layer(token_vectors)
    print(input_vectors.shape)
    break

torch.Size([20, 16])
torch.Size([5, 12, 16])
torch.Size([5, 12, 16])
