In [162]:
import random

def generate_random_dates(n=10000, seed=None):
    """Generate unique random dates as (ISO, long-form) string pairs.

    Each draw picks a year in 1000..2025, a month in 1..12 and a day that is
    valid for that month (February has 29 days when the year satisfies the
    Gregorian leap-year rule). Every date is rendered twice:
    ``"YYYY-MM-DD"`` and ``"<Month name> <day>, <year>"``.

    Args:
        n: Number of dates to draw. Duplicate draws are collapsed, so the
            returned list may contain fewer than ``n`` pairs.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the result is reproducible; when ``None`` (default)
            the module-level ``random`` state is used.

    Returns:
        A list of unique ``(iso_string, long_form_string)`` tuples in the
        order they were first drawn.
    """
    rng = random if seed is None else random.Random(seed)

    days_in_month = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November", "December")

    # A dict keeps insertion order, so de-duplicating through it gives a
    # stable ordering; iterating a set of strings depends on hash
    # randomization and differs from one interpreter run to the next.
    unique_pairs = {}

    for _ in range(n):
        y = rng.randint(1000, 2025)
        m = rng.randint(1, 12)

        is_leap = (y % 4 == 0 and y % 100 != 0) or (y % 400 == 0)
        last_day = 29 if (m == 2 and is_leap) else days_in_month[m - 1]

        d = rng.randint(1, last_day)
        pair = (f"{y:04d}-{m:02d}-{d:02d}", f"{months[m - 1]} {d}, {y}")
        unique_pairs[pair] = None

    return list(unique_pairs)


In [163]:
import pandas as pd
# Draw the random date pairs and tabulate them: column "x" holds the ISO
# string (first tuple element), column "y" the long-form text (second).
dataset = generate_random_dates()
df = pd.DataFrame(dataset, columns=["x", "y"])
df

Unnamed: 0,x,y
0,1312-03-20,"March 20, 1312"
1,1294-10-06,"October 6, 1294"
2,1777-06-19,"June 19, 1777"
3,1622-10-26,"October 26, 1622"
4,1592-02-21,"February 21, 1592"
...,...,...
9863,1358-10-30,"October 30, 1358"
9864,1033-05-05,"May 5, 1033"
9865,1083-10-21,"October 21, 1083"
9866,1475-06-18,"June 18, 1475"


In [164]:
# Longest long-form date string, in characters (vectorized string accessor).
df["y"].str.len().max()

np.int64(18)

In [165]:
class Tokenizer:
    """Character-level tokenizer for (input, output) date string pairs.

    The vocabulary is, in id order: the strings "0".."31", the letters
    "A".."Z", the letters "a".."z", then "-", ",", " " and the special
    tokens "<sos>", "<eos>", "<pad>". ``encode`` splits strings into single
    characters, so it only ever emits the one-character entries and the
    special tokens; the two-digit entries "10".."31" are kept so that
    existing token ids stay unchanged.

    Args:
        input_max: Fixed length of an encoded input sequence, including
            "<sos>" and "<eos>". Defaults to 10 + 2 (a "YYYY-MM-DD" string
            plus the two markers).
        output_max: Fixed length of an encoded output sequence, including
            "<sos>" and "<eos>". Defaults to 18 + 2.
    """

    def __init__(self, input_max=10 + 2, output_max=18 + 2):
        nums = [str(i) for i in range(32)]
        uppers = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
        lowers = [chr(i) for i in range(ord('a'), ord('z') + 1)]
        self.input_max = input_max
        self.output_max = output_max
        self.vocab = nums + uppers + lowers + ["-", ",", " ", "<sos>", "<eos>", "<pad>"]
        self.tokens_to_ids = {str(self.vocab[i]): i for i in range(len(self.vocab))}
        # Keys are the decimal string form of each id (e.g. "87"), not ints.
        self.ids_to_tokens = {str(i): str(self.vocab[i]) for i in range(len(self.vocab))}

    def _encode_one(self, text, max_len):
        """Wrap ``text`` in <sos>/<eos>, pad to ``max_len`` and map to ids."""
        tokens = ["<sos>"] + list(text) + ["<eos>"]
        if len(tokens) > max_len:
            # Padding can only grow the sequence, so an over-long input can
            # never reach max_len; fail loudly instead of looping forever.
            raise ValueError(
                f"sequence of {len(tokens)} tokens (including <sos>/<eos>) "
                f"exceeds the maximum length {max_len}: {text!r}"
            )
        tokens.extend(["<pad>"] * (max_len - len(tokens)))
        return [self.tokens_to_ids[tok] for tok in tokens]

    def encode(self, sample):
        """Encode an ``(x, y)`` string pair into two fixed-length id lists.

        Args:
            sample: A 2-item sequence ``(x, y)`` of strings.

        Returns:
            ``(res_x, res_y)`` where ``res_x`` has ``input_max`` ids and
            ``res_y`` has ``output_max`` ids, each laid out as
            <sos>, characters, <eos>, then <pad> up to the fixed length.

        Raises:
            ValueError: If either string plus its two markers is longer
                than the corresponding maximum length.
            KeyError: If a character is not in the vocabulary.
        """
        x, y = sample
        res_x = self._encode_one(x, self.input_max)
        res_y = self._encode_one(y, self.output_max)

        return (res_x, res_y)

    def decode(self, ids):
        """Map an iterable of token ids back to one concatenated string.

        Special tokens are rendered literally (e.g. "<sos>", "<pad>").
        Each id is normalized with ``int()`` before the lookup, so plain
        ints and integer-like scalars (anything ``int()`` accepts) work.

        Raises:
            KeyError: If an id is outside the vocabulary.
        """
        return "".join(self.ids_to_tokens[str(int(i))] for i in ids)

In [166]:
# Round-trip check: encode the first sample, then show the decoded input ids
# next to the raw input string (the decoded text keeps the special tokens).
tokenizer = Tokenizer()
a = dataset[0]
encoded_x, encoded_y = tokenizer.encode(a)
tokenizer.decode(encoded_x), a[0]

('<sos>1312-03-20<eos>', '1312-03-20')

In [167]:
# Token ids produced by the tokenizer for the sample's input string.
encoded_x

[87, 1, 3, 1, 2, 84, 0, 3, 84, 2, 0, 88]

In [168]:
import torch
from torch.utils.data import Dataset

class DateDataset(Dataset):
    """Map-style dataset that tokenizes samples on access.

    Args:
        data: Indexable collection of samples; each item is passed as-is to
            ``tokenizer.encode``.
        tokenizer: Object whose ``encode(sample)`` returns a pair of id
            sequences ``(x_ids, y_ids)``.
    """

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of samples in the underlying collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample ``idx`` and return it as two ``torch.long`` tensors."""
        sample = self.data[idx]
        source_ids, target_ids = self.tokenizer.encode(sample)

        return (
            torch.tensor(source_ids, dtype=torch.long),
            torch.tensor(target_ids, dtype=torch.long),
        )
        

In [169]:
from torch.utils.data import DataLoader

# Wrap the generated pairs in a Dataset and serve them as shuffled
# mini-batches of 5 samples.
date_dataset = DateDataset(dataset, tokenizer)
dataloader = DataLoader(date_dataset, batch_size=5, shuffle=True)

# Print only the first batch to inspect the batched input and target tensors.
for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    print(x_batch)
    print(y_batch)
    break

tensor([[87,  2,  0,  1,  3, 84,  0,  1, 84,  1,  8, 88],
        [87,  1,  4,  1,  9, 84,  0,  5, 84,  1,  3, 88],
        [87,  1,  2,  7,  3, 84,  1,  2, 84,  0,  5, 88],
        [87,  1,  5,  8,  4, 84,  0,  3, 84,  0,  6, 88],
        [87,  1,  6,  5,  9, 84,  0,  5, 84,  1,  4, 88]])
tensor([[87, 41, 58, 71, 78, 58, 75, 82, 86,  1,  8, 85, 86,  2,  0,  1,  3, 88,
         89, 89],
        [87, 44, 58, 82, 86,  1,  3, 85, 86,  1,  4,  1,  9, 88, 89, 89, 89, 89,
         89, 89],
        [87, 35, 62, 60, 62, 70, 59, 62, 75, 86,  5, 85, 86,  1,  2,  7,  3, 88,
         89, 89],
        [87, 44, 58, 75, 60, 65, 86,  6, 85, 86,  1,  5,  8,  4, 88, 89, 89, 89,
         89, 89],
        [87, 44, 58, 82, 86,  1,  4, 85, 86,  1,  6,  5,  9, 88, 89, 89, 89, 89,
         89, 89]])
