In [2]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import lightning as L
from datasets import load_dataset
from transformers import BasicTokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

In [2]:
# Load the TinyStories dataset by its "roneneldan/TinyStories" identifier.
# In the run recorded below this downloaded the data files and produced a
# train split (2,119,719 examples) and a validation split (21,990 examples).
ds = load_dataset("roneneldan/TinyStories")

Downloading data: 100%|██████████| 249M/249M [00:06<00:00, 40.3MB/s] 
Downloading data: 100%|██████████| 248M/248M [00:05<00:00, 45.8MB/s] 
Downloading data: 100%|██████████| 246M/246M [00:04<00:00, 56.1MB/s] 
Downloading data: 100%|██████████| 248M/248M [00:04<00:00, 52.6MB/s] 
Downloading data: 100%|██████████| 9.99M/9.99M [00:00<00:00, 17.5MB/s]
Generating train split: 100%|██████████| 2119719/2119719 [00:06<00:00, 307190.09 examples/s]
Generating validation split: 100%|██████████| 21990/21990 [00:00<00:00, 327844.23 examples/s]


## Tokenization


We cannot feed the sequences of words found in the dataset directly into the model. We need to translate each sequence into atomic units of language we call _tokens_.


It is important that the tokenizer used to train the model is also used for inference. If a different tokenizer is used, a word might be split in a way the model does not expect, which yields undesirable results. This is why in HuggingFace and other machine learning tools you will encounter tokenizers that are named after the model they are associated with (e.g. `T5Tokenizer`, `BertTokenizer`, etc.). However, just because a tokenizer was used to train a popular foundation language model (such as BERT) doesn't mean you cannot use it for another NLP model, as long as you are doing the pre-training of that model yourself.


In [2]:
class Word2Vec(L.LightningModule):
    """Word-embedding model trained by predicting a target token from an input token.

    The model holds an embedding table (``self.embeddings``) and a linear
    output projection (``self.output_projection``) that turns an embedding
    into one score per vocabulary entry. ``forward`` returns the embeddings
    only; the projection is applied inside ``training_step`` to obtain the
    logits for the cross-entropy loss.

    Args:
        vocab_size: Number of distinct token ids (rows of the embedding table
            and size of the logit dimension).
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        # Zero-argument super(): passing the string "Word2Vec" raises
        # ``TypeError: super() argument 1 must be a type, not str``.
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Re-initialise the embedding weights uniformly in [-1, 1].
        nn.init.uniform_(self.embeddings.weight, -1.0, 1.0)
        # Maps an embedding vector to vocabulary logits. Without it the
        # embeddings themselves would have to be reshaped into logits, which
        # is only shape-valid when embedding_dim == vocab_size.
        self.output_projection = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``."""
        return self.embeddings(x)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one batch.

        Assumes ``batch`` is an ``(inputs, targets)`` pair of integer token-id
        tensors with the same shape -- TODO confirm against the DataLoader.
        """
        x, y = batch
        logits = self.output_projection(self(x))
        # Flatten every leading dimension so each row of logits lines up with
        # one target id; reshape (unlike view) also accepts non-contiguous
        # tensors.
        loss = F.cross_entropy(logits.reshape(-1, self.vocab_size), y.reshape(-1))
        return loss

    def configure_optimizers(self):
        """Use Adam with its default hyperparameters over all parameters."""
        return torch.optim.Adam(self.parameters())