# PyTorch🔥Embeddings - Build Word2vec CBOW Model (WikiText103 Dataset)

Subject: Building a Word2vec-like CBOW model that learns 300-dimensional embeddings for the dataset's words, so that words used in similar contexts end up close to each other. Preprocessing the whole dataset at once fails on Colab because of its size; therefore the data is processed through <b>iterators</b>, never holding the whole dataset in memory at any time.

Data: WikiText103 from torchtext as DataPipe (beta)

Procedure:

- Creating a vocabulary with torchtext.vocab.build_vocab_from_iterator
- Tokenizing with torchtext.data.utils.get_tokenizer and nltk.corpus.stopwords
- Creating (context, target) pairs from sliding five-word windows: words 0, 1, 3 and 4 form the context and word 2 is the target
- Tensorizing contexts and targets
- Creating a custom torch.utils.data.Dataset for a torch.utils.data.DataLoader
- Word2vec-like CBOW model with torch.nn.Module, torch.nn.Embedding, torch.nn.Linear, torch.nn.ReLU, and torch.nn.LogSoftmax
- Training with torch.nn.NLLLoss, torch.optim.SGD, and torch.optim.lr_scheduler.StepLR
- Evaluation by finding some nearest words and playing with word vectors
- Disappointing results (probably much more data required)

Others:
- CUDA support
- working on Colab with Google Drive for saving/loading interim stages

Sources used:
- https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py

In [1]:
import torch
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Running on {DEVICE}')

# Colab sessions keep interim files on Google Drive and need the NLTK stop-word
# corpus downloaded; everywhere else the working directory is used.
IN_COLAB = 'google.colab' in str(get_ipython())
if not IN_COLAB:
    BASE_PATH = './'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = './drive/MyDrive/Colab/'
    import nltk
    nltk.download('stopwords')

Running on cpu


## Preprocessing

In [14]:
import string
import re

import torchtext
import nltk
from nltk.corpus import stopwords

import locale
# `{value:n}` formatting in the training log takes its thousands separator
# from the active locale. German is preferred, but setlocale() raises
# locale.Error when 'de_DE.utf8' is not installed on the machine, so fall back
# to the user's default locale and finally to the always-available "C" locale.
for _locale_name in ('de_DE.utf8', '', 'C'):
    try:
        locale.setlocale(locale.LC_ALL, _locale_name)
    except locale.Error:
        continue
    break

In [5]:
# Passed as `min_freq` when the vocabulary is built from the training split.
MIN_WORD_FREQUENCY = 150

### Load wikitext3 as DataPipe (Iterator) 

In [6]:
# Training split of WikiText103; the cells below only ever iterate over it.
train_iter = torchtext.datasets.WikiText103(split='train')

### Create Vocab

In [11]:
# Shared text-cleaning helpers: both the vocabulary build and the
# training-time paragraph processing rely on these three objects.
# (Run nltk.download('stopwords') once if the corpus is missing locally.)
translator = str.maketrans(dict.fromkeys(string.punctuation + '“”’…'))  # the em dash (—) is not in this set
stop_words = {*stopwords.words('english')}
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [12]:
def custom_tokenizer_fn(row: str) -> list[str]:
    """Turn one raw text row into the cleaned tokens used for the vocabulary.

    The characters listed in the module-level ``translator`` are removed, the
    row is lower-cased and split by the module-level ``tokenizer``, and then
    stop words as well as tokens containing any character outside
    ``A-Za-z0-9.`` are discarded.
    """
    words = tokenizer(row.translate(translator).lower())
    return [
        word
        for word in words
        if word not in stop_words
        and re.search(pattern='[^A-Za-z0-9.]+', string=word) is None
    ]

In [15]:
# Stream the training split through the custom tokenizer; MIN_WORD_FREQUENCY
# is handed to torchtext as the minimum token frequency.
vocab = torchtext.vocab.build_vocab_from_iterator(
    iterator=(custom_tokenizer_fn(row) for row in train_iter),  # yields one token list per row
    min_freq=MIN_WORD_FREQUENCY,
)

# To reuse a previously saved vocabulary instead of rebuilding it:
# vocab = torch.load(BASE_PATH + 'vocab_wikitext3.pt')
torch.save(vocab, BASE_PATH + 'vocab_wikitext3.pt')
print(f'Vocabulary has {len(vocab)} tokens.')



Vocabulary has 25814 tokens.


### WikipediaProcessor

In [19]:
class WikipediaProcessor:
    """Convert raw text paragraphs into CBOW training tensors.

    Each paragraph is cleaned and tokenized, reduced to tokens known to the
    vocabulary, cut into five-token sliding windows (two context tokens on
    each side of a target token) and finally mapped to vocabulary indices.

    Args:
        vocab: Mapping-like vocabulary supporting ``token in vocab`` and
            ``vocab[token]`` (e.g. a ``torchtext.vocab.Vocab``).
        stop_words: Tokens to discard after tokenization.
        translator: Translation table for ``str.translate`` that removes
            unwanted characters before tokenization.
        device: Device on which the resulting tensors are created.
        tokenizer: Callable turning a string into a list of tokens. When
            omitted, torchtext's ``basic_english`` tokenizer is used.
    """

    # A token is rejected when it contains any character outside this set.
    _INVALID_TOKEN_PATTERN = re.compile(r'[^A-Za-z0-9.]+')

    def __init__(self,
                 vocab: "torchtext.vocab.Vocab",
                 stop_words: set[str],
                 translator: dict,
                 device: "torch.device | str",
                 tokenizer=None):
        self.vocab = vocab
        self.stop_words = stop_words
        self.translator = translator
        self.device = device
        if tokenizer is None:
            # Same tokenizer the notebook uses for building the vocabulary.
            tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
        self.tokenizer = tokenizer

    def process(self, paragraphs: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
        """Return ``(contexts, targets)`` index tensors for *paragraphs*.

        ``contexts`` holds four context indices per row and ``targets`` the
        matching centre-word index. Paragraphs that yield no tokens or fewer
        than five usable tokens contribute nothing; if nothing is produced at
        all, both tensors are empty.
        """
        contexts_as_indices = []

        for paragraph in paragraphs:
            tokenized = self._tokenize(paragraph)
            if not tokenized:
                continue
            contexts = self._create_contexts(tokenized)
            if not contexts:
                continue
            contexts_as_indices.extend(self._to_indices(contexts))

        return self._tensorize(contexts_as_indices)

    def _tensorize(self, context_indices: list[tuple[tuple[int, int, int, int], int]]
                   ) -> tuple[torch.Tensor, torch.Tensor]:
        """Pack ``(context, target)`` index pairs into two long tensors."""
        # Build integer tensors directly: going through the float32
        # ``torch.Tensor(...)`` constructor silently rounds indices above 2**24.
        contexts = torch.tensor(
            [context_ind for (context_ind, _) in context_indices],
            dtype=torch.long,
            device=self.device,
        )
        targets = torch.tensor(
            [target_ind for (_, target_ind) in context_indices],
            dtype=torch.long,
            device=self.device,
        )
        return contexts, targets

    def _tokenize(self, paragraph: str) -> list[str]:
        """Clean and tokenize *paragraph*, keeping only in-vocabulary tokens."""
        without_punctuation = paragraph.translate(self.translator)
        tokens = self.tokenizer(without_punctuation.lower())
        return [
            token
            for token in tokens
            if token not in self.stop_words
            and self._INVALID_TOKEN_PATTERN.search(token) is None
            and token in self.vocab
        ]

    def _create_contexts(self, tokenized_paragraph: list[str]
                         ) -> list[tuple[tuple[str, str, str, str], str]]:
        """Slide a five-token window and return ``(context, target)`` pairs."""
        return [
            (
                (tokenized_paragraph[i - 2],
                 tokenized_paragraph[i - 1],
                 tokenized_paragraph[i + 1],
                 tokenized_paragraph[i + 2]),
                tokenized_paragraph[i],
            )
            for i in range(2, len(tokenized_paragraph) - 2)
        ]

    def _to_indices(self, contexts: list[tuple[tuple[str, str, str, str], str]]
                    ) -> list[tuple[tuple[int, ...], int]]:
        """Replace every token in *contexts* by its vocabulary index."""
        return [
            (tuple(self.vocab[c] for c in context), self.vocab[target])
            for (context, target) in contexts
        ]


# Single shared processor; the training loop hands it each batch of paragraphs.
wikipedia_processor = WikipediaProcessor(
    vocab=vocab, stop_words=stop_words, translator=translator, device=DEVICE
)

## DataLoader for wikitext3 DataPipe

In [33]:
from torch.utils.data import DataLoader

In [34]:
BATCH_SIZE = 8  # num of wikipedia paragraphs per training iteration; each paragraph can expand into several (context, target) rows

In [35]:
# Wraps the training DataPipe; the training loop passes each batch it yields
# straight to the paragraph processor.
train_loader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True)

## Model

In [36]:
import torch.nn as nn

In [37]:
class CBOW(nn.Module):
    """Continuous-bag-of-words model: context word indices -> target log-probabilities.

    The context embeddings are summed, passed through one ReLU hidden layer
    and projected onto the vocabulary with a log-softmax, so the output pairs
    directly with ``nn.NLLLoss``.

    Args:
        vocab_size: Number of rows in the embedding table and size of the
            output distribution.
        embedding_dim: Length of each embedding vector.
        hidden_dim: Width of the hidden layer (defaults to the original 128).
    """

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int = 128):
        super().__init__()

        # Attribute names are kept stable: they are the state_dict keys and
        # `model.embeddings` is read directly during evaluation.
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activation_function1 = nn.ReLU()

        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim=-1)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: Integer index tensor of shape ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with log-probabilities.
        """
        embedded = self.embeddings(inputs)  # (batch, context_size, embedding_dim)
        pooled = torch.sum(embedded, dim=1)  # (batch, embedding_dim)
        hidden = self.activation_function1(self.linear1(pooled))  # (batch, hidden_dim)
        logits = self.linear2(hidden)  # (batch, vocab_size)
        return self.activation_function2(logits)

## Training

In [45]:
from tqdm.auto import tqdm
import math
import time
import numpy as np
import torch.optim.lr_scheduler as lr_scheduler

In [39]:
# Length of each embedding vector handed to CBOW (the identifier's spelling is
# kept because other cells reference it).
EMDEDDING_DIM = 300

# Number of passes over the training data.
N_EPOCHS = 1  # 50

# Used as `step_size` of the StepLR scheduler, which is stepped once per epoch.
UPDATE_LR_EVERY_N_EPOCHS = 2

# The training loop prints a progress line roughly every this many processed paragraphs.
DISPLAY_EVERY_N_STEPS = 30000

In [40]:
# Network on the selected device, negative log-likelihood loss (the model
# already emits log-probabilities), plain SGD and a step-wise LR schedule.
model = CBOW(vocab_size=len(vocab), embedding_dim=EMDEDDING_DIM)
model = model.to(DEVICE)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = lr_scheduler.StepLR(optimizer, step_size=UPDATE_LR_EVERY_N_EPOCHS)

In [None]:
import os

model.train()

TRAIN_SIZE = 1801350  # hard-coded: len(train_loader.dataset) is buggy for this DataPipe
total_steps = N_EPOCHS * TRAIN_SIZE  # counted in paragraphs, like TRAIN_SIZE
total_batches = math.ceil(TRAIN_SIZE / BATCH_SIZE)
total_steps_done = 0
recent_losses = []
interim_time = time.time()

# torch.save does not create missing directories, so make sure the target exists
# before an epoch of training is spent.
checkpoint_path = BASE_PATH + 'saves/model_wikitext3.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

for epoch in range(N_EPOCHS):
    # One bar per epoch, measured in paragraphs. tqdm.update() expects the
    # increment since the last call, not the running total.
    with tqdm(total=TRAIN_SIZE) as progress_bar:
        for batch_number, train_batch in enumerate(train_loader):
            paragraphs_in_batch = len(train_batch)
            total_steps_done += paragraphs_in_batch
            progress_bar.update(paragraphs_in_batch)

            # x_train: [n_contexts, 4], targets: [n_contexts]
            x_train, targets = wikipedia_processor.process(train_batch)

            # Batches whose paragraphs yield no (context, target) pair are
            # counted as processed but cannot be trained on.
            if len(x_train) > 0:
                optimizer.zero_grad()

                log_probs = model(x_train)  # [n_contexts, vocab_size]
                current_loss = loss_function(log_probs, targets)  # scalar (mean over the batch)

                current_loss.backward()
                optimizer.step()

                recent_losses.append(current_loss.item())

            # Fires on the batch that crosses a multiple of DISPLAY_EVERY_N_STEPS.
            if recent_losses and total_steps_done % DISPLAY_EVERY_N_STEPS < paragraphs_in_batch:
                elapsed_time = time.time() - interim_time
                interim_time = time.time()
                average_loss = np.average(recent_losses)
                recent_losses = []
                print(f'| epoch {epoch + 1 :3d}/{N_EPOCHS} ',
                      f'| batch {batch_number + 1 :n}/{total_batches :n} ',
                      f'| {total_steps_done :n}/{total_steps :n} paragraphs done ',
                      f'| {elapsed_time :.2f} sec. ',
                      f'| lr {optimizer.param_groups[0]["lr"]}',
                      f'| loss {average_loss:5.2f}')

    scheduler.step()
    torch.save(model.state_dict(), checkpoint_path)
            

In [None]:
# Manual checkpoint: writes the current weights to the same file the training loop saves to after each epoch.
torch.save(model.state_dict(), BASE_PATH + 'saves/model_wikitext3.pt')

In [None]:
# load a previously saved checkpoint instead of training
#model = CBOW(len(vocab), EMDEDDING_DIM)
#model.load_state_dict(torch.load(BASE_PATH + 'saves/model_wikitext3.pt'))
#model.eval()

## Evaluation

In [None]:
# Put the model into evaluation mode for the analysis cells below.
model.eval()

In [None]:
def get_index(*tokens) -> list[int] | int:
    indices = []
    for token in tokens:
        if token not in vocab:
            raise ValueError(f'Token not found: {token}')
        indices.append(vocab[token])
    return indices if len(indices) > 1 else indices[0]

get_index('king')

### Normalize Embeddings

In [None]:
# Pull the embedding table out of the model's first layer as a NumPy array
# with one row per vocabulary entry.
embeddings = model.embeddings.weight.detach().cpu().numpy()


In [None]:
# Scale every embedding row to unit Euclidean length, so that a dot product
# between two rows equals their cosine similarity.
norms = ((embeddings ** 2).sum(axis=1) ** (1 / 2)).reshape(-1, 1)  # (vocab_size, 1)
embeddings_normalized = embeddings / norms  # (vocab_size, embedding_dim)

### Find Similar Words

In [None]:
def get_top_similar(word: str, top_n: int):
    """Return the ``top_n`` vocabulary words most similar to *word*.

    Similarity is the dot product between rows of the module-level
    ``embeddings_normalized`` matrix. The result maps each neighbouring word
    to its similarity, most similar first; the best match is skipped because
    it is expected to be *word* itself.

    Raises:
        ValueError: If *word* is not in the vocabulary.
    """
    if word not in vocab:
        raise ValueError(f'Not found: {word}')

    query = embeddings_normalized[vocab[word]].reshape(-1, 1)
    similarities = np.matmul(embeddings_normalized, query).flatten()
    ranked = np.argsort(-similarities)
    neighbours = ranked[1 : top_n + 1]  # position 0 is the query word itself
    return {vocab.lookup_token(idx): similarities[idx] for idx in neighbours}

In [None]:
# Show the ten nearest neighbours of "king" with their similarity scores.
for neighbour, score in get_top_similar("king", top_n=10).items():
    print(f"{neighbour: <20}: {score :.3f}")

### Vector Equations

In [None]:
# Analogy probe: which words lie closest to king - man + woman?
emb1, emb2, emb3 = (embeddings[vocab[token]] for token in ("king", "man", "woman"))

# Combine the raw vectors, then scale the result to unit length so that the
# dot products below are comparable to those of the normalized table.
emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = (emb4 / emb4_norm).reshape(-1, 1)

dists = np.matmul(embeddings_normalized, emb4).flatten()

# Five highest-scoring vocabulary entries (the input words are not excluded).
top5 = np.argsort(-dists)[:5]
for word_id in top5:
    print(f"{vocab.lookup_token(word_id)}: {dists[word_id]:.3f}")