# PyTorch🔥Embeddings - Build Word2vec CBOW Model (wikitext2 Dataset)

Subject: Building a Word2vec-like CBOW Model to create embeddings for the dataset's words representing their near closeness to each other (in 100 embedding dimensions).

Data: WikiText-2 via torchtext 

Procedure:
- Tokenizing with torchtext.data.utils.get_tokenizer and nltk.corpus.stopwords
- Creating contexts and targets from five words each: (01 34) with (2) as target
- Tensorizing contexts and targets
- Creating a vocabulary with torchtext.vocab.vocab
- Creating a custom torch.utils.data.Dataset for a torch.utils.data.DataLoader
- Word2vec-like CBOW model with torch.nn.module, torch.nn.Embedding, torch.nn.Linear, torch.nn.ReLU,  and torch.nn.LogSoftmax
- Training with torch.nn.NLLLoss, torch.optim.SGD, and torch.optim.lr_scheduler.StepLR
- Evaluation by finding some nearest words and playing with word vectors
- Disappointing results (probably much more data required)

Others:
- CUDA support
- working on Colab with Google Drive for saving/loading interim stages

Sources used:
- https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py

In [1]:
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {DEVICE}')

if IN_COLAB := 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  BASE_PATH = './drive/MyDrive/Colab/'
  import nltk
  nltk.download('stopwords')
  !pip install portalocker

else:
  BASE_PATH = './'

Running on cpu


## Dataset

In [2]:
import pandas as pd
import torch.nn as nn
import torchtext
import string
import numpy as np
from collections import Counter
import locale
import re
locale.setlocale(locale.LC_ALL, locale='')  # for thousands separator via ... print(f'{value:n}')

from nltk.corpus import stopwords


In [4]:
train_iter = torchtext.datasets.WikiText2(split='train')
train_list = list(train_iter)  # 36.718 lines, some only line break, some multiple sentences
#train_list[2000:2010]

In [5]:
# flatten the list to s single, huge string
train_str = ' '.join(train_list)
print(len(train_str))

10817154


In [6]:
# remove punctuation
translator = str.maketrans('', '', string.punctuation + '“”’…')  # —
train_without_punctuation =  train_str.translate(translator)

print(f"Removed {len(train_str) - len(train_without_punctuation) :n} punctuation characters.",
      f"Remaining characters: {len(train_without_punctuation) :n}.")

Removed 472.889 punctuation characters. Remaining characters: 10.344.265.


In [7]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

tokenized_1 = tokenizer(train_without_punctuation.lower())
print(f'Tokenized words: {len(tokenized_1) :n}.')

Tokenized words: 1.755.612.


In [8]:
# Remove most common words
stop_words = set(stopwords.words('english'))

tokenized_2 = [token for token in tokenized_1 if token not in stop_words]

print(f'Tokenized words after removing stopwords: {len(tokenized_2) :n}.')

Tokenized words after removing stopwords: 1.044.255.


In [9]:
# Remove words with special characters, e.g. 'élégante', 'wicked–base–ever'
tokenized_3 = [word for word in tokenized_2 if not (special := re.findall(pattern='[^A-Za-z0-9.]+', string=word))]

print(f'Tokenized words after removing words with special characters: {len(tokenized_3) :n}.')

Tokenized words after removing words with special characters: 1.034.809.


In [10]:
# Remove words that have a word occurrence below the threshold
words_counter = Counter(tokenized_3)
MIN_WORD_OCCURRENCES = 5

tokenized_4 = [word for word in tokenized_3 if words_counter[word] >= MIN_WORD_OCCURRENCES]

print(f'Tokenized words after removing rare words: {len(tokenized_4) :n}.')

Tokenized words after removing rare words: 1.007.064.


### Create Context Words

In [11]:
contexts: list[tuple[tuple[str, str, str, str], str]] = []

for i in range(2, len(tokenized_4) - 2):
    context = (tokenized_4[i - 2], 
               tokenized_4[i - 1],
               tokenized_4[i + 1], 
               tokenized_4[i + 2])
    target = tokenized_4[i]
    contexts.append((context, target))

print(f"Collected context training data of size {len(contexts) :n} with 4 context words and a target word each. ")

Collected context training data of size 1.007.060 with 4 context words and a target word each. 


### Words to Index

Unlike gensim's Words2vec, torch always requires indices instead of strings. 

In [12]:
# vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_4)
distinct_words = set(tokenized_4)
print(len(distinct_words))
token_to_index = {word: i+1 for i, word in enumerate(distinct_words)}  # vocab seems to have a problem with index 0
vocab = torchtext.vocab.vocab(token_to_index)

print(f'The {type(vocab)} has indices for a total of {len(vocab) :n} different words.')

20053
The <class 'torchtext.vocab.vocab.Vocab'> has indices for a total of 20.053 different words.


In [13]:
# convert our context training data's contents to indices
context_indices: list[tuple[tuple[int, int, int, int], int]] = []
for (context, target) in contexts:
    context_ind = [vocab[c] for c in context]
    target_ind = vocab[target]
    context_indices.append((context_ind, target_ind))
    
print(f'Example: {contexts[2]} -> {context_indices[2]}')

Example: (('iii', 'valkyria', 'unk', 'chronicles'), '3') -> ([4389, 8705, 12728, 6671], 2191)


### Tensorize

Finally, we need to tensorize our indices and targets.
- Shape [185033, 4] for context words
- Shape [185033] for target, i.e. value only

In [14]:
contexts = torch.Tensor([context_ind for (context_ind, _) in context_indices]).type(torch.long)

In [15]:
targets =  torch.Tensor([target_ind for (_, target_ind) in context_indices]).type(torch.long)

assert len(contexts) == len(targets)

## Dataset and DataLoader

In [16]:
from torch.utils.data import DataLoader, Dataset

### Dataset
Create a Torch Dataset

In [17]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        assert len(x) == len(y)
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [18]:
dataset = CustomDataset(contexts.to(DEVICE), targets.to(DEVICE))

## Save and Load
Preprocessing takes a long time when including all novels.

In [19]:
DATASET_CONTEXTS = BASE_PATH + 'saves/contexts_wiki2.pt'
DATASET_TARGETS = BASE_PATH + 'saves/targets_wiki2.pt'
VOCAB_PATH = BASE_PATH + 'saves/vocab_wiki2.pt'

In [20]:
torch.save(vocab, VOCAB_PATH)

In [21]:
torch.save(contexts, DATASET_CONTEXTS)
torch.save(targets, DATASET_TARGETS)

In [22]:
#vocab = torch.load(VOCAB_PATH)
#print(f'Loaded vocab of size {len(vocab) :n}.')

In [23]:
# we save & load not the dataset but x and y to make transfer to device easier
#contexts = torch.load(DATASET_CONTEXTS).to(DEVICE)
#targets = torch.load(DATASET_TARGETS).to(DEVICE)
#dataset = CustomDataset(contexts, targets)
#print(f'Loaded {len(dataset) :n} context tensors as training data.')

### DataLoader

In [24]:
BATCH_SIZE = 512

In [25]:
train_loader = DataLoader(dataset, 
                          batch_size=BATCH_SIZE, 
                          shuffle=True)

## Model

In [26]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim

        self.embeddings = nn.Embedding(num_embeddings=vocab_size,  # size of the dictionary of embeddings
                                       embedding_dim=embedding_dim)  # size of each embedding vector
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):  # inputs: [16, 4]
        embeds_ = self.embeddings(inputs)  # [16, 4, 100]
        embeds = torch.sum(embeds_, dim=1)  # [16, 100]
        # embeds = sum(self.embeddings(inputs)).view(1, -1)  # [1, 400]
        out = self.linear1(embeds)  # [16, 128]
        out = self.activation_function1(out)  # [16, 128]
        out = self.linear2(out)  # [16, 20420]
        out = self.activation_function2(out)  # [16, 20420]
        return out

    #def get_word_emdedding(self, word):
    #    word = torch.tensor([word_to_ix[word]])
    #    return self.embeddings(word).view(1,-1)

## Training

In [27]:
from tqdm.auto import tqdm
import math
import time
import torch.optim.lr_scheduler as lr_scheduler

In [28]:
EMDEDDING_DIM = 100

N_EPOCHS = 1  # 50

DISPLAY_EVERY_N_STEPS = 1000000

In [29]:
model = CBOW(len(vocab), EMDEDDING_DIM).to(DEVICE)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30)

In [None]:
print(f'Starting training with N_EPOCHS = {N_EPOCHS} and a training data of {len(contexts) :n} context tensors.')

model.train()


steps_done = 0
total_steps = N_EPOCHS * len(train_loader.dataset)
recent_losses = []
interim_time = time.time()
for epoch in range(N_EPOCHS):

    # x_batch: [{batch_size}, 4]
    # y_batch: [{batch_size}]
    for batch, (x_batch, y_batch) in enumerate(tqdm(train_loader)):
            
        optimizer.zero_grad()
        
        log_probs = model(x_batch)  # [16, 20420]
        current_loss = loss_function(log_probs, y_batch)  # [16]

        recent_losses.append(current_loss.item())

        current_loss.backward()
        optimizer.step()

        steps_done += len(x_batch)
        
        if (steps_done) % DISPLAY_EVERY_N_STEPS < len(x_batch):
            elapsed_time = time.time() - interim_time
            interim_time = time.time()
            average_loss = np.average(recent_losses)
            recent_losses = []
            print(f'| epoch {epoch + 1 :3d}/{N_EPOCHS} ',
                  f'| batch {batch + 1 :n}/{len(train_loader) :n} ',
                  f'| {steps_done :n}/{total_steps :n} vectors done ',
                  f'| {elapsed_time :.2f} sec. ',
                  f'| lr {optimizer.param_groups[0]["lr"]}',
                  f'| loss {average_loss:5.2f}') #  :n
            
    scheduler.step()
            

In [31]:
# save trained model
MODEL_PATH = BASE_PATH + 'saves/model.pt'

In [32]:
torch.save(model.state_dict(), MODEL_PATH)

In [33]:
# load
#model = CBOW(len(vocab), EMDEDDING_DIM)
#model.load_state_dict(torch.load(MODEL_PATH))
#model.eval()

## Evaluation

In [34]:
model.eval()

CBOW(
  (embeddings): Embedding(20053, 100)
  (linear1): Linear(in_features=100, out_features=128, bias=True)
  (activation_function1): ReLU()
  (linear2): Linear(in_features=128, out_features=20053, bias=True)
  (activation_function2): LogSoftmax(dim=-1)
)

In [35]:
def get_index(*tokens) -> list[int] | int:
    indices = []
    for token in tokens:
        if token not in vocab:
            raise ValueError(f'Token not found: {token}')
        indices.append(vocab[token])
    return indices if len(indices) > 1 else indices[0]

get_index('king')

15072

### Normalize Embeddings

In [36]:
# read embedding from first model layer
embeddings = next(model.embeddings.parameters()).cpu().detach().numpy()  # (16390, 100)


In [37]:
# normalize
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)  # ndarray (16390,)
norms = np.reshape(norms, (len(norms), 1))  # (16390, 1)
embeddings_normalized = embeddings / norms  # (16390, 100)

### Find Similar Words

In [38]:
def get_top_similar(word: str, top_n: int):
    if word not in vocab:
        raise ValueError(f'Not found: {word}')
    word_index = vocab[word]

    word_vector = embeddings_normalized[word_index]
    word_vector = np.reshape(word_vector, (len(word_vector), 1))
    distances = np.matmul(embeddings_normalized, word_vector).flatten()
    top_n_indices = np.argsort(-distances)[1 : top_n + 1]  # the nearest is always the word itself

    top_n_dict = {}
    for similar_word_index in top_n_indices:
        similar_word = vocab.lookup_token(similar_word_index)
        top_n_dict[similar_word] = distances[similar_word_index]
    return top_n_dict

In [39]:
for word, similarity in get_top_similar("king", top_n=10).items():
    print(f"{word: <20}: {similarity :.3f}")

berdych             : 0.393
wiley               : 0.373
rule                : 0.352
proponents          : 0.340
primitive           : 0.339
trophy              : 0.330
stimulation         : 0.326
unprecedented       : 0.320
detachment          : 0.319
californian         : 0.317


### Vector Equations

In [40]:
emb1 = embeddings[vocab["king"]]
emb2 = embeddings[vocab["man"]]
emb3 = embeddings[vocab["woman"]]

emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = emb4 / emb4_norm

emb4 = np.reshape(emb4, (len(emb4), 1))
dists = np.matmul(embeddings_normalized, emb4).flatten()

top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), dists[word_id]))

woman: 0.623
king: 0.614
thermal: 0.390
californian: 0.359
doubts: 0.359
