# Module 1 Project 2: Word2Vec

Implement [Word2Vec](https://en.wikipedia.org/wiki/Word2vec) and play around with it

## STEP 1: IMPORT THE NECESSARY LIBRARIES
- We need a lot of stuff from `torch` for this project
- Also the usual `numpy` imports

In [None]:
import argparse
import yaml
import os
import numpy as np
import json
import torch
import torch.nn as nn
from functools import partial
from torch.utils.data import DataLoader, Dataset
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import WikiText2, WikiText103
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

## STEP 2: CONSTANTS AND HELPER METHODS
- Here we set our constants for the CBOW and SkipGram models
- We set the minimum word frequency for our vocab and maximum sequence length for collation (combination)
- Collation methods below just prepare the text data in the appropriate format depending on our model choice

In [None]:
# Number of context words taken on EACH side of the centre word by the
# collate functions, so one full window spans 2 * N + 1 token ids.
CBOW_N_WORDS = 4
SKIPGRAM_N_WORDS = 4

# Passed as `min_freq` when the vocabulary is built.
MIN_WORD_FREQUENCY = 50
# Each text is truncated to this many token ids before windows are extracted;
# a falsy value disables the truncation.
MAX_SEQUENCE_LENGTH = 256

# Width of every embedding vector (and input width of the output layer).
EMBED_DIMENSION = 300
# Passed as `max_norm` to nn.Embedding in both models.
EMBED_MAX_NORM = 1

def collate_cbow(batch, text_pipeline):
    """Convert a batch of raw texts into CBOW (context, target) tensors.

    Each text is mapped to token ids with ``text_pipeline``. A window of
    ``2 * CBOW_N_WORDS + 1`` ids is slid over the ids one position at a time;
    the middle id of each window is the target and the remaining ids, in
    order, are the context.

    Args:
        batch: iterable of raw text samples.
        text_pipeline: callable mapping one text to a list of token ids.

    Returns:
        Tuple ``(contexts, targets)`` of ``torch.long`` tensors, one row /
        element per extracted window.
    """
    window = CBOW_N_WORDS * 2 + 1
    contexts, targets = [], []

    for sample in batch:
        token_ids = text_pipeline(sample)

        # Texts that cannot hold a single full window contribute nothing.
        if len(token_ids) < window:
            continue

        if MAX_SEQUENCE_LENGTH:
            token_ids = token_ids[:MAX_SEQUENCE_LENGTH]

        for start in range(len(token_ids) - window + 1):
            middle = start + CBOW_N_WORDS
            left = token_ids[start:middle]
            right = token_ids[middle + 1 : start + window]
            contexts.append(left + right)
            targets.append(token_ids[middle])

    return (
        torch.tensor(contexts, dtype=torch.long),
        torch.tensor(targets, dtype=torch.long),
    )

def collate_skipgram(batch, text_pipeline):
    """Convert a batch of raw texts into SkipGram (centre, context) tensors.

    Each text is mapped to token ids with ``text_pipeline``. A window of
    ``2 * SKIPGRAM_N_WORDS + 1`` ids is slid over the ids one position at a
    time; the middle id is paired, in order, with every other id in the
    window, producing one training pair per context id.

    Args:
        batch: iterable of raw text samples.
        text_pipeline: callable mapping one text to a list of token ids.

    Returns:
        Tuple ``(centres, contexts)`` of 1-D ``torch.long`` tensors of equal
        length.
    """
    window = SKIPGRAM_N_WORDS * 2 + 1
    centres, contexts = [], []

    for sample in batch:
        token_ids = text_pipeline(sample)

        # Texts that cannot hold a single full window contribute nothing.
        if len(token_ids) < window:
            continue

        if MAX_SEQUENCE_LENGTH:
            token_ids = token_ids[:MAX_SEQUENCE_LENGTH]

        for start in range(len(token_ids) - window + 1):
            middle = start + SKIPGRAM_N_WORDS
            neighbours = (
                token_ids[start:middle] + token_ids[middle + 1 : start + window]
            )
            # Repeat the centre id once per neighbour so both lists stay aligned.
            centres.extend([token_ids[middle]] * len(neighbours))
            contexts.extend(neighbours)

    return (
        torch.tensor(centres, dtype=torch.long),
        torch.tensor(contexts, dtype=torch.long),
    )

## STEP 3: BUILD OUR DATASET AND DATALOADER
- Using the provided `data.txt` file ([Norm MacDonald Wikipedia entry snippets](https://en.wikipedia.org/wiki/Norm_Macdonald)), we can create a Dataset class to iterate over the text in the file
- Choosing our model type as CBOW by default
- Using torchtext's basic English tokenizer to make life easy
- We build our vocab using our tokenizer and our dataset
- We will then build a DataLoader with a batch size of 96

In [None]:
class NormDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one line of a UTF-8 text file per index.

    The file is read once at construction and split on ``'\\n'``; empty lines
    (including a trailing one) are kept as empty-string samples.

    Args:
        path: location of the text file. Defaults to ``'./data.txt'``.
    """

    def __init__(self, path: str = './data.txt') -> None:
        # Context manager guarantees the file handle is closed after reading.
        with open(path, encoding='utf-8') as handle:
            self.source = handle.read().split('\n')

    def __getitem__(self, idx) -> str:
        """Return the raw text line stored at position ``idx``."""
        return self.source[idx]

    def __len__(self) -> int:
        """Return the number of lines in the file."""
        return len(self.source)

# Which architecture to prepare data for: "cbow" or "skipgram".
model_name = "cbow"

tokenizer = get_tokenizer("basic_english", language="en")

dataset = NormDataset()

# Vocabulary over every tokenized line; tokens seen fewer than
# MIN_WORD_FREQUENCY times are left out and resolve to "<unk>" at lookup.
vocab = build_vocab_from_iterator(
    map(tokenizer, dataset),
    min_freq=MIN_WORD_FREQUENCY,
    specials=["<unk>"],
)
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Tokenize one raw text and map its tokens to vocabulary ids."""
    return vocab(tokenizer(x))


# Select the collate function that matches the chosen architecture.
try:
    collate_fn = {
        "cbow": collate_cbow,
        "skipgram": collate_skipgram,
    }[model_name]
except KeyError:
    raise ValueError("Choose model from: cbow, skipgram") from None

dataloader = DataLoader(
    dataset,
    collate_fn=partial(collate_fn, text_pipeline=text_pipeline),
    batch_size=96,
    shuffle=True,
)

## STEP 4: BUILD OUR MODELS
- Here we define our two models, CBOW and SkipGram
- CBOW uses all context words around a 'middle term' to predict the term in context
- SkipGram is given only the 'middle term' as input, and asked to predict each of the context words surrounding it
- Therefore, we require different models because the inputs are shaped differently

In [None]:
class CBOW_Model(nn.Module):
    """CBOW network: embed the context ids, average them, project to the vocab.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output scores produced per sample.
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        self.embeddings = nn.Embedding(
            vocab_size,
            EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs):
        """Return one score per vocabulary entry for each row of ``inputs``.

        The embeddings are averaged over dimension 1 before the linear layer.
        """
        averaged = self.embeddings(inputs).mean(dim=1)
        return self.linear(averaged)

class SkipGram_Model(nn.Module):
    """SkipGram network: embed the input ids and project them to the vocab.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output scores produced per input id.
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        self.embeddings = nn.Embedding(
            vocab_size,
            EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs_):
        """Return one score per vocabulary entry for each id in ``inputs_``."""
        return self.linear(self.embeddings(inputs_))

## STEP 5: BUILD OUR TRAINER CLASS
- We don't expressly need this, but it makes life easy in the final step to train this model
- The below code contains the classic training code for torch models, nothing really fancy going on here

In [None]:
class Trainer:
    """Minimal train/validate loop for a torch model.

    Args:
        model: module to optimise; moved to ``device`` on construction.
        epochs: number of epochs run by :meth:`train`.
        train_dataloader: iterable of batches indexable as ``batch[0]``
            (inputs) and ``batch[1]`` (labels), both tensors.
        train_steps: stop each training epoch after this many batches; a
            value never reached (e.g. ``None``) consumes the whole loader.
        val_dataloader: same batch format, used for validation.
        val_steps: per-epoch batch limit for validation.
        criterion: callable ``(outputs, labels) -> loss tensor``.
        optimizer: optimiser stepped once per training batch.
        lr_scheduler: scheduler stepped once per epoch.
        device: device the batches and the model are moved to.

    Attributes:
        loss: ``{"train": [...], "val": [...]}`` with one mean loss appended
            per epoch.
    """

    def __init__(
        self,
        model,
        epochs,
        train_dataloader,
        train_steps,
        val_dataloader,
        val_steps,
        criterion,
        optimizer,
        lr_scheduler,
        device,
    ):
        self.model = model
        self.epochs = epochs
        self.train_dataloader = train_dataloader
        self.train_steps = train_steps
        self.val_dataloader = val_dataloader
        self.val_steps = val_steps
        self.criterion = criterion
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.device = device

        self.loss = {"train": [], "val": []}
        self.model.to(self.device)

    def train(self):
        """Run every epoch: train, validate, report losses, step the scheduler."""
        for epoch in range(self.epochs):
            self._train_epoch()
            self._validate_epoch()
            print(
                "Epoch: {}/{}, Train Loss={:.5f}, Val Loss={:.5f}".format(
                    epoch + 1,
                    self.epochs,
                    self.loss["train"][-1],
                    self.loss["val"][-1],
                )
            )

            self.lr_scheduler.step()

    def _train_epoch(self):
        """Run one optimisation pass and record its mean loss."""
        self.model.train()
        epoch_loss = self._run_epoch(
            self.train_dataloader, self.train_steps, training=True
        )
        self.loss["train"].append(epoch_loss)

    def _validate_epoch(self):
        """Run one gradient-free evaluation pass and record its mean loss."""
        self.model.eval()
        epoch_loss = self._run_epoch(
            self.val_dataloader, self.val_steps, training=False
        )
        self.loss["val"].append(epoch_loss)

    def _run_epoch(self, dataloader, max_steps, training):
        """Iterate over ``dataloader`` once and return the mean batch loss.

        Batches whose input tensor has no elements are skipped: they carry no
        samples and would otherwise yield a NaN loss (or a shape error in the
        model). They still count towards ``max_steps`` so the number of
        batches drawn from the loader is unchanged. Returns NaN when no batch
        contributed a loss.
        """
        batch_losses = []

        # Gradients are only tracked while training.
        with torch.set_grad_enabled(training):
            for step, batch_data in enumerate(dataloader, 1):
                if batch_data[0].numel() > 0:
                    inputs = batch_data[0].to(self.device)
                    labels = batch_data[1].to(self.device)

                    if training:
                        self.optimizer.zero_grad()

                    outputs = self.model(inputs)
                    loss = self.criterion(outputs, labels)

                    if training:
                        loss.backward()
                        self.optimizer.step()

                    batch_losses.append(loss.item())

                if step == max_steps:
                    break

        if not batch_losses:
            # Avoid numpy's "mean of empty slice" warning; keep the NaN result.
            return float("nan")
        return np.mean(batch_losses)

## STEP 6: TRAIN THE MODEL
- Now we can finally train Word2Vec from scratch
- We set the model type and initialize, as well as initializing the optimizer and the learning rate scheduler
- We chose 100 epochs and a LR of 0.025 arbitrarily for this example, as well as the train and validation steps

In [None]:
# The vocabulary size fixes both the embedding table and the output layer.
vocab_size = len(vocab.get_stoi())
print(f"Vocabulary size: {vocab_size}")

# Pick the architecture from `model_name` so the model always matches the
# collate function that was selected when the dataloader was built.
model_class = {"cbow": CBOW_Model, "skipgram": SkipGram_Model}[model_name]
model = model_class(vocab_size=vocab_size)
criterion = nn.CrossEntropyLoss()

optimizer_class = optim.Adam
optimizer = optimizer_class(model.parameters(), lr=0.025)

epochs = 100


def lr_lambda(epoch):
    """Linear decay: LR multiplier goes from 1 at epoch 0 towards 0 at `epochs`."""
    return (epochs - epoch) / epochs


# NOTE(review): `verbose` is marked deprecated in recent PyTorch releases;
# confirm it is still accepted by the installed version.
lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda, verbose=True)

device = torch.device("cpu")

# NOTE: the same dataloader is used for training and validation, so the
# reported validation loss is measured on training data.
trainer = Trainer(
    model=model,
    # Single source of truth: the LR schedule above decays over `epochs` too.
    epochs=epochs,
    train_dataloader=dataloader,
    train_steps=10,
    val_dataloader=dataloader,
    val_steps=10,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    device=device,
)

trainer.train()
print("Training finished.")