In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path
from functools import partial
from sklearn.manifold import TSNE
import plotly.graph_objects as go

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import WikiText2

torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
EPOCHS = 5
CONTEXT_SIZE = 2
BATCH_SIZE = 64
SHUFFLE = False
MAX_VOCAB_TOKENS = 5000
MIN_TOKEN_FREQ = 50
EMBED_DIM = 500

MAX_SEQUENCE_LENGTH = 256
CBOW_N_WORDS = 4

CWD = os.getcwd()

In [3]:
def build_dataiter(batch_size:int, shuffle:bool, split:str):
    data_iter = WikiText2(root=CWD, split=(split))
    data_iter = to_map_style_dataset(data_iter)
    return data_iter

train_iter = build_dataiter(BATCH_SIZE, SHUFFLE, split='train')
val_iter = build_dataiter(BATCH_SIZE, SHUFFLE, split='valid')
print('Length of train dataset:', len(train_iter))
print('Length of validation dataset:', len(val_iter))
print('Example data:', train_iter[42])

Length of train dataset: 36718
Length of validation dataset: 3760
Example data:  Kurt and Riela were featured in the Nintendo 3DS crossover Project X Zone , representing the Valkyria series . Media.Vision would return to the series to develop Valkyria : Azure Revolution , with Ozawa returning as director . Azure Revolution is a role @-@ playing video game for the PlayStation 4 that forms the beginning of a new series within the Valkyria franchise . 



In [4]:
tokenizer = get_tokenizer('basic_english')
unk = '<unk>'

def build_vocab(vocab=None):
    if not vocab:
        vocab = build_vocab_from_iterator(
            iterator=map(tokenizer, train_iter),
            min_freq=MIN_TOKEN_FREQ,
            specials=[unk],
            max_tokens=MAX_VOCAB_TOKENS
        )
        vocab.set_default_index(vocab[unk])
    return vocab

vocab = build_vocab()

In [5]:
text_pipeline = lambda x: vocab(tokenizer(x))

def collate_fn(batch):
    contexts, targets = [], []
    for text in batch:
        text_token_ids = text_pipeline(text)
    
        for i in range(CONTEXT_SIZE, len(text_token_ids) - CONTEXT_SIZE):
            context = ([text_token_ids[i - j - 1] for j in range(CONTEXT_SIZE)] + 
                       [text_token_ids[i + j + 1] for j in range(CONTEXT_SIZE)])
            target = text_token_ids[i]
            contexts.append(context)
            targets.append(target)
            
    contexts = torch.tensor(contexts, dtype=torch.long)
    targets = torch.tensor(targets, dtype=torch.long)
    return contexts, targets

train_loader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=partial(collate_fn))
val_loader = DataLoader(val_iter, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=partial(collate_fn))

In [6]:
vocab_size = len(vocab.get_stoi())
print(vocab_size, 'elements in vocab')

4099 elements in vocab


In [12]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings=nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=EMBED_DIM,
            )
        self.linear1 = nn.Linear(in_features=EMBED_DIM, out_features=vocab_size)
        self.linear2 = nn.Linear(in_features=vocab_size, out_features=vocab_size)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        x = embeds.mean(axis=1)
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x

model = CBOW(vocab_size=vocab_size, embedding_dim=EMBED_DIM)
model.to(device)

CBOW(
  (embeddings): Embedding(4099, 500)
  (linear1): Linear(in_features=500, out_features=4099, bias=True)
  (linear2): Linear(in_features=4099, out_features=4099, bias=True)
)

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_losses = []
val_losses = []

def train_one_epoch():
    model.train()
    running_loss = 0
    last_loss = 0
    
    for i, data in enumerate(train_loader):
        inputs, labels = data[0].to(device), data[1].to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
            
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if i % 100 == 99:
            last_loss = running_loss / 100 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
            
    train_losses.append(last_loss)
            
    return last_loss

def val_one_epoch(train_loss):
    model.eval()
    running_loss = 0
    
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item() 
            
    val_loss = running_loss / (i + 1)
    print('LOSS train {} valid {}'.format(train_loss, val_loss))
    
    val_losses.append(val_loss)

In [14]:
loss = dict()
loss['train loss'] = train_losses
loss['validatoin loss'] = val_losses

def save_model():
    model_path = os.path.join(CWD, 'model.pt')
    torch.save(model, model_path)

def save_loss():
    loss_path = os.path.join(CWD, 'loss.json')
    with open(loss_path, 'w') as fp:
        json.dump(loss, fp)

def save_vocab(vocab):
    vocab_path = os.path.join(CWD + 'vocab.pt')
    torch.save(vocab, vocab_path)

In [19]:
for epoch in range(EPOCHS):
    print('Epoch number:', epoch)
    train_loss = train_one_epoch()
    val_one_epoch(train_loss)
print('Training Complete')

save_model()
save_loss()
save_vocab(vocab)
print('Artifacts Saved')

Epoch number: 0
  batch 100 loss: 0.8124179857969284
  batch 200 loss: 0.8061182859539986
  batch 300 loss: 0.8351352351903916
  batch 400 loss: 0.8517639979720115
  batch 500 loss: 0.8538113594055176
LOSS train 0.8538113594055176 valid 14.720406790911142
Epoch number: 1
  batch 100 loss: 0.8169279763102532
  batch 200 loss: 0.8176163744926452
  batch 300 loss: 0.8391732397675514
  batch 400 loss: 0.8655244570970535
  batch 500 loss: 0.8575838950276374
LOSS train 0.8575838950276374 valid 14.836995674391924
Epoch number: 2
  batch 100 loss: 0.8210721081495285
  batch 200 loss: 0.8169447623193264
  batch 300 loss: 0.8471569016575813
  batch 400 loss: 0.8654600831866265
  batch 500 loss: 0.8634988290071487
LOSS train 0.8634988290071487 valid 14.987629793458066
Epoch number: 3
  batch 100 loss: 0.8193483644723892
  batch 200 loss: 0.820979431271553
  batch 300 loss: 0.851840243935585
  batch 400 loss: 0.8695668342709542
  batch 500 loss: 0.8688784703612328
LOSS train 0.8688784703612328 val

In [17]:
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

# get embeddings
embeddings_df = pd.DataFrame(embeddings)

# t-SNE transform
tsne = TSNE(n_components=2)
embeddings_df_trans = tsne.fit_transform(embeddings_df)
embeddings_df_trans = pd.DataFrame(embeddings_df_trans)

# get token order
embeddings_df_trans.index = vocab.get_itos()

# if token is a number
is_numeric = embeddings_df_trans.index.str.isnumeric()

color = np.where(is_numeric, "green", "black")
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=embeddings_df_trans[0],
        y=embeddings_df_trans[1],
        mode="text",
        text=embeddings_df_trans.index,
        textposition="middle center",
        textfont=dict(color=color),
    )
)
fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

