https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0

https://towardsdatascience.com/running-jupyter-notebook-on-the-cloud-in-15-mins-azure-79b7797e4ef6

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import WikiText2
import pandas as pd
from nltk.corpus import brown
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import sys
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import os
import random



In [2]:
def tensor_to_array(t):
    """Return a NumPy array built from the ``.item()`` value of each entry of *t*.

    *t* is iterated along its first axis; every entry must support
    ``.item()`` (i.e. hold exactly one value).
    """
    return np.array([entry.item() for entry in t])

In [3]:
# Run tag; the training cell embeds it in the checkpoint file names it writes.
version = "april29_WT2_nodatalim_20epoch_64dim_100minf_4window"
# Vocabulary saved by an earlier run. Elsewhere in this file it is used as a
# mapping word -> embedding row id (vocab[word], `word in vocab`, iteration).
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
vocab = torch.load("saves/vocab_april27_WT2_nodatalim_10epoch_128dim_100minf.pt")
# Notebook display of the vocabulary size.
len(vocab)

2156

In [4]:
from datasets import load_dataset

# Load WikiText-2 and select one split.
wikitext2 = load_dataset("wikitext", "wikitext-2-v1")
DATA_SPLIT = "train"

# Keep non-empty rows that contain no "=" (this drops the "= Heading =" rows
# and any other row with an equals sign), lowercase and trim them, split on
# single spaces and terminate each paragraph with a "\n" token.
text = []
for row in wikitext2[DATA_SPLIT]['text']:
    if not row or "=" in row:
        continue
    text.append(row.lower().strip().split(" ") + ["\n"])

DATA_LIMIT = None #paragraph limit
# Flatten the first DATA_LIMIT paragraphs (all of them when None) into one
# long token sequence.
all_words = pd.Series(
    [word for paragraph in text[:DATA_LIMIT] for word in paragraph]
)

# Fraction of corpus tokens that are present in the vocabulary.
in_vocab_count = sum(word in vocab for word in all_words)
print(in_vocab_count/len(all_words))

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
SCANNING_WINDOW = 4
#maybe need to split into paragraphs b/c different topics...
def get_data(index, window, data):
    """Return the context around position *index* and the word at *index*.

    Args:
        index: position of the middle word in *data* (expected non-negative).
        window: maximum number of words taken on each side of *index*.
        data: sliceable, indexable sequence of words.

    Returns:
        A ``(context, middle)`` tuple. ``context`` is a list holding up to
        *window* words before *index* followed by up to *window* words after
        it; it is shorter near either end of *data*. ``middle`` is
        ``data[index]``.
    """
    left_start = index - window
    if index >= 0 and left_start < 0:
        # Near the start of the data a negative slice start would wrap around
        # to the end of the sequence and drop the real left context.
        left_start = 0
    left = list(data[left_start:index])
    right = list(data[index + 1:index + window + 1])
    return left + right, data[index]

Start here for training...

In [None]:
# Pieces of the dataset file names, which the loading cells assemble as
# f"{folder}<train|test>{version1}<x|y>{version2}".
folder = "train_data/"
version1 = "_data_"
version2 = "_wt2_window4_100minf.pt"

In [None]:
# Test inputs; the training cell iterates them as contexts, paired with y_test.
x_test = torch.load(f"{folder}test{version1}x{version2}")

In [None]:
# Test targets; the training cell pairs them with x_test when computing losses.
y_test = torch.load(f"{folder}test{version1}y{version2}")

In [None]:
# Training inputs; the training cell reads x_train[index] as the context.
x_train = torch.load(f"{folder}train{version1}x{version2}")
# Notebook display: total number of input samples across train and test.
len(x_train) + len(x_test)

In [None]:
# Training targets; the training cell reads y_train[index] as the target.
y_train = torch.load(f"{folder}train{version1}y{version2}")
# Notebook display: total number of targets across train and test.
len(y_train) + len(y_test)

In [None]:
from net import Net_CBOW

In [None]:
# Second constructor argument of Net_CBOW (imported from the project's `net`
# module); the first is the vocabulary size.
# NOTE(review): the `version` tag defined earlier in this file says "64dim"
# while this value is 128 — confirm which is intended, since the tag ends up
# in the checkpoint file names.
EMBED_DIMENSION = 128
net = Net_CBOW(len(vocab), EMBED_DIMENSION)

net.zero_grad()
criterion = nn.CrossEntropyLoss()
# The training cell appends one test-set loss per test sample per epoch here.
losses = []

In [None]:
# Number of passes over the training data; also the scheduler's horizon.
NUM_EPOCHS = 20

optimizer = optim.Adam(net.parameters(), lr=0.025)
# The positional 1.0 and 0.0 are LinearLR's start_factor and end_factor: the
# learning-rate multiplier goes linearly from 1 to 0 over NUM_EPOCHS scheduler
# steps (the training cell steps the scheduler once per epoch).
scheduler = optim.lr_scheduler.LinearLR(optimizer, 1.0, 0.0, total_iters=NUM_EPOCHS)

In [None]:
# Training cell: for every epoch, run one optimizer step per training sample,
# then record the loss of every test sample, step the LR scheduler, reshuffle
# the sample order and write a checkpoint.

# Header row for the per-epoch progress bar.
print("RUN       " + ("•••••••••|"*10))
indices = list(range(len(x_train)))

# Print a progress tick about every 1% of the training set. Clamp to 1 so a
# training set with fewer than 100 samples does not cause a modulo by zero.
progress_step = max(len(x_train) // 100, 1)

# torch.save does not create missing directories, so make sure it exists
# before any training time is spent.
checkpoint_dir = "saves/apr28epochs"
os.makedirs(checkpoint_dir, exist_ok=True)

torch.save(net, f"{checkpoint_dir}/model_{version}_init.pt")
for epoch in range(NUM_EPOCHS):
    print("RUN", str(epoch+1)+"/"+str(NUM_EPOCHS), end=": ")
    # `indices` is in natural order for the first epoch and reshuffled at the
    # end of every epoch.
    for i, index in enumerate(indices):
        if i % progress_step == 0:
            print("•", end="")
        context, target = x_train[index], y_train[index]
        optimizer.zero_grad()   # zero the gradient buffers
        output = net(torch.tensor(context))
        loss = criterion(output, torch.tensor(target))
        loss.backward()
        optimizer.step()    # Does the update

    # Evaluation only reads the loss value, so skip autograd bookkeeping.
    with torch.no_grad():
        for context, target in zip(x_test, y_test):
            output = net(torch.tensor(context))
            losses.append(criterion(output, torch.tensor(target)).item())
    print(scheduler.get_last_lr())

    scheduler.step()
    print()
    random.shuffle(indices)
    # Checkpoint names use the zero-based epoch index (the printed counter
    # above is one-based).
    torch.save(net, f"{checkpoint_dir}/model_{version}_epoch{str(epoch)}.pt")

In [None]:
# torch.save(net, f"saves/model_{version}.pt")
# torch.save(vocab, f"saves/vocab_{version}.pt")

#Note that 4/26 20epoch version got to a loss of 10

In [None]:
# `losses` holds len(x_test) consecutive entries per epoch; average each
# epoch-sized slice to get one mean test loss per epoch.
test_size = len(x_test)
loss_per_epoch = [
    sum(losses[epoch_idx * test_size:(epoch_idx + 1) * test_size]) / test_size
    for epoch_idx in range(NUM_EPOCHS)
]

In [None]:
# Mean test loss per epoch.
plt.plot(loss_per_epoch)
plt.show()

In [None]:
# Every recorded test-sample loss, in the order they were appended.
plt.plot(losses)
plt.show()

In [None]:
# Take the model's first parameter tensor as the embedding matrix
# (presumably the embedding layer's weight — depends on Net_CBOW; confirm)
# and convert it to a NumPy array.
embeddings = next(iter(net.parameters()))
embeddings = embeddings.detach().cpu().numpy()

# Divide every row by its L2 norm; the norms are kept as a column vector so
# the division broadcasts row-wise.
norms = ((embeddings ** 2).sum(axis=1) ** (0.5)).reshape(-1, 1)
embeddings_norm = embeddings / norms
embeddings_norm.shape

In [None]:
embeddings_df = pd.DataFrame(embeddings)

# Project the (un-normalised) embedding rows to 2-D with t-SNE.
tsne = TSNE(n_components=2)
embeddings_df_tsne = tsne.fit_transform(embeddings_df)
embeddings_df_tsne = pd.DataFrame(embeddings_df_tsne)

# Label row i with the word whose id is i. The rest of this file addresses
# embedding rows by vocab[word], so order the words by id instead of relying
# on the vocabulary's iteration order happening to match.
embeddings_df_tsne.index = sorted(vocab, key=lambda word: vocab[word])

In [None]:
# Colour purely numeric tokens green and everything else gray.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "gray")

# One text-only scatter trace: each word is drawn at its 2-D t-SNE position.
word_trace = go.Scatter(
    x=embeddings_df_tsne[0],
    y=embeddings_df_tsne[1],
    mode="text",
    text=embeddings_df_tsne.index,
    textposition="middle center",
    textfont=dict(color=color),
)

fig = go.Figure()
fig.add_trace(word_trace)

# Save a standalone HTML copy, then display the figure in the notebook.
fig.write_html("word2vec_visualization.html")
fig

In [None]:
def lookup_id(word, vocab=vocab):
    """Return ``vocab[word]``, or ``vocab["<unk>"]`` when *word* is absent."""
    key = word if word in vocab else "<unk>"
    return vocab[key]

In [None]:
def lookup_token(word_id, vocab=vocab):
    """Return the first word in *vocab* whose id equals *word_id*, else None.

    This is a linear scan over the vocabulary.
    """
    matches = (word for word in vocab if vocab[word] == word_id)
    return next(matches, None)

In [None]:
def get_top_similar(word: str, topN: int = 10):
    """Return the *topN* words most similar to *word*.

    Similarity is the dot product between rows of the module-level
    ``embeddings_norm`` matrix (built elsewhere in this file by dividing each
    embedding row by its L2 norm).

    Args:
        word: query word; must be present in the module-level ``vocab``.
        topN: number of neighbours to return.

    Returns:
        A dict mapping neighbour word -> similarity, in descending order of
        similarity, or ``None`` (after printing a message) when *word* is
        out of vocabulary. Ids with no matching word are reported under the
        key ``"<unk_>"``.
    """
    if word not in vocab:
        print("Out of vocabulary word")
        return
    word_id = lookup_id(word)

    word_vec = np.reshape(embeddings_norm[word_id], (-1, 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    # The best match is assumed to be the query word itself, so skip rank 0.
    topN_ids = np.argsort(-dists)[1 : topN + 1]

    # Invert the vocabulary once instead of rescanning it for every
    # neighbour. setdefault keeps the first word seen for an id, which is the
    # word a front-to-back scan would have returned.
    id_to_word = {}
    for k in vocab:
        id_to_word.setdefault(int(vocab[k]), k)

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = id_to_word.get(int(sim_word_id), "<unk_>")
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [None]:
# get_top_similar returns None (after printing a message) for an
# out-of-vocabulary word; fall back to an empty mapping so this loop does not
# raise AttributeError on `.items()`.
for word, sim in (get_top_similar("1") or {}).items():
    print("{}: {:.3f}".format(word, sim))

In [None]:
# Analogy query on the raw embedding rows: father - man + female.
emb1 = embeddings[vocab["father"]]
emb2 = embeddings[vocab["man"]]
emb3 = embeddings[vocab["female"]]

# Combine the three vectors, scale the result to unit L2 length and shape it
# as a column so it can be multiplied against the normalised matrix.
emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** 0.5
emb4 = (emb4 / emb4_norm).reshape(-1, 1)

# Dot product of every normalised embedding row with the query vector.
dists = (embeddings_norm @ emb4).flatten()

# Five highest-scoring rows (the query words themselves are not excluded).
top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print(f"{lookup_token(word_id)}: {dists[word_id]:.3f}")