In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import WikiText2
import pandas as pd
from nltk.corpus import brown
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import sys
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import os

## Analyze the trained CBOW embeddings

In [2]:
from net import Net_CBOW

In [71]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Epoch number of the checkpoint to analyse (interpolated into the file name).
epoch_n = 19

# NOTE: torch.load unpickles the files, so only trusted checkpoints should be loaded.
model_path = f"saves/apr27epochs/model_april27_WT2_nodatalim_10epoch_128dim_100minf_epoch{epoch_n}.pt"
vocab_path = "saves/vocab_april27_WT2_nodatalim_10epoch_128dim_100minf.pt"
net = torch.load(model_path, map_location=device)
vocab = torch.load(vocab_path)

In [72]:
# Fixed seed so the t-SNE layout is reproducible after Restart & Run All.
TSNE_SEED = 42

# The first parameter tensor of the model is used as the embedding matrix;
# its rows are indexed by vocabulary id (vocab[word]) throughout this notebook.
embeddings = next(iter(net.parameters()))
embeddings = embeddings.cpu().detach().numpy()

# L2-normalise every row; keepdims=True keeps `norms` as an (n, 1) column so the
# division broadcasts row-wise.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings_norm = embeddings / norms

# NOTE(review): t-SNE is fitted on the raw (un-normalised) embeddings, exactly as
# before; confirm whether `embeddings_norm` was the intended input.
embeddings_df = pd.DataFrame(embeddings)

# Label each row with the token whose id equals the row number, instead of
# relying on the iteration order of `vocab` happening to match the id order.
tokens_by_id = sorted(vocab.keys(), key=lambda token: vocab[token])
assert len(tokens_by_id) == len(embeddings), (
    "vocabulary size does not match the number of embedding rows"
)

# t-SNE transform down to 2 dimensions for plotting.
tsne = TSNE(n_components=2, random_state=TSNE_SEED)
embeddings_df_tsne = pd.DataFrame(
    tsne.fit_transform(embeddings_df),
    index=tokens_by_id,
)


In [82]:
# Purely numeric tokens are drawn in green, every other token in black.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "black")

# Render each token as a text label at its 2-D t-SNE coordinates.
fig = go.Figure().add_trace(
    go.Scatter(
        text=embeddings_df_tsne.index,
        x=embeddings_df_tsne[0],
        y=embeddings_df_tsne[1],
        mode="text",
        textposition="middle center",
        textfont={"color": color},
    )
)
# Optional export, left disabled: fig.write_html("word2vec_visualization.html")
fig


In [74]:
def lookup_id(word, vocab=vocab):
    """Return the id stored in ``vocab`` for ``word``.

    Words missing from ``vocab`` resolve to the id stored under ``"<unk>"``.
    The default ``vocab`` is the module-level mapping bound when this
    function is defined.
    """
    key = word if word in vocab else "<unk>"
    return vocab[key]
def lookup_token(word_id, vocab=vocab):
    """Return the first token in ``vocab`` whose id equals ``word_id``.

    Performs a linear scan over ``vocab`` in iteration order and returns
    ``None`` when no token maps to ``word_id``.
    """
    matches = (token for token in vocab if vocab[token] == word_id)
    return next(matches, None)
def get_top_similar(word: str, topN: int = 10):
    """Return the ``topN`` vocabulary words most similar to ``word``.

    Similarity is the dot product between rows of the module-level
    ``embeddings_norm`` matrix (cosine similarity, given that its rows are
    L2-normalised).

    Args:
        word: Query token; must be present in the module-level ``vocab``.
        topN: Maximum number of neighbours to return. Values <= 0 yield an
            empty dict.

    Returns:
        A dict mapping neighbour token -> similarity, inserted in descending
        order of similarity, or ``None`` (after printing a message) when
        ``word`` is not in the vocabulary.
    """
    if word not in vocab:
        print("Out of vocabulary word")
        return None
    word_id = lookup_id(word)

    # Column vector of the query so the matmul yields one score per vocabulary row.
    word_vec = embeddings_norm[word_id].reshape(-1, 1)
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    # Drop the query by id rather than assuming it always sorts to rank 0:
    # with tied scores (e.g. duplicate embedding rows) another row can come first,
    # which would leak the query into the results and discard a real neighbour.
    ranked_ids = np.argsort(-dists)
    topN_ids = ranked_ids[ranked_ids != word_id][: max(topN, 0)]

    topN_dict = {}
    for sim_word_id in topN_ids:
        # Reuse the shared reverse-lookup helper; ids with no token keep the
        # original "<unk_>" placeholder.
        sim_word = lookup_token(sim_word_id)
        if sim_word is None:
            sim_word = "<unk_>"
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [89]:
# List the nearest neighbours of "things", one per line, similarity to 3 decimals.
for word, sim in get_top_similar("things").items():
    print(f"{word}: {sim:.3f}")


crazy: 0.806
band: 0.792
love: 0.787
what: 0.786
let: 0.780
ai: 0.778
've: 0.770
record: 0.770
go: 0.764
change: 0.763


In [90]:
# Vector arithmetic on the raw embeddings: "men" - "man" + "thing".
emb1, emb2, emb3 = (embeddings[vocab[token]] for token in ("men", "man", "thing"))
emb4 = emb1 - emb2 + emb3

# Scale the combined vector to unit length so the dot products below are
# comparable with the row-normalised embedding matrix.
emb4_norm = (emb4 ** 2).sum() ** 0.5
emb4 = emb4 / emb4_norm

# Score every vocabulary row against the query (as a column vector).
emb4 = emb4.reshape(-1, 1)
dists = (embeddings_norm @ emb4).flatten()

# NOTE(review): the three query words are not excluded from the ranking, so they
# can appear among the top hits themselves.
top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print(f"{lookup_token(word_id)}: {dists[word_id]:.3f}")

thing: 0.904
men: 0.894
sports: 0.873
started: 0.865
time: 0.862


In [77]:
# Pieces of the tensor file names: "<folder><split><version1><x|y><version2>".
folder = "train_data/"
version1, version2 = "_data_", "_wt2_window4.pt"

In [78]:
# Load the test-split "x" and "y" files (in that order) from the configured folder.
x_test, y_test = (
    torch.load(f"{folder}test{version1}{part}{version2}") for part in ("x", "y")
)

In [79]:
# Display the first entry of x_test as the cell output.
x_test[0]

[629, 4, 22, 29, 108, 1, 392, 5]

In [80]:
# Map the id stored at y_test[1] back to its token
# (presumably the target word for that example; confirm against the data prep).
lookup_token(y_test[1])

'film'

In [81]:
# Decode every id in x_test[3] back to its token.
list(map(lookup_token, x_test[3]))

['an', 'english', 'film', ',', 'and', 'theatre', 'actor', '.']