Notebook for analyzing results of the models.

In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import WikiText2
import pandas as pd
from nltk.corpus import brown
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import sys
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import os

## CBOW

In [66]:
from net import Net_CBOW

In [67]:
# Load the saved CBOW model and its vocabulary.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load checkpoints you trust.
# map_location="cpu" lets a checkpoint saved from a GPU session load on a CPU-only machine;
# the cells below move the weights to CPU anyway before converting them to NumPy.
net = torch.load("saves/model_april22_3000datalim_20epoch.pt", map_location="cpu")
vocab = torch.load("saves/vocab_april22_3000datalim_20epoch.pt", map_location="cpu")

In [68]:
# Take the first parameter tensor of the model (presumably the embedding layer -- confirm
# against net.py) and convert it to a NumPy array.
# Alternative source: torch.load("saves/emb_april22_3000datalim_20epoch.pt").
embeddings = list(net.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# L2-normalise every row so that dot products between rows are cosine similarities.
norms = (embeddings ** 2).sum(axis=1) ** (0.5)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
# Note: the t-SNE projection below is fitted on the *un-normalised* embeddings.
embeddings_df = pd.DataFrame(embeddings)

# t-SNE projection to 2-D. The seed is fixed so that re-running the notebook
# reproduces the same layout.
tsne = TSNE(n_components=2, random_state=0)
embeddings_df_tsne = tsne.fit_transform(embeddings_df)
embeddings_df_tsne = pd.DataFrame(embeddings_df_tsne)

# Embedding rows are addressed by vocabulary id elsewhere in this notebook, so label
# row i with the word whose id is i rather than relying on the dict's insertion order.
embeddings_df_tsne.index = sorted(vocab, key=lambda token: vocab[token])


In [69]:
# Draw every vocabulary word as a text label at its 2-D t-SNE position.
# Purely numeric tokens are coloured green, all other tokens black.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "black")

fig = go.Figure(
    data=[
        go.Scatter(
            x=embeddings_df_tsne[0],
            y=embeddings_df_tsne[1],
            mode="text",
            text=embeddings_df_tsne.index,
            textposition="middle center",
            textfont={"color": color},
        )
    ]
)
# To export a standalone page instead: fig.write_html("word2vec_visualization.html")
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [70]:
def lookup_id(word, vocab=vocab):
    """Return the id stored for ``word`` in ``vocab``.

    Words that are not in ``vocab`` map to the id of the ``"<unk>"`` entry.
    The default ``vocab`` is whatever the global ``vocab`` was when this cell ran.
    """
    return vocab[word] if word in vocab else vocab["<unk>"]
def lookup_token(word_id, vocab=vocab):
    """Reverse lookup: return the first word in ``vocab`` whose id equals ``word_id``.

    Scans the vocabulary linearly and returns ``None`` when no word has that id.
    """
    return next((token for token in vocab if vocab[token] == word_id), None)
def get_top_similar(word: str, topN: int = 10):
    """Return the ``topN`` vocabulary words most similar to ``word``.

    Similarity is the dot product between the query's row of ``embeddings_norm``
    and every other row (cosine similarity, since the rows are L2-normalised).

    Args:
        word: Query word; must be a key of the global ``vocab``.
        topN: Number of neighbours to return.

    Returns:
        A dict mapping neighbour word -> similarity, in descending similarity
        order, or ``None`` (after printing a message) if ``word`` is not in the
        vocabulary. Ids with no matching vocabulary entry are reported as
        ``"<unk_>"``.
    """
    if word not in vocab:
        print("Out of vocabulary word")
        return
    word_id = lookup_id(word)

    word_vec = embeddings_norm[word_id].reshape(-1, 1)
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    # Exclude the query by id instead of assuming it always sorts into first place
    # (ties or NaNs could put another row ahead of it).
    ranked_ids = np.argsort(-dists)
    topN_ids = ranked_ids[ranked_ids != word_id][:topN].tolist()

    # Invert the vocabulary in a single pass rather than rescanning it once per
    # neighbour; the first word seen for an id wins, as in a linear scan.
    wanted = set(topN_ids)
    id_to_word = {}
    for k in vocab:
        k_id = vocab[k]
        if k_id in wanted and k_id not in id_to_word:
            id_to_word[k_id] = k

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = id_to_word.get(sim_word_id, "<unk_>")
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [71]:
# Print the nearest neighbours of the token "1", one per line, similarity to 3 decimals.
for word, sim in get_top_similar("1").items():
    print(f"{word}: {sim:.3f}")


0: 0.847
11: 0.823
45: 0.813
8: 0.812
2: 0.806
4: 0.799
6: 0.790
5: 0.780
cm: 0.766
7: 0.766


## Transformer Model

In [72]:
from transformer import TransformerModel
# Load the saved transformer model and the vocabulary it was trained with.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load checkpoints you trust.
# The TransformerModel import above is presumably what lets the pickled model be reconstructed.
# map_location="cpu" lets a checkpoint saved from a GPU session load on a CPU-only machine;
# the generation cell further down builds its inputs on the CPU device.
transformer = torch.load("saves/model_transformer_may5_0100am.pt", map_location="cpu")
vocab = torch.load("saves/vocab_may1_WT2_transformer_min25f.pt", map_location="cpu")

In [73]:
# First parameter tensor of the transformer's input-embedding module, as a NumPy array.
embeddings = list(transformer.input_emb.parameters())[0].cpu().detach().numpy()

# Per-row Euclidean norm, kept as a column so it broadcasts across each row.
norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** 0.5
embeddings_norm = embeddings / norms

# The DataFrame wraps the raw (un-normalised) embeddings.
embeddings_df = pd.DataFrame(embeddings)

In [74]:
# t-SNE projection of the embeddings to 2-D. The seed is fixed so that re-running
# the notebook reproduces the same layout.
tsne = TSNE(n_components=2, random_state=0)
embeddings_df_tsne = tsne.fit_transform(embeddings_df)
embeddings_df_tsne = pd.DataFrame(embeddings_df_tsne)

# Embedding rows are addressed by vocabulary id elsewhere in this notebook, so label
# row i with the word whose id is i rather than relying on the dict's insertion order.
embeddings_df_tsne.index = sorted(vocab, key=lambda token: vocab[token])

In [75]:
# Draw every vocabulary word as a text label at its 2-D t-SNE position.
# Purely numeric tokens are coloured green, all other tokens black.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "black")

fig = go.Figure(
    data=[
        go.Scatter(
            x=embeddings_df_tsne[0],
            y=embeddings_df_tsne[1],
            mode="text",
            text=embeddings_df_tsne.index,
            textposition="middle center",
            textfont={"color": color},
        )
    ]
)
# To export a standalone page instead: fig.write_html("word2vec_visualization.html")
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [76]:
def lookup_id(word, vocab=vocab):
    """Return the id stored in ``vocab`` for the lower-cased ``word``.

    Words whose lower-cased form is not in ``vocab`` map to the ``"<unk>"`` id.
    The default ``vocab`` is whatever the global ``vocab`` was when this cell ran.
    """
    key = word.lower()
    return vocab[key] if key in vocab else vocab["<unk>"]

def lookup_token(word_id, vocab=vocab):
    """Reverse lookup: return the first word in ``vocab`` whose id equals ``word_id``.

    Scans the vocabulary linearly and returns ``"<unk>"`` when no word has that id.
    """
    matches = (token for token in vocab if vocab[token] == word_id)
    return next(matches, "<unk>")

def normalize(emb):
    """Scale ``emb`` to unit Euclidean length.

    Divides ``emb`` by the square root of the sum of its squared entries.
    No guard is applied for an all-zero input.
    """
    squared_total = (emb ** 2).sum()
    return emb / squared_total ** 0.5

def embed(word):
    """Return the embedding of ``word`` as a torch tensor.

    The row of the global ``embeddings`` array is selected with ``lookup_id``.
    """
    row = embeddings[lookup_id(word)]
    return torch.tensor(row)

def analogy(worda, wordA, wordb, n=5, include_inputs = False):
    """Print and return the top ``n`` answers to "worda is to wordA as wordb is to ?".

    The target vector is ``embed(wordA) - embed(worda) + embed(wordb)``, normalised
    to unit length; candidates are ranked by dot product with ``embeddings_norm``.

    Args:
        worda, wordA, wordb: The three words of the analogy. They are matched
            case-insensitively, in the same way ``lookup_id`` lower-cases its input.
        n: Number of answers to report. Fewer are reported if the vocabulary
            runs out of candidates.
        include_inputs: When False, the three input words are skipped in the ranking.

    Returns:
        A list of ``(word, similarity)`` tuples in descending similarity order.

    Raises:
        KeyError: If any of the three words is not in ``vocab``.
    """
    # Validate with the same lower-casing that lookup_id applies, so that a word
    # accepted here is the word that actually gets embedded.
    inputs = [w.lower() for w in (worda, wordA, wordb)]
    for w in inputs:
        if w not in vocab:
            raise KeyError(w)

    emba = embed(worda)
    embA = embed(wordA)
    embb = embed(wordb)

    embB = normalize(embA - emba + embb)
    # Work in NumPy from here on; embed() returns a torch tensor.
    embB = np.asarray(embB).reshape(-1, 1)
    dists = np.matmul(embeddings_norm, embB).flatten()

    ranked = np.argsort(-dists)
    excluded = set() if include_inputs else set(inputs)
    out = []
    print(worda, "is to", wordA, "as", wordb, "is to: ")
    for word_id in ranked:
        if len(out) >= n:
            break
        # lookup_token is a linear scan over the vocabulary, so call it once per candidate.
        token = lookup_token(word_id)
        if token in excluded:
            continue
        out.append((token, dists[word_id]))
        print("    ", "{}: {:.3f}".format(token, dists[word_id]))
    print("----------------")
    return out

In [77]:
def closest_word(embedding, n = 1):
    """Return the ``n`` vocabulary words closest to ``embedding``.

    The embedding is normalised to unit length and compared by dot product with
    every row of ``embeddings_norm``.

    Returns:
        A pair ``(words, similarities)`` of equal-length lists, best match first.
    """
    unit = normalize(embedding)
    column = np.reshape(unit, (len(unit), 1))
    scores = np.matmul(embeddings_norm, column).flatten()

    best = np.argsort(-scores)[:n]
    words = []
    sims = []
    for idx in best:
        words.append(lookup_token(idx))
        sims.append(scores[idx])
    return words, sims

def mathify(word):
    """Pair a word with its embedding: returns ``(word, embed(word))``."""
    vector = embed(word)
    return (word, vector)
def multiply(word: tuple, factor):
    """Scale the embedding held in ``word[1]`` by ``factor``.

    Returns ``(scaled_embedding, words)`` where ``words`` are the 3 closest
    vocabulary words according to ``closest_word``.

    NOTE(review): ``mathify`` puts the embedding at index 1, but this result puts
    it at index 0, so the result cannot be fed back into ``multiply``/``add`` as is.
    """
    scaled = word[1] * factor
    nearest_words, _ = closest_word(scaled, 3)
    return (scaled, nearest_words)
def add(worda: tuple, wordb: tuple):
    """Sum the embeddings held in ``worda[1]`` and ``wordb[1]``.

    Returns ``(summed_embedding, words)`` where ``words`` are the 3 closest
    vocabulary words according to ``closest_word``.

    NOTE(review): ``mathify`` puts the embedding at index 1, but this result puts
    it at index 0, so the result cannot be fed back into ``multiply``/``add`` as is.
    """
    total = worda[1] + wordb[1]
    nearest_words, _ = closest_word(total, 3)
    return (total, nearest_words)

In [78]:
# Analogy sanity checks; the hoped-for answer is noted beside each query.
for query in (
    ("mother", "woman", "father"),  # man
    ("kingdom", "king", "empire"),  # emperor
    ("2001", "1", "2002"),  # 2
):
    analogy(*query, include_inputs=False)


mother is to woman as father is to: 
     man: 0.389
     soldier: 0.350
     someone: 0.341
     girl: 0.340
     character: 0.332
----------------
kingdom is to king as empire is to: 
     emperor: 0.285
     composer: 0.274
     doctor: 0.269
     prince: 0.262
     actor: 0.250
----------------
2001 is to 1 as 2002 is to: 
     2: 0.265
     3: 0.257
     7: 0.249
     returned: 0.236
     13: 0.228
----------------


In [79]:
# More analogy sanity checks; the hoped-for answer is noted beside each query.
for query in (
    ("2001", "2002", "2005"),  # 2006
    ("1", "3", "4"),  # 6
    ("bright", "yellow", "dark"),  # brown
):
    analogy(*query, include_inputs=False)

2001 is to 2002 as 2005 is to: 
     2006: 0.454
     1992: 0.423
     1998: 0.415
     1999: 0.415
     2007: 0.415
----------------
1 is to 3 as 4 is to: 
     6: 0.416
     2: 0.405
     5: 0.380
     8: 0.370
     7: 0.329
----------------
bright is to yellow as dark is to: 
     blue: 0.276
     natural: 0.251
     green: 0.243
     human: 0.238
     romantic: 0.234
----------------


In [80]:
# Hoped-for answer: hot
analogy("bright", "dark", "cold", include_inputs=False)
# Words nearest to twice the "cold" embedding (hoped-for answer: bright).
print(multiply(mathify("cold"), factor=2)[1])

bright is to dark as cold is to: 


     thick: 0.266
     civil: 0.245
     revolutionary: 0.230
     competitive: 0.227
     aggressive: 0.218
----------------
['cold', 'warm', 'civil']


In [81]:
# Text generation testing: continue a fixed prompt by sampling 20 tokens from the transformer.
device = torch.device("cpu")
ntokens = len(vocab)
model = transformer
# Switch to inference mode so that train-only layers (e.g. dropout, if the model
# has any) do not add noise while sampling.
model.eval()
temperature = 2  # logits are divided by this before exponentiation

# Named `prompt`/`tokens` rather than `input` so the builtin input() is not shadowed.
prompt = 'he claimed in an interview that '
print(prompt, end='')
prompt_ids = [lookup_id(token) for token in prompt.strip().split(" ")]
# Column of token ids, shape (sequence_length, 1).
tokens = torch.tensor(prompt_ids).view(len(prompt_ids), 1).to(device)

# The file was previously opened but never written to (leaving it empty);
# it now receives the same text that is printed.
with open('out_generation.txt', 'w') as outf:
    outf.write(prompt)
    with torch.no_grad():  # no tracking history
        for step in range(20):
            output = model(tokens, False)
            # Weights for the next token, taken from the last position of the output.
            word_weights = output[-1].squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            # Append the sampled id directly; no round trip through a float tensor.
            tokens = torch.cat([tokens, word_idx.view(1, 1).to(device)], 0)
            word = lookup_token(word_idx.item())
            print(word, end=' ')
            outf.write(word + ' ')

he claimed in an interview that replaced lieutenant political dubbed line women took ones influenced charlie important force interviews which founding conclusion image appears killed despite 