In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import WikiText2
import pandas as pd
from nltk.corpus import brown
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import sys
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import os

## Analyze

In [107]:
from net import Net_CBOW

In [118]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Checkpoint selector: the epoch number is appended to `version` in the file name below.
epoch_n = 5
version = "april30_WT2_nodatalim_20epoch_64dim_100minf_4window_epoch"
# torch.load unpickles the saved object, so only point it at trusted files.
# map_location remaps the stored tensors onto `device`.
net = torch.load(f"saves/apr30epochs/model_{version}{epoch_n}.pt", map_location=device)
# NOTE(review): this vocab file name says 50minf/2window while the model version
# above says 100minf/4window — confirm the two artifacts belong together.
vocab = torch.load(f"saves/vocab_april30_WT2_nodatalim_20epoch_64dim_50minf_2window.pt")

cpu


In [119]:
# Take the model's first parameter tensor as the word-embedding matrix.
# Elsewhere in this notebook row i is looked up with vocabulary id i.
embeddings = list(net.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# L2-normalise every row so that a dot product between two rows is their
# cosine similarity.
norms = (embeddings ** 2).sum(axis=1) ** (0.5)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_df = pd.DataFrame(embeddings)

# Project the (un-normalised) embeddings to 2-D with t-SNE.
# t-SNE is stochastic; fixing random_state makes the layout reproducible
# across kernel restarts.
tsne = TSNE(n_components=2, random_state=0)
embeddings_df_tsne = tsne.fit_transform(embeddings_df)
embeddings_df_tsne = pd.DataFrame(embeddings_df_tsne)

# Label row i with the word whose id is i. Sorting the words by id makes this
# independent of the vocabulary's iteration order (and is a no-op when the
# vocabulary is already stored in id order).
embeddings_df_tsne.index = sorted(vocab, key=vocab.__getitem__)


In [120]:
# Purely numeric tokens are drawn in green, every other token in black.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "black")

# Each word is rendered as a text label at its 2-D t-SNE coordinates.
word_labels = go.Scatter(
    x=embeddings_df_tsne[0],
    y=embeddings_df_tsne[1],
    text=embeddings_df_tsne.index,
    mode="text",
    textposition="middle center",
    textfont={"color": color},
)

fig = go.Figure()
fig.add_trace(word_labels)
# fig.write_html("word2vec_visualization.html")
fig


In [10]:
def lookup_id(word, vocab=vocab):
    """Return the id stored for `word` in `vocab`.

    Words that are not in `vocab` map to the id of the "<unk>" entry.
    The default `vocab` is whatever the module-level `vocab` was bound to
    when this function was defined.
    """
    return vocab[word] if word in vocab else vocab["<unk>"]
def lookup_token(word_id, vocab=vocab):
    """Return the first word in `vocab` whose id equals `word_id`.

    The vocabulary is scanned linearly in iteration order; None is returned
    when no entry matches.
    """
    matching_words = (word for word in vocab if vocab[word] == word_id)
    return next(matching_words, None)
def get_top_similar(word: str, topN: int = 10):
    """Return the `topN` nearest neighbours of `word` by dot product.

    Similarities are dot products between rows of the module-level
    `embeddings_norm` matrix. The result is a dict mapping each neighbour
    word to its similarity, in descending order of similarity.

    If `word` is not in the module-level `vocab`, a message is printed and
    None is returned.
    """
    if word not in vocab:
        print("Out of vocabulary word")
        return

    # Column vector for the query word, so the product below yields one
    # similarity per vocabulary row.
    query_column = embeddings_norm[lookup_id(word)].reshape(-1, 1)
    similarities = np.matmul(embeddings_norm, query_column).flatten()

    # Rank 0 is skipped on the assumption that it is the query word itself.
    ranked_ids = np.argsort(-similarities)

    neighbours = {}
    for neighbour_id in ranked_ids[1 : topN + 1]:
        # Reverse lookup id -> word; "<unk_>" marks an id with no vocab entry.
        neighbour = next((k for k in vocab if vocab[k] == neighbour_id), "<unk_>")
        neighbours[neighbour] = similarities[neighbour_id]
    return neighbours

In [38]:
# Print each nearest neighbour of "shelter" with its similarity, three decimals.
shelter_neighbours = get_top_similar("shelter")
for word, sim in shelter_neighbours.items():
    print("{}: {:.3f}".format(word, sim))


cars: 0.240
schools: 0.210
position: 0.204
damage: 0.199
sequences: 0.197
markets: 0.186
island: 0.180
rates: 0.180
fans: 0.177
education: 0.175


In [33]:
# Analogy probe on the raw embeddings: vec("things") - vec("thing") + vec("woman").
emb1, emb2, emb3 = (embeddings[vocab[w]] for w in ("things", "thing", "woman"))
emb4 = emb1 - emb2 + emb3

# Scale the result to unit length, then turn it into a column vector so the
# product with the row-normalised matrix gives one cosine similarity per word.
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = (emb4 / emb4_norm).reshape(-1, 1)
dists = (embeddings_norm @ emb4).flatten()

# Five most similar rows; the input words themselves are not excluded.
top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(lookup_token(word_id), dists[word_id]))

woman: 0.637
things: 0.497
players: 0.241
team: 0.212
people: 0.195


In [97]:
# Pieces of the saved-tensor file names; they are assembled below as
# f"{folder}test{version1}x{version2}" (and likewise with "y").
folder = "train_data/"
version1 = "_data_"
version2 = "_wt2_window4.pt"

In [98]:
# Load the saved test split: train_data/test_data_x_wt2_window4.pt and the matching "y" file.
# presumably x holds the context token ids and y the target id — TODO confirm.
x_test = torch.load(f"{folder}test{version1}x{version2}")
y_test = torch.load(f"{folder}test{version1}y{version2}")

In [99]:
# Peek at the first test example (displayed as a list of token ids).
x_test[0]

[629, 4, 22, 29, 108, 1, 392, 5]

In [100]:
# Decode entry 1 of y_test back to its word.
lookup_token(y_test[1])

'film'

In [101]:
# Decode every token id of test example 3 back to its word.
[lookup_token(i) for i in x_test[3]]

['an', 'english', 'film', ',', 'and', 'theatre', 'actor', '.']

In [23]:
# Rebind `vocab` to the transformer vocabulary. The lookup helpers are redefined
# right after this so that their default `vocab` argument picks up the new object.
vocab = torch.load("saves/vocab_may1_WT2_transformer_min25f.pt")
print(len(vocab))
def lookup_id(word, vocab=vocab):
    """Map `word` to its id in `vocab`, using the "<unk>" id for unknown words.

    The default `vocab` is the module-level `vocab` as bound at the time this
    definition runs.
    """
    if word in vocab:
        return vocab[word]
    return vocab["<unk>"]
def lookup_token(word_id, vocab=vocab):
    """Map `word_id` back to the first word in `vocab` that carries this id.

    The vocabulary is scanned linearly; "<unk>" is returned when no word
    has the given id.
    """
    return next((word for word in vocab if vocab[word] == word_id), "<unk>")

6908


In [190]:

from datasets import load_dataset

# Only the test split of WikiText-2 is processed here.
wikitext2 = load_dataset("wikitext", "wikitext-2-v1")

# Lower-case and strip each non-empty line, drop every line that contains "=",
# then split on single spaces and append an explicit "\n" token.
lowered_lines = (raw.lower().strip() for raw in wikitext2["test"]["text"] if len(raw) > 0)
text_test = [line.split(" ") + ["\n"] for line in lowered_lines if "=" not in line]

seq_length = 128
buffer = 20

# For every paragraph with at least seq_length + buffer + 1 tokens, keep the ids
# of the seq_length tokens that precede the final token.
x_test = [
    [lookup_id(word) for word in paragraph[-(seq_length + 1):-1]]
    for paragraph in text_test
    if len(paragraph) >= seq_length + buffer + 1
]

# Decode the last window back to words as a sanity check.
trytest = [lookup_token(i) for i in x_test[-1]]

In [24]:
from transformer import TransformerModel


In [25]:
# Earlier checkpoints, kept for reference:
# transformer = torch.load("saves/model_transformer_apr29_1130pm.pt")
# transformer = torch.load("saves/model_transformer_may1_1250pm.pt")
# torch.load unpickles the saved object, so only load trusted files.
transformer = torch.load("saves/model_transformer_may3_1200am.pt")
# Same vocabulary file that an earlier cell loads before redefining the lookup helpers.
vocab = torch.load(f"saves/vocab_may1_WT2_transformer_min25f.pt")

In [26]:
def embed(word):
    """Return the transformer's input embedding for `word`.

    The word is mapped to an id with `lookup_id` and that id is passed, as a
    scalar tensor, through `transformer.input_emb`.
    """
    token_id = torch.tensor(lookup_id(word))
    return transformer.input_emb(token_id)
# Cosine similarity computed along dim 0, i.e. between two 1-D vectors.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
# Length of the embedding returned for a single word.
len(embed("mother"))


512

In [27]:

# Sample 20 tokens from the transformer, starting from a fixed prompt, and
# write the sampled words to out_generation.txt.
# NOTE(review): `input` shadows the Python builtin of the same name.
device = torch.device("cpu")
ntokens = len(vocab)
# This random start token is overwritten by the prompt below before it is used.
# It still draws from torch's global RNG, so removing the line may change what
# gets sampled later in this cell.
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
model = transformer
# Scores are divided by `temperature` before exponentiation; values above 1
# flatten the sampling distribution.
temperature = 2
log_interval = 1

# Prompt text; it is stripped, split on single spaces and mapped to ids below.
input = 'he claimed in an interview that '
# input = ['<unk>', 'claimed', 'in', 'an', 'interview', 'at', 'the', 'time', 'of', 'the', 'film', "'s", 'release', 'that', 'the', 'character', 'of', 'minnesota', '<unk>', 'was', 'based', 'on', '<unk>', ',', 'who', 'at', 'the', 'time', 'was', 'known', 'as', '"', 'new', 'york', '<unk>', '"', '.', '<unk>', 'immediately', 'adopted', 'the', 'minnesota', '<unk>', '<unk>', 'and', '<unk>', 'his', 'association', 'with', 'the', 'film', 'into', 'book', 'and', 'television', '<unk>', 'and', 'other', '<unk>', '.', 'author', 'walter', '<unk>', 'denied', 'for', 'the', 'rest', 'of', 'his', 'life', 'that', '<unk>', 'had', 'played', 'any', 'role', 'in', 'the', 'creation', 'of', 'the', 'character', '.', 'other', 'players', 'would', 'claim', ',', 'with', 'greater', 'or', '<unk>', 'degrees', 'of', '<unk>', ',', 'to', 'have', 'served', 'as']
input = [lookup_id(i) for i in input.strip().split(" ")]
# input = [lookup_id(i) for i in input]


print(input)
# Reshape the id list into a column of shape (len(input), 1).
input = torch.tensor(input).view(len(input), 1)


# NOTE(review): model.eval() is not called in this cell — confirm the loaded
# model is already in eval mode if it contains dropout.
with open('out_generation.txt', 'w') as outf:
    with torch.no_grad():  # no tracking history
        for i in range(20):
            print(i)
            # NOTE(review): the meaning of the second positional argument is not
            # visible in this notebook — TODO confirm against TransformerModel.
            output = model(input, False)
            # Take the last entry along dim 0 of the output, apply the
            # temperature and exponentiate to get unnormalised sampling weights.
            word_weights = output[-1].squeeze().div(temperature).exp().cpu()
            # Draw one index with probability proportional to its weight.
            word_idx = torch.multinomial(word_weights, 1)[0]
            word_tensor = torch.Tensor([[word_idx]]).long().to(device)
            print(word_tensor)
            # Append the sampled token so the next step sees it as part of the input.
            input = torch.cat([input, word_tensor], 0)

            word = lookup_token(word_idx)

            # Newline after every 20th word, a space otherwise.
            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % log_interval == 0:
                # Prints the step index followed by the whole decoded sequence so far.
                # The `i` inside the comprehension is local to it and does not
                # disturb the loop counter.
                print('| Generated {}/{} words'.format(i, [lookup_token(i[0])for i in input]))

[27, 760, 6, 29, 1128, 15]
0
tensor([[2479]])
| Generated 0/['he', 'claimed', 'in', 'an', 'interview', 'that', 'boys'] words
1
tensor([[1978]])
| Generated 1/['he', 'claimed', 'in', 'an', 'interview', 'that', 'boys', 'finding'] words
2
tensor([[6624]])
| Generated 2/['he', 'claimed', 'in', 'an', 'interview', 'that', 'boys', 'finding', 'nests'] words
3
tensor([[634]])
| Generated 3/['he', 'claimed', 'in', 'an', 'interview', 'that', 'boys', 'finding', 'nests', 'european'] words
4
tensor([[2138]])
| Generated 4/['he', 'claimed', 'in', 'an', 'interview', 'that', 'boys', 'finding', 'nests', 'european', 'championships'] words
5
tensor([[309]])
| Generated 5/['he', 'claimed', 'in', 'an', 'interview', 'that', 'boys', 'finding', 'nests', 'european', 'championships', 'history'] words
6
tensor([[5767]])
| Generated 6/['he', 'claimed', 'in', 'an', 'interview', 'that', 'boys', 'finding', 'nests', 'european', 'championships', 'history', 'addressed'] words
7
tensor([[5780]])
| Generated 7/['he', 'cla

In [28]:
# get first layer of the model
embeddings = list(transformer.input_emb.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalize the embeddings layer
norms = (embeddings ** 2).sum(axis=1) ** (0.5)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape
embeddings_df = pd.DataFrame(embeddings)

# t-SNE transform
tsne = TSNE(n_components=2)
embeddings_df_tsne = tsne.fit_transform(embeddings_df)
embeddings_df_tsne = pd.DataFrame(embeddings_df_tsne)

embeddings_df_tsne.index = vocab.keys()

Parameter containing:
tensor([[-0.0191, -0.0145, -0.0130,  ...,  0.0179, -0.0653, -0.1022],
        [-0.0649, -0.0674, -0.0473,  ..., -0.0198,  0.0296, -0.0329],
        [ 0.0637, -0.0933, -0.0536,  ..., -0.0213, -0.0408,  0.0238],
        ...,
        [-0.0507,  0.0713, -0.0390,  ..., -0.0244,  0.0218,  0.0387],
        [ 0.1066, -0.0762, -0.0597,  ..., -0.0664,  0.0372, -0.0195],
        [-0.0205,  0.0837, -0.0577,  ...,  0.0293,  0.0338,  0.0709]],
       requires_grad=True)


In [42]:
# Purely numeric tokens are drawn in green, every other token in black.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "black")

# Each word is rendered as a text label at its 2-D t-SNE coordinates.
word_labels = go.Scatter(
    x=embeddings_df_tsne[0],
    y=embeddings_df_tsne[1],
    text=embeddings_df_tsne.index,
    mode="text",
    textposition="middle center",
    textfont={"color": color},
)

fig = go.Figure()
fig.add_trace(word_labels)
# fig.write_html("word2vec_visualization.html")
fig


In [41]:
# Compression to 64 dimensions
# t-SNE transform
# tsne_64 = TSNE(n_components=64, method="")
# emb_64 = tsne_64.fit_transform(embeddings_df)
# emb_64 = pd.DataFrame(emb_64)

# emb_64.index = vocab.keys()
# emb_64.head()

KeyboardInterrupt: 