In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import WikiText2
import pandas as pd
from nltk.corpus import brown
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import sys
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import os

## Analyze

In [107]:
from net import Net_CBOW

In [118]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Checkpoint selection: run name plus the epoch whose weights are analysed.
version = "april30_WT2_nodatalim_20epoch_64dim_100minf_4window_epoch"
epoch_n = 5

# torch.load unpickles the file, so only load checkpoints from a trusted source.
net = torch.load(
    f"saves/apr30epochs/model_{version}{epoch_n}.pt",
    map_location=device,
)
# NOTE(review): this vocab file is named "...50minf_2window" while the model run
# is "...100minf_4window" — confirm the two files really belong together.
vocab = torch.load("saves/vocab_april30_WT2_nodatalim_20epoch_64dim_50minf_2window.pt")

cpu


In [140]:
# Extract the weight matrix of the model's input-embedding layer as a NumPy array.
# NOTE(review): `transformer` is only loaded in a later cell of this notebook, so
# this cell raises NameError on a fresh Restart & Run All. Confirm whether the
# CBOW model `net` was meant here, or move this cell below the transformer load.
embeddings = list(transformer.input_emb.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# L2-normalise every row so that a dot product between two rows is their
# cosine similarity. `norms` keeps shape (rows, 1) so it broadcasts per row.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings_norm = embeddings / norms
# The raw (un-normalised) vectors are what gets projected below.
embeddings_df = pd.DataFrame(embeddings)

# t-SNE projection to 2-D for plotting. The seed is fixed so that re-running
# the cell reproduces the same layout.
tsne = TSNE(n_components=2, random_state=0)
embeddings_df_tsne = tsne.fit_transform(embeddings_df)
embeddings_df_tsne = pd.DataFrame(embeddings_df_tsne)

# Label each projected row with its token.
# Assumes iterating `vocab` yields tokens in embedding-row order — TODO confirm.
embeddings_df_tsne.index = vocab.keys()


KeyboardInterrupt: 

In [135]:
# Draw every token as text at its 2-D t-SNE position; tokens made only of
# digits are coloured green, all others black.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "black")

fig = go.Figure(
    data=[
        go.Scatter(
            x=embeddings_df_tsne[0],
            y=embeddings_df_tsne[1],
            mode="text",
            text=embeddings_df_tsne.index,
            textposition="middle center",
            textfont={"color": color},
        )
    ]
)
# fig.write_html("word2vec_visualization.html")
fig


In [10]:
def lookup_id(word, vocab=None):
    """Return the integer id of ``word``, falling back to the id of "<unk>".

    Args:
        word: Token to look up.
        vocab: Mapping from token to id. When omitted, the notebook-level
            ``vocab`` is read at call time, so reloading ``vocab`` in a later
            cell takes effect without re-running this definition (a
            ``vocab=vocab`` default would freeze the mapping that existed when
            the ``def`` was executed).

    Returns:
        ``vocab[word]`` when the word is known, otherwise ``vocab["<unk>"]``.

    Raises:
        KeyError: if neither ``word`` nor "<unk>" is present in the mapping.
    """
    if vocab is None:
        vocab = globals()["vocab"]
    return vocab[word] if word in vocab else vocab["<unk>"]
def lookup_token(word_id, vocab=None):
    """Return the first token whose id equals ``word_id``, or None if there is none.

    This is a linear scan over the mapping, so each call costs O(len(vocab)).

    Args:
        word_id: Id to decode; compared to the stored ids with ``==``.
        vocab: Mapping from token to id. When omitted, the notebook-level
            ``vocab`` is read at call time rather than frozen at definition
            time, so a reloaded ``vocab`` is picked up automatically.

    Returns:
        The matching token, or None when no token has that id.
    """
    if vocab is None:
        vocab = globals()["vocab"]
    for word, idx in vocab.items():
        if idx == word_id:
            return word
    return None
def get_top_similar(word: str, topN: int = 10):
    """Return the tokens most cosine-similar to ``word``.

    Reads two notebook-level names at call time: ``vocab`` (token -> id) and
    ``embeddings_norm`` (embedding matrix with unit-length rows, indexed by id).

    Args:
        word: Query token; must be a key of ``vocab``.
        topN: Maximum number of neighbours to return.

    Returns:
        A dict mapping neighbour token -> similarity, most similar first. An id
        that has no token in ``vocab`` is reported under the key "<unk_>".
        If ``word`` is not in ``vocab`` a message is printed and None is
        returned.
    """
    if word not in vocab:
        print("Out of vocabulary word")
        return None
    word_id = vocab[word]

    # Rows are unit length, so this matrix-vector product is the cosine
    # similarity of every token with the query.
    dists = embeddings_norm @ embeddings_norm[word_id]

    ranked_ids = np.argsort(-dists)
    # Remove the query by id, not by assuming it is ranked first: another row
    # with an identical direction ties at 1.0 and may be sorted ahead of it.
    neighbour_ids = ranked_ids[ranked_ids != word_id][:topN]

    # Invert the vocabulary once instead of scanning it for every neighbour.
    # setdefault keeps the first token if several share an id.
    id_to_token = {}
    for token, idx in vocab.items():
        id_to_token.setdefault(idx, token)

    return {id_to_token.get(int(i), "<unk_>"): dists[i] for i in neighbour_ids}

In [63]:
# Print the nearest neighbours of "girl", one "token: similarity" pair per line.
girl_neighbours = get_top_similar("girl")
for word, sim in girl_neighbours.items():
    print("{}: {:.3f}".format(word, sim))


woman: 0.355
boy: 0.353
soldier: 0.332
man: 0.305
girls: 0.291
dog: 0.272
doctor: 0.268
hell: 0.260
singing: 0.247
cake: 0.242


In [81]:
# Vector-arithmetic probe: mother - father + son, compared against every
# unit-length embedding row. The query words are not filtered out, so they can
# (and, per the printed output, do) appear among the top hits.
emb1, emb2, emb3 = (embeddings[vocab[w]] for w in ("mother", "father", "son"))

# Build the query vector and scale it to unit length.
emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = (emb4 / emb4_norm).reshape(len(emb4), 1)

# Dot product against unit rows = cosine similarity; keep the five largest.
dists = (embeddings_norm @ emb4).flatten()
top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(lookup_token(word_id), dists[word_id]))

son: 0.706
mother: 0.551
daughter: 0.372
friend: 0.320
side: 0.296


In [102]:
def analogy(worda, wordA, wordb, n=5, include_inputs = False):
    """Answer "worda is to wordA as wordb is to ?" with embedding arithmetic.

    The query vector is ``embed(wordA) - embed(worda) + embed(wordb)``, scaled
    with ``normalize`` and compared by dot product against every row of the
    notebook-level ``embeddings_norm``. ``embed``, ``normalize`` and
    ``lookup_token`` are the notebook-level helpers.

    Args:
        worda: First word of the reference pair.
        wordA: Second word of the reference pair.
        wordb: Word for which the counterpart is sought.
        n: Maximum number of results.
        include_inputs: When False, the three input words are skipped in the
            ranking.

    Returns:
        A list of ``(token, similarity)`` tuples, best first. It holds fewer
        than ``n`` entries when the vocabulary runs out of candidates. Each
        entry is also printed as "token: similarity".
    """
    emba = embed(worda)
    embA = embed(wordA)
    embb = embed(wordb)

    embB = normalize(embA - emba + embb)
    embB = np.reshape(embB, (len(embB), 1))
    dists = np.matmul(embeddings_norm, embB).flatten()

    excluded = (worda, wordA, wordb)
    out = []
    # Walk the full ranking and stop once n results are collected; a fixed
    # n+3 slice would raise IndexError when fewer candidates are available.
    for word_id in np.argsort(-dists):
        if len(out) >= n:
            break
        token = lookup_token(word_id)  # decode once per candidate
        if not include_inputs and token in excluded:
            continue
        out.append((token, dists[word_id]))
        print("{}: {:.3f}".format(token, dists[word_id]))
    return out

# father - mother + daughter, with the three input words filtered out.
# The trailing semicolon keeps the cell from echoing a return value.
analogy(worda="mother", wordA="father", wordb="daughter", include_inputs=False);

owner: 0.400
son: 0.369
governor: 0.328
mayor: 0.311
archbishop: 0.306


In [99]:
def normalize(emb):
    """Scale ``emb`` so that its Euclidean norm is 1.

    The norm is taken over all elements of ``emb`` (sum of squares, then
    square root). A zero input is not special-cased, so it divides by zero.
    """
    sum_of_squares = (emb ** 2).sum()
    length = sum_of_squares ** 0.5
    return emb / length

def embed(word):
    """Return the embedding of ``word`` as a torch tensor.

    The row is read from the notebook-level ``embeddings`` array at the id
    given by ``lookup_id(word)``.
    """
    row = embeddings[lookup_id(word)]
    return torch.tensor(row)

# Cosine similarity between two 1-D vectors. Defined here because the only
# other definition of `cos` sits in a later cell, so on a fresh
# Restart & Run All the call below would otherwise raise NameError.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)

# Unit-length offset vectors for four word pairs (first word minus second).
a = normalize(embed("man")-embed("woman"))
b = normalize(embed("boy")-embed("girl"))
c = normalize(embed("king")-embed("queen"))
d = normalize(embed("father")-embed("mother"))

# How well the man-woman offset aligns with the father-mother offset.
cos(a,d)


tensor(-0.0155)

In [97]:
# Pieces of the saved-dataset file names. The loading cell joins them as
# f"{folder}test{version1}x{version2}" (and ...y...), i.e.
# "train_data/test_data_x_wt2_window4.pt".
folder = "train_data/"
version1 = "_data_"
version2 = "_wt2_window4.pt"

In [98]:
# Load the saved test inputs (x) and targets (y).
# torch.load unpickles the files, so only load data from a trusted source.
x_test = torch.load(f"{folder}test{version1}x{version2}")
y_test = torch.load(f"{folder}test{version1}y{version2}")

In [99]:
# Peek at the first test input: a list of token ids.
x_test[0]

[629, 4, 22, 29, 108, 1, 392, 5]

In [100]:
# Decode the target of the test example at index 1 back to its token.
lookup_token(y_test[1])

'film'

In [101]:
# Decode the input ids of the test example at index 3 (note: a different
# example from the ones inspected in the two cells above).
[lookup_token(i) for i in x_test[3]]

['an', 'english', 'film', ',', 'and', 'theatre', 'actor', '.']

In [137]:
# Switch to the vocabulary saved for the transformer run. This rebinds the
# notebook-level `vocab`, replacing the one loaded for the CBOW model.
vocab = torch.load("saves/vocab_may1_WT2_transformer_min25f.pt")
# Vocabulary size.
print(len(vocab))
def lookup_id(word, vocab=None):
    """Return the integer id of ``word``, falling back to the id of "<unk>".

    Args:
        word: Token to look up.
        vocab: Mapping from token to id. When omitted, the notebook-level
            ``vocab`` is read at call time, so a later reload of ``vocab``
            is honoured without redefining this function (a ``vocab=vocab``
            default would freeze whatever mapping existed at definition time).

    Returns:
        ``vocab[word]`` when the word is known, otherwise ``vocab["<unk>"]``.

    Raises:
        KeyError: if neither ``word`` nor "<unk>" is present in the mapping.
    """
    if vocab is None:
        vocab = globals()["vocab"]
    if word in vocab:
        return vocab[word]
    return vocab["<unk>"]
def lookup_token(word_id, vocab=None):
    """Return the first token whose id equals ``word_id``, or "<unk>" if none does.

    This is a linear scan over the mapping, so each call costs O(len(vocab)).

    Args:
        word_id: Id to decode; compared to the stored ids with ``==``.
        vocab: Mapping from token to id. When omitted, the notebook-level
            ``vocab`` is read at call time rather than frozen at definition
            time.

    Returns:
        The matching token, or the string "<unk>" when no token has that id.
    """
    if vocab is None:
        vocab = globals()["vocab"]
    for word, idx in vocab.items():
        if idx == word_id:
            return word
    return "<unk>"

6908


In [190]:

from datasets import load_dataset

# Load WikiText-2 and work with the test split only (the train split was
# previously prepared with the same two steps).
wikitext2 = load_dataset("wikitext", "wikitext-2-v1")
text_test = wikitext2["test"]["text"]

# Step 1: drop empty lines, then lower-case and trim what is left.
text_test = [line.lower().strip() for line in text_test if len(line) > 0]
# Step 2: skip every line containing "=" (presumably section headings), split
# on single spaces and append a newline token to each paragraph.
text_test = [line.split(" ") + ["\n"] for line in text_test if "=" not in line]

seq_length = 128
buffer = 20

# For each paragraph of at least seq_length + buffer + 1 tokens, encode the
# seq_length tokens that precede its final token. The length filter guarantees
# the negative slice below selects exactly that window.
x_test = [
    [lookup_id(word) for word in paragraph[-seq_length - 1:-1]]
    for paragraph in text_test
    if len(paragraph) >= seq_length + buffer + 1
]

# Decode the last sequence back to tokens as a round-trip sanity check.
trytest = [lookup_token(i) for i in x_test[-1]]

In [24]:
from transformer import TransformerModel


In [136]:
# Transformer checkpoint under analysis. Earlier candidates, kept for reference:
#   saves/model_transformer_apr29_1130pm.pt
#   saves/model_transformer_may1_1250pm.pt
# torch.load unpickles the whole model object, so the file must come from a
# trusted source (and presumably needs the model class to be importable).
transformer = torch.load("saves/model_transformer_may5_0100am.pt")
vocab = torch.load("saves/vocab_may1_WT2_transformer_min25f.pt")

In [127]:
def embed(word):
    """Embed ``word`` with the transformer's input-embedding layer.

    The word is mapped to an id with ``lookup_id`` and wrapped in a tensor
    before being passed to ``transformer.input_emb``. This definition replaces
    the earlier array-backed ``embed`` in this notebook.
    """
    token_id = torch.tensor(lookup_id(word))
    return transformer.input_emb(token_id)
# Cosine similarity computed along dim 0, i.e. between two 1-D vectors;
# eps is the small constant PyTorch uses to avoid division by zero.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
# Length of one embedding vector, i.e. the model's embedding width.
len(embed("mother"))


512

In [128]:

# Sample a continuation of a text prompt from the transformer, one token at a
# time, writing the generated tokens to out_generation.txt.
device = torch.device("cpu")
ntokens = len(vocab)
model = transformer
# Inference mode: without this, layers such as dropout stay active whenever
# the checkpoint was saved in training mode, making the samples noisier.
model.eval()
# Log-scores are divided by this before exponentiation; values below 1
# sharpen the sampling distribution.
temperature = 0.5
log_interval = 1

# Encode the prompt word by word.
# NOTE(review): `input` shadows the Python built-in; the name is kept because
# other cells may refer to it.
input = 'he claimed in an interview that '
input = [lookup_id(i) for i in input.strip().split(" ")]
print(input)

# Decode the prompt once; generated tokens are appended as they are sampled,
# so the whole sequence is not re-decoded with a vocabulary scan every step.
generated_tokens = [lookup_token(i) for i in input]
# Column of token ids, one row per position.
input = torch.tensor(input, dtype=torch.long, device=device).view(-1, 1)

with open('out_generation.txt', 'w') as outf, torch.no_grad():  # no tracking history
    for i in range(20):
        print(i)
        # Second positional argument is passed as False exactly as before
        # (presumably it disables the attention mask — confirm in transformer.py).
        output = model(input, False)
        # Scores at the last position -> unnormalised sampling weights.
        word_weights = output[-1].squeeze().div(temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        # Keep the id as an integer tensor; no round trip through float.
        word_tensor = word_idx.view(1, 1).to(device)
        print(word_tensor)
        input = torch.cat([input, word_tensor], 0)

        word = lookup_token(word_idx.item())
        generated_tokens.append(word)

        outf.write(word + ('\n' if i % 20 == 19 else ' '))

        if i % log_interval == 0:
            print('| Generated {}/{} words'.format(i, generated_tokens))

[27, 760, 6, 29, 1128, 15]
0
tensor([[4]])
| Generated 0/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>'] words
1
tensor([[7]])
| Generated 1/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>', 'to'] words
2
tensor([[286]])
| Generated 2/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>', 'to', 'do'] words
3
tensor([[39]])
| Generated 3/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>', 'to', 'do', 'not'] words
4
tensor([[50]])
| Generated 4/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>', 'to', 'do', 'not', 'have'] words
5
tensor([[52]])
| Generated 5/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>', 'to', 'do', 'not', 'have', 'been'] words
6
tensor([[4]])
| Generated 6/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>', 'to', 'do', 'not', 'have', 'been', '<unk>'] words
7
tensor([[4]])
| Generated 7/['he', 'claimed', 'in', 'an', 'interview', 'that', '<unk>', 'to', 'do', 'not', 'have', 'been', '<unk>', '<unk>'] words
8
te

In [141]:
# Extract the weight matrix of the transformer's input-embedding layer as a
# NumPy array.
embeddings = list(transformer.input_emb.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# L2-normalise every row so that a dot product between two rows is their
# cosine similarity. `norms` keeps shape (rows, 1) so it broadcasts per row.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings_norm = embeddings / norms
# The raw (un-normalised) vectors are what gets projected below.
embeddings_df = pd.DataFrame(embeddings)

# t-SNE projection to 2-D for plotting. The seed is fixed so that re-running
# the cell reproduces the same layout.
tsne = TSNE(n_components=2, random_state=0)
embeddings_df_tsne = tsne.fit_transform(embeddings_df)
embeddings_df_tsne = pd.DataFrame(embeddings_df_tsne)

# Label each projected row with its token.
# Assumes iterating `vocab` yields tokens in embedding-row order — TODO confirm.
embeddings_df_tsne.index = vocab.keys()

In [142]:
# Text scatter of the transformer embeddings after t-SNE: one label per token,
# green for tokens consisting only of digits and black for the rest.
numeric = embeddings_df_tsne.index.str.isnumeric()
color = np.where(numeric, "green", "black")

fig = go.Figure(
    data=[
        go.Scatter(
            x=embeddings_df_tsne[0],
            y=embeddings_df_tsne[1],
            mode="text",
            text=embeddings_df_tsne.index,
            textposition="middle center",
            textfont={"color": color},
        )
    ]
)
# fig.write_html("word2vec_visualization.html")
fig


In [56]:
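# Disabled experiment: compress the embeddings to 64 dimensions with t-SNE.
# Left commented out because it does not run as written: `method=""` is not a
# valid TSNE method, and scikit-learn's default Barnes-Hut method only supports
# fewer than 4 output components (64 components would need method="exact").
# tsne_64 = TSNE(n_components=64, method="")
# emb_64 = tsne_64.fit_transform(embeddings_df)
# emb_64 = pd.DataFrame(emb_64)

# emb_64.index = vocab.keys()
# emb_64.head()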

In [125]:
import spacy

# Load spaCy's small English pipeline. Only its tokenizer is used below.
nlp = spacy.load("en_core_web_sm")

# Sample text for checking how the tokenizer splits hyphenated words,
# contractions, curly quotes and an oddly formatted number ("2007.100,00").
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007.100,00, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
# Run the tokenizer only (not the full `nlp` pipeline) and list the tokens
# as plain strings.
doc = nlp.tokenizer(text)
[str(w) for w in doc]

['When',
 'Sebastian',
 'Thrun',
 'started',
 'working',
 'on',
 'self',
 '-',
 'driving',
 'cars',
 'at',
 'Google',
 'in',
 '2007.100,00',
 ',',
 'few',
 'people',
 'outside',
 'of',
 'the',
 'company',
 'took',
 'him',
 'seriously',
 '.',
 '“',
 'I',
 'can',
 'tell',
 'you',
 'very',
 'senior',
 'CEOs',
 'of',
 'major',
 'American',
 'car',
 'companies',
 'would',
 'shake',
 'my',
 'hand',
 'and',
 'turn',
 'away',
 'because',
 'I',
 'was',
 'n’t',
 'worth',
 'talking',
 'to',
 ',',
 '”',
 'said',
 'Thrun',
 ',',
 'in',
 'an',
 'interview',
 'with',
 'Recode',
 'earlier',
 'this',
 'week',
 '.']