In [14]:
import sys
sys.path.append("../src/")

import time
import torch
import pickle as pkl
import torch.nn as nn
import torch.optim as optim
from vocabulary import Vocabulary
from utils import get_next_batch

# Debug switch: when True, the training loop stops once the step counter
# exceeds 2000 instead of running through every epoch.
DEBUG = True


In [11]:
class SkipGramModel(nn.Module):
    """Embedding lookup followed by a linear layer that scores every vocabulary entry.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output logits produced per input index.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim=32):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return unnormalised scores (logits) over the vocabulary.

        Args:
            inputs: integer tensor of word indices.

        Returns:
            Tensor shaped like ``inputs`` with an extra trailing dimension
            of size ``vocab_size``.
        """
        # Invoke the sub-modules through __call__ rather than .forward():
        # calling .forward() directly silently skips registered hooks.
        projections = self.embeddings(inputs)
        return self.out_layer(projections)
      


In [12]:
# Load the preprocessed bundle and unpack the pieces used by the cells below.
# NOTE(review): unpickling can execute arbitrary code — only load files
# that were produced locally by a trusted preparation step.
with open("../data/prepared.pkl", "rb") as fp:
    prepared = pkl.load(fp)

vocabulary, texts, contexts, test_texts = (
    prepared[key] for key in ("vocabulary", "texts", "contexts", "test_texts")
)

# Drop the container name; the unpacked names keep the data alive.
del prepared


In [15]:
model = SkipGramModel(vocabulary.size, 32)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)
model = model.to(device)

loss_every_nsteps = 1000  # report the running average loss every N steps
total_loss = 0.0
steps_since_report = 0  # how many losses are currently summed in total_loss
start_time = time.time()
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_function = nn.CrossEntropyLoss().to(device)

batches = get_next_batch(contexts, window_size=2, batch_size=256, epochs_count=10)
for step, (batch_contexts, batch_centrals) in enumerate(batches):
    # The model lives on `device`, so its inputs and targets must as well;
    # .to() returns the tensor unchanged when it is already there.
    batch_contexts = batch_contexts.to(device)
    batch_centrals = batch_centrals.to(device)

    logits = model(batch_centrals)  # forward pass
    loss = loss_function(logits, batch_contexts)  # compute the loss
    loss.backward()  # compute gradients dL/dw
    optimizer.step()  # gradient-descent update (Adam in this case)
    optimizer.zero_grad()  # reset gradients so the next iteration starts clean

    total_loss += loss.item()
    steps_since_report += 1
    if step != 0 and step % loss_every_nsteps == 0:
        # Divide by the number of losses actually accumulated: the first
        # report covers steps 0..N inclusive, i.e. N + 1 losses.
        print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / steps_since_report, time.time() - start_time))
        total_loss = 0.0
        steps_since_report = 0
        start_time = time.time()
    if DEBUG and 2000 < step:
        # Debug runs stop early instead of going through every epoch.
        break


Step = 1000, Avg Loss = 8.9814, Time = 52.18s


In [20]:
# Shape of the logits produced for the last processed training batch.
logits.shape

torch.Size([256, 71186])

In [28]:
# Shape of the target indices that the loss compared against in the last training batch.
batch_contexts.shape

torch.Size([256])

In [30]:
# Indices fed to the model as input in the last training batch.
batch_centrals

tensor([   33,    33,    33,    33, 18367, 18367, 18367, 18367,   688,   688,
          688,   688,  3696,  3696,  3696,  3696, 15428, 15428, 15428, 15428,
        14067, 14067, 14067, 14067,   263,   263,   263,   263, 42462, 42462,
        42462, 42462,  7030,  7030,  7030,  7030,     0,     0,     0,     0,
        12347, 12347, 12347, 12347,   241,   241,   241,   241,    27,    27,
           27,    27,  2978,  2978,  2978,  2978,    30,    30,    30,    30,
            1,     1,     1,     1,  3539,  3539,  3539,  3539,     4,     4,
            4,     4,  2147,  2147,  2147,  2147,     0,     0,     0,     0,
        25910, 25910, 25910, 25910,     0,     0,     0,     0,  1526,  1526,
         1526,  1526,     2,     2,     2,     2,  5788,  5788,  5788,  5788,
         5317,  5317,  5317,  5317,  7744,  7744,  7744,  7744,  3301,  3301,
         3301,  3301,  2362,  2362,  2362,  2362, 35234, 35234, 35234, 35234,
            1,     1,     1,     1,  2088,  2088,  2088,  2088, 

In [16]:
# Copy the embedding table into a NumPy array for the analysis cells below.
# detach() is the supported way to leave the autograd graph; the legacy
# .data attribute bypasses autograd's in-place modification checks.
embeddings = model.embeddings.weight.detach().cpu().numpy()


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def most_similar(embeddings, vocabulary, word, top_n=10):
    """Return the words whose embeddings are closest to ``word`` by cosine similarity.

    Args:
        embeddings: 2-D array of word vectors, indexed by vocabulary index.
        vocabulary: object providing ``get_index(word)`` and ``get_word(index)``.
        word: query word.
        top_n: how many words to return (defaults to 10).

    Returns:
        List of at most ``top_n`` words ordered from most to least similar.
        The query word is not filtered out, so it normally comes first.
    """
    if top_n <= 0:
        # A slice such as [-0:] would select every element, so an empty
        # request is answered explicitly.
        return []
    word_emb = embeddings[vocabulary.get_index(word)]
    similarities = cosine_similarity([word_emb], embeddings)[0]
    # argsort is ascending: keep the last top_n indices and flip them so the
    # best match comes first.
    best_indices = np.argsort(similarities)[-top_n:][::-1]
    return [vocabulary.get_word(index) for index in best_indices]

# Quick qualitative check: list the nearest neighbours of one query word.
most_similar(embeddings, vocabulary, 'путин')

['путин',
 'цветочный',
 'злоумышленников',
 'подушку',
 'мэддокса',
 'титрах',
 'осудили',
 'шаг',
 'одетых',
 'пикеты']

In [18]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def draw_vectors(
    x, y,
    radius=10,
    alpha=0.25,
    color='blue',
    width=600,
    height=400,
    show=True,
    **kwargs
):
    """Scatter-plot points with bokeh and show extra per-point columns on hover.

    Args:
        x, y: point coordinates.
        radius: marker size passed to ``scatter``.
        alpha: marker opacity.
        color: a single colour name applied to every point, or a per-point
            sequence of colours.
        width, height: figure dimensions.
        show: display the figure immediately when truthy.
        **kwargs: additional per-point columns; each one becomes a hover tooltip.

    Returns:
        The bokeh figure.
    """
    output_notebook()

    # A single colour name is expanded to one entry per point.
    point_colors = [color] * len(x) if isinstance(color, str) else color
    columns = {'x': x, 'y': y, 'color': point_colors, **kwargs}
    source = bm.ColumnDataSource(columns)

    plot = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    plot.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=source)

    # One tooltip row per extra column supplied by the caller.
    hover_fields = [(name, "@" + name) for name in kwargs]
    plot.add_tools(bm.HoverTool(tooltips=hover_fields))

    if show:
        pl.show(plot)
    return plot


def get_tsne_projection(word_vectors):
    """Project the vectors to two dimensions with t-SNE and standardise the result."""
    projected = TSNE(n_components=2).fit_transform(word_vectors)
    return scale(projected)

def get_pca_projection(word_vectors):
    """Project the vectors to two dimensions with PCA and standardise the result."""
    projected = PCA(n_components=2).fit_transform(word_vectors)
    return scale(projected)
    
    
def visualize_embeddings(embeddings, vocabulary, word_count, method="pca"):
    """Plot a 2-D projection of ``word_count`` embeddings, labelled with their words.

    Args:
        embeddings: 2-D array of word vectors; rows ``1..word_count`` are plotted.
        vocabulary: object providing ``top(word_count)``, used for the hover labels.
        word_count: number of words to plot.
        method: projection to use, ``"pca"`` or ``"tsne"``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    projectors = {"pca": get_pca_projection, "tsne": get_tsne_projection}
    if method not in projectors:
        # Reject unknown names instead of silently running t-SNE for anything
        # that is not exactly "pca" (e.g. a typo or different casing).
        raise ValueError(
            "Unknown projection method {!r}; expected one of {}".format(
                method, sorted(projectors)
            )
        )

    # NOTE(review): row 0 is skipped — presumably a reserved index — and rows
    # 1.. are assumed to line up with vocabulary.top(); confirm against Vocabulary.
    word_vectors = embeddings[1: word_count + 1]
    words = vocabulary.top(word_count)
    projections = projectors[method](word_vectors)
    draw_vectors(projections[:, 0], projections[:, 1], color='green', token=words)
    
    
# Plot 500 word embeddings using the t-SNE projection.
visualize_embeddings(embeddings, vocabulary, 500, method="tsne")

