In [2]:
import torch
import numpy as np
import torch.functional as F
import torch.nn as nn
from nltk import ngrams
from IPython.display import display
import pandas as pd
from tqdm import tqdm

from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize
from gensim.models import KeyedVectors
from utils import get_distinct_words, read_corpus
from itertools import chain

In [71]:
print(torch.device('cuda:1'))
print(torch.cuda.device(0))

device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

cuda:1
<torch.cuda.device object at 0x7f5edeb2f220>


'cuda:0'

In [4]:
min_count = 2
ru_corpus_cp = read_corpus("ru_copy")
index_to_key, word_counter = get_distinct_words(ru_corpus_cp, min_count=min_count)
index_to_key = ["UNK", "PAD"] + index_to_key
key_to_index = {word: i for i, word in enumerate(index_to_key)}

In [5]:
len(ru_corpus_cp), ru_corpus_cp[:2]

(8,
 [['кстати',
   'как',
   'неожиданно',
   'кпрф',
   'становиться',
   'не',
   'все',
   'равный',
   'на',
   'судьба',
   'фермер',
   'именно',
   'накануне',
   'выборы'],
  ['можно',
   'и',
   'по',
   'другому',
   'сказать',
   'убогий',
   'клоунада',
   'кпрф',
   'это',
   'попытка',
   'отвечать',
   'на',
   'запрос',
   'молодой',
   'поколение',
   'левый',
   'не',
   'питать',
   'иллюзия',
   'по',
   'повод',
   'коммунистический',
   'номенклатура',
   'советский',
   'образец',
   'но',
   'в',
   'сила',
   'свой',
   'положение',
   'под',
   'давление',
   'вызов',
   'время',
   'они',
   'вынуждать',
   'быть',
   'меняться']])

In [6]:
def as_matrix(sequences, key_to_index, UNK="UNK", PAD="PAD", max_len=None):
    """ Convert a list of tokens into a matrix with padding """
    if isinstance(sequences[0], str):
        sequences = [x.split() for x in sequences]

    max_sequence_len = max([len(x) for x in sequences])
    if max_len is not None and max_sequence_len > max_len :
        max_sequence_len = max_len

    matrix = np.full((len(sequences), max_sequence_len), np.int32(key_to_index[PAD]))
    for i, seq in enumerate(sequences):
        row_ix = [key_to_index.get(word, key_to_index[UNK]) for word in seq[:max_sequence_len]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix

In [51]:
import pickle
with open("ru_corpus_list", "rb") as fp:
    ru_corpus = pickle.load(fp)

In [7]:
display(len(ru_corpus_cp))
# display(as_matrix(ru_corpus_cp, key_to_index, max_len=10))
len(list(chain.from_iterable(ru_corpus_cp)))

8

309

In [8]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def pad_text(text: list, window_size: int, pad: str):
    appendix = [pad] * window_size

    return appendix + text + appendix

In [9]:
class BaseEmbeddings(KeyedVectors):
    def __init__(self, corpus, distinct_words=None, word_counter=None, vector_size=100, min_count=10):
        super().__init__(vector_size=vector_size)
        
        self.index_to_key = distinct_words
        self.word_counter = word_counter
        if distinct_words is None or word_counter is None:
            self.index_to_key, self.word_counter = get_distinct_words(corpus, min_count=min_count)
    
        self.key_to_index = {word: i for i, word in enumerate(self.index_to_key)}

In [10]:
np.random.choice(50, 20)


array([30, 36, 25,  9, 44, 41, 12, 30, 26, 30, 30, 19, 27, 23, 25, 18,  9,
        6, 31, 44])

In [11]:
def softmax(u):
    return torch.tensor([torch.exp(u_j) / torch.sum(torch.exp(u)) for u_j in u])

In [74]:
class Word2Vec(BaseEmbeddings):
    def __init__(self, corpus, distinct_words=None, vector_size=100, window_size=5,
                 min_count=10, batch_size=None, n_negative=5, n_epoches=5):
        super().__init__(corpus, vector_size=vector_size, distinct_words=distinct_words, min_count=min_count)

        self.W1 = torch.randn((len(self.index_to_key), vector_size), device=device, requires_grad=True)  #, device=torch.cuda.device(0))  # vocab_size, vector_size
        self.W2 = torch.randn((vector_size, len(self.index_to_key)), device=device, requires_grad=True)  #, device=torch.cuda.device(0))  # vector_size, vocab_size

        self.corpus = corpus
        self.window_size = window_size
        self.batch_size = batch_size
        if batch_size is None:
            self.batch_size = np.max([len(text) for text in corpus])
        self.n_negative = n_negative
        self.alpha = 0.0001

        self.optimizer = torch.optim.Adam([self.W1, self.W2], lr=self.alpha)
        
        self.train(n_epoches)
        self.vectors = self.W1

    def one_hot_vector(self, word: str):
        vector = torch.zeros(len(self.index_to_key), device=device)
        vector[self.key_to_index[word]] = 1

        return vector
    
    def forward(self, center, window):
        h = self.one_hot_vector(center) @ self.W1      # 1, vec_size = 1, vocab_size x vocab_size, vec_size

        k_neg = np.random.choice(len(self.index_to_key), self.n_negative)
        W2_neg = self.W2[:, k_neg]                     # vec_size, k_neg  
        neg_sum = torch.sum(torch.exp(h @ W2_neg))     # sum(1, k_neg) = 1, vec_size x vec_size, k_neg


        u_c = torch.sum(torch.tensor([h @ self.W2[:, self.key_to_index[context]] for context in window]))  # 1, vec_size x vec_size, 1

        loss = -u_c + self.n_negative * neg_sum
        return loss
    

    def train(self, n_epoches=5):
        """
        trains self.center_W and self.context_W matrices
        """
        self.losses = []
        for epoch in tqdm(range(n_epoches)):
            epoch_loss = 0
            for t, text in enumerate(self.corpus):
                if t % 100 == 0 and t != 0:
                    print(f"texts pass: {t}\tloss: {loss}")
                for batch in chunks(text, self.batch_size):
                    
                    for j, center in enumerate(batch):
                        if center in self.index_to_key:
                            self.optimizer.zero_grad()

                            window = [batch[i + j] for i in range(-self.window_size, self.window_size + 1, 1) if i + j >= 0 and i + j < len(batch) and i != 0 and batch[i + j] in self.index_to_key]

                            if len(window) != 0:
                                loss = self.forward(center, window)
                                
                                epoch_loss += loss

                                loss.backward(retain_graph=True)
                                self.optimizer.step()

            print(loss)
            self.losses.append(epoch_loss)

w2v = Word2Vec(ru_corpus) # , min_count=2, window_size=3, n_epoches=10)

  0%|          | 0/5 [00:00<?, ?it/s]

texts pass: 100	loss: 11069.63671875
texts pass: 200	loss: 34508800.0
texts pass: 300	loss: 21.95401382446289
texts pass: 400	loss: 137.4266815185547
texts pass: 500	loss: 2649.57373046875
texts pass: 600	loss: 17090352.0
texts pass: 700	loss: 5576.408203125
texts pass: 800	loss: 47684120.0
texts pass: 900	loss: 823.6334838867188
texts pass: 1000	loss: 395664.78125
texts pass: 1100	loss: 93760.515625


  0%|          | 0/5 [15:51<?, ?it/s]


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [70]:
len(ru_corpus)

306645