# Задание 5.2 - Word2Vec with Negative Sampling

В этом задании мы натренируем свои версию word vectors с negative sampling на том же небольшом датасете.


Несмотря на то, что основная причина использования Negative Sampling - улучшение скорости тренировки word2vec, в нашем игрушечном примере мы **не требуем** улучшения производительности. Мы используем negative sampling просто как дополнительное упражнение для знакомства с PyTorch.

Перед запуском нужно запустить скрипт `download_data.sh`, чтобы скачать данные.

Датасет и модель очень небольшие, поэтому это задание можно выполнить и без GPU.



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

from torchvision import transforms

import numpy as np
import matplotlib.pyplot as plt

# We'll use Principal Component Analysis (PCA) to visualize word vectors,
# so make sure you install dependencies from requirements.txt!
from sklearn.decomposition import PCA 

%matplotlib inline

In [2]:
import os
import numpy as np

class StanfordTreeBank:
    '''
    Wrapper for accessing Stanford Tree Bank Dataset
    https://nlp.stanford.edu/sentiment/treebank.html
    
    Parses dataset, gives each token and index and provides lookups
    from string token to index and back
    
    Allows to generate random context with sampling strategy described in
    word2vec paper:
    https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf
    '''
    def __init__(self):
        self.index_by_token = {} # map of string -> token index
        self.token_by_index = []

        self.sentences = []

        self.token_freq = {}
        
        self.token_reject_by_index = None

    def load_dataset(self, folder):
        filename = os.path.join(folder, "datasetSentences.txt")

        with open(filename, "r", encoding="latin1") as f:
            l = f.readline() # skip the first line
            
            for l in f:
                splitted_line = l.strip().split()
                words = [w.lower() for w in splitted_line[1:]] # First one is a number
                    
                self.sentences.append(words)
                for word in words:
                    if word in self.token_freq:
                        self.token_freq[word] +=1 
                    else:
                        index = len(self.token_by_index)
                        self.token_freq[word] = 1
                        self.index_by_token[word] = index
                        self.token_by_index.append(word)
        self.compute_token_prob()
                        
    def compute_token_prob(self):
        words_count = np.array([self.token_freq[token] for token in self.token_by_index])
        words_freq = words_count / np.sum(words_count)
        
        # Following sampling strategy from word2vec paper
        self.token_reject_by_index = 1- np.sqrt(1e-5/words_freq)
    
    def check_reject(self, word):
        return np.random.rand() > self.token_reject_by_index[self.index_by_token[word]]
        
    def get_random_context(self, context_length=5):
        """
        Returns tuple of center word and list of context words
        """
        sentence_sampled = []
        while len(sentence_sampled) <= 2:
            sentence_index = np.random.randint(len(self.sentences)) 
            sentence = self.sentences[sentence_index]
            sentence_sampled = [word for word in sentence if self.check_reject(word)]
    
        center_word_index = np.random.randint(len(sentence_sampled))
        
        words_before = sentence_sampled[max(center_word_index - context_length//2,0):center_word_index]
        words_after = sentence_sampled[center_word_index+1: center_word_index+1+context_length//2]
        
        return sentence_sampled[center_word_index], words_before+words_after
    
    def num_tokens(self):
        return len(self.token_by_index)
        
data = StanfordTreeBank()
data.load_dataset("stanfordSentimentTreebank/stanfordSentimentTreebank/")

print("Num tokens:", data.num_tokens())
for i in range(5):
    center_word, other_words = data.get_random_context(5)
    print(center_word, other_words)

Num tokens: 19538
seconds ['can', 'tiresome', 'jesse', 'helms']
innuendoes ['sexual', 'abound']
career ['deniro', 'once', 'grand', 'beach']
yorkers ['tangled', 'particular', 'touched', 'unprecedented']
surprises ['offers', 'few']


# Dataset для Negative Sampling должен быть немного другим

Как и прежде, Dataset должен сгенерировать много случайных контекстов и превратить их в сэмплы для тренировки.

Здесь мы реализуем прямой проход модели сами, поэтому выдавать данные можно в удобном нам виде.
Напоминаем, что в случае negative sampling каждым сэмплом является:
- вход: слово в one-hot представлении
- выход: набор из одного целевого слова и K других случайных слов из словаря.
Вместо softmax + cross-entropy loss, сеть обучается через binary cross-entropy loss - то есть, предсказывает набор бинарных переменных, для каждой из которых функция ошибки считается независимо.

Для целевого слова бинарное предсказание должно быть позитивным, а для K случайных слов - негативным.

Из набора слово-контекст создается N сэмплов (где N - количество слов в контексте), в каждом из них K+1 целевых слов, для только одного из которых предсказание должно быть позитивным.
Например, для K=2:

Слово: `orders` и контекст: `['love', 'nicest', 'to', '50-year']` создадут 4 сэмпла:
- input: `orders`, target: `[love: 1, any: 0, rose: 0]`
- input: `orders`, target: `[nicest: 1, fool: 0, grass: 0]`
- input: `orders`, target: `[to: 1, -: 0, the: 0]`
- input: `orders`, target: `[50-year: 1, ?: 0, door: 0]`

Все слова на входе и на выходе закодированы через one-hot encoding, с размером вектора равным количеству токенов.

In [3]:
num_negative_samples = 10

class Word2VecNegativeSampling(Dataset):
    '''
    PyTorch Dataset for Word2Vec with Negative Sampling.
    Accepts StanfordTreebank as data and is able to generate dataset based on
    a number of random contexts
    '''
    def __init__(self, data, num_negative_samples, num_contexts=30000):
        '''
        Initializes Word2VecNegativeSampling, but doesn't generate the samples yet
        (for that, use generate_dataset)
        Arguments:
        data - StanfordTreebank instace
        num_negative_samples - number of negative samples to generate in addition to a positive one
        num_contexts - number of random contexts to use when generating a dataset
        '''
        # TODO: Implement what you need for other methods!
        self.data = data
        self.num_contexts = num_contexts
        self.input = []
        self.output = []
        self.num_tokens = self.data.num_tokens()
        self.num_negative_samples = num_negative_samples
        
    
    def generate_dataset(self):
        '''
        Generates dataset samples from random contexts
        Note: there will be more samples than contexts because every context
        can generate more than one sample
        '''
        # TODO: Implement generating the dataset
        # You should sample num_contexts contexts from the data and turn them into samples
        # Note you will have several samples from one context
        q=0
        for i in range(self.num_contexts):
            center_word, other_words = self.data.get_random_context()
            
            for word in other_words:
                q+=1
#                 print("%f SAMPLE" % (q))
                word_indeces_current_output_dict = []
                rand_words = [] 
                self.input.append(center_word)
                current_output_dict ={}
                current_output_dict[word] = 1
                word_indeces_current_output_dict.append(self.data.index_by_token[word])
#                 print(word_indeces_current_output_dict)
                for j in range(self.num_negative_samples):
                    rand_word_index = np.random.randint(self.num_tokens)
#                     print(rand_word_index)
                    while rand_word_index in word_indeces_current_output_dict:
                        rand_word_index = np.random.randint(self.num_tokens)
#                     print(rand_word_index)    
                    word_indeces_current_output_dict.append(rand_word_index)
#                     print(word_indeces_current_output_dict)
                    rand_word = self.data.token_by_index[rand_word_index]
                     
        
        
        
                    rand_words.append(rand_word)
                    current_output_dict[rand_word] = 0 
                self.output.append(current_output_dict)
#                 print(rand_words, word)
#                 print(len(current_output_dict))
             
        
        
        
    def __len__(self):
        '''
        Returns total number of samples
        '''
        # TODO: Return the number of samples
        return len(self.input)

    
    def __getitem__(self, index):
        '''
        Returns i-th sample
        
        Return values:
        input_vector - index of the input word (not torch.Tensor!)
        output_indices - torch.Tensor of indices of the target words. Should be 1+num_negative_samples.
        output_target - torch.Tensor with float targets for the training. Should be the same size as output_indices
                        and have 1 for the context word and 0 everywhere else
        '''
        # TODO: Generate tuple of 3 return arguments for i-th sample
        input_word = self.input[index]
        input_vector = self.data.index_by_token[input_word]
        output_dict = self.output[index]
        output_indices = torch.Tensor([self.data.index_by_token[output_word] for output_word in output_dict.keys()]).int()
        output_target = torch.Tensor([output_target for output_target in output_dict.values()]).int()
                      
        return input_vector, output_indices, output_target
        

dataset = Word2VecNegativeSampling(data, num_negative_samples, 10000)
dataset.generate_dataset()
input_vector, output_indices, output_target = dataset[0]

print("Sample - input: %s, output indices: %s, output target: %s" % (input_vector, output_indices, output_target)) # target should be able to convert to int
assert isinstance(output_indices, torch.Tensor)
assert output_indices.shape[0] == num_negative_samples+1

assert isinstance(output_target, torch.Tensor)
assert output_target.shape[0] == num_negative_samples+1
assert torch.sum(output_target) == 1.0

Sample - input: 10927, output indices: tensor([  803,  6728,  3508,  9825, 10020,   578,  1099, 13574, 16968, 19280,
        18837], dtype=torch.int32), output target: tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)


# Создаем модель

Для нашей задачи нам придется реализовать свою собственную PyTorch модель.
Эта модель реализует свой собственный прямой проход (forward pass), который получает на вход индекс входного слова и набор индексов для выходных слов. 

Как всегда, на вход приходит не один сэмпл, а целый batch.  
Напомним, что цели улучшить скорость тренировки у нас нет, достаточно чтобы она сходилась.

In [216]:
# Create the usual PyTorch structures
dataset = Word2VecNegativeSampling(data, num_negative_samples, 30000)
dataset.generate_dataset()

# As before, we'll be training very small word vectors!
wordvec_dim = 10

class Word2VecNegativeSamples(nn.Module):
    def __init__(self, num_tokens):
        super(Word2VecNegativeSamples, self).__init__()
        self.input = nn.Linear(num_tokens, 10, bias=False)
        self.output = nn.Linear(10, num_tokens, bias=False)
        self.num_tokens = num_tokens
        
    def forward(self, input_index_batch, output_indices_batch):
        '''
        Implements forward pass with negative sampling
        
        Arguments:
        input_index_batch - Tensor of ints, shape: (batch_size, ), indices of input words in the batch
        output_indices_batch - Tensor if ints, shape: (batch_size, num_negative_samples+1),
                                indices of the target words for every sample
                                
        Returns:
        predictions - Tensor of floats, shape: (batch_size, num_negative_samples+1)
        '''
        predictions = torch.Tensor([]).float().requires_grad_()
        
#         predictions = torch.zeros([len(input_index_batch), output_indices_batch.shape[1]]).float()
        
        # TODO Implement forward pass
        # Hint: You can use for loop to go over all samples on the batch,
        # run every sample indivisually and then use
        # torch.stack or torch.cat to produce the final result
#         for n_sample in range(len(input_index_batch)):
#             input_vector = torch.zeros([1, self.num_tokens]).float()
#             input_word_index = input_index_batch[n_sample]
#             input_vector[0, input_word_index] = 1

#             pred_1 = self.input(input_vector)

#             pred_2 = self.output(pred_1)

#             output_indeces = output_indices_batch[n_sample,:]

#             output_vector = torch.index_select(pred_2, 1, output_indeces) 
            
#             predictions = torch.cat((predictions, output_vector), 0) 
        
    
    
              
#         input_ = torch.zeros([len(input_index_batch), self.num_tokens]).float()
#         input_[range(len(input_index_batch)), input_index_batch ] = 1
#         pred_1 = self.input(input_)
#         pred_2 = self.output(pred_1)
#         for row in range(len(input_index_batch)):
#             predictions[row] = pred_2[row][output_indices_batch[row].tolist()]
    
        output_indices_batch = output_indices_batch.type('torch.LongTensor') 
        w1 = torch.t([(param*1) for param in self.input.parameters()][0])
#         print(w1.requires_grad, w1.is_leaf)
        w2 = [(param*1) for param in self.output.parameters()][0]
#         print(w2.requires_grad, w2.is_leaf)
        for n_sample in range(output_indices_batch.shape[0]):
            m1 = w1[input_index_batch[n_sample]].reshape(1, w1.shape[1])
#             print(m1.requires_grad, m1.is_leaf)
            output_ind = output_indices_batch[n_sample] 
            m2 = torch.t(w2[output_ind])
#             print(m2.requires_grad, m2.is_leaf)
            pred = torch.matmul(m1,m2)
#             print(pred.requires_grad, pred.is_leaf)
            predictions = torch.cat((predictions, pred), 0)
#             print(predictions.requires_grad, predictions.is_leaf)
            
        
        
        
        
        
        
        
        return predictions
                 
    
nn_model = Word2VecNegativeSamples(data.num_tokens())
nn_model.type(torch.FloatTensor)

Word2VecNegativeSamples(
  (input): Linear(in_features=19538, out_features=10, bias=False)
  (output): Linear(in_features=10, out_features=19538, bias=False)
)

In [135]:
print([param.grad for param in nn_model.input.parameters()])

[None]


In [107]:
[param for param in nn_model.input.parameters()]

[Parameter containing:
 tensor([[ 3.3271e-03,  2.5577e-04,  2.9504e-03,  ...,  1.4665e-03,
           3.4930e-03, -3.2840e-03],
         [ 2.3258e-03,  6.4329e-03, -2.5227e-03,  ...,  5.5142e-03,
          -1.6655e-03, -5.1021e-03],
         [ 3.9641e-03, -7.1280e-03,  4.8454e-03,  ..., -4.4003e-03,
           4.7619e-03, -1.1641e-03],
         ...,
         [-4.7400e-03,  5.0435e-03, -1.9362e-03,  ...,  5.9505e-03,
          -9.9611e-04, -3.6416e-03],
         [ 4.1198e-03,  6.2494e-03,  6.7986e-03,  ...,  1.6582e-03,
           4.8085e-03, -2.3875e-03],
         [-5.4196e-03, -2.9594e-03,  4.8497e-03,  ..., -5.3968e-03,
           2.2654e-03, -1.4168e-05]], requires_grad=True)]

In [106]:
[param for param in nn_model.parameters()][0]

Parameter containing:
tensor([[ 3.3271e-03,  2.5577e-04,  2.9504e-03,  ...,  1.4665e-03,
          3.4930e-03, -3.2840e-03],
        [ 2.3258e-03,  6.4329e-03, -2.5227e-03,  ...,  5.5142e-03,
         -1.6655e-03, -5.1021e-03],
        [ 3.9641e-03, -7.1280e-03,  4.8454e-03,  ..., -4.4003e-03,
          4.7619e-03, -1.1641e-03],
        ...,
        [-4.7400e-03,  5.0435e-03, -1.9362e-03,  ...,  5.9505e-03,
         -9.9611e-04, -3.6416e-03],
        [ 4.1198e-03,  6.2494e-03,  6.7986e-03,  ...,  1.6582e-03,
          4.8085e-03, -2.3875e-03],
        [-5.4196e-03, -2.9594e-03,  4.8497e-03,  ..., -5.3968e-03,
          2.2654e-03, -1.4168e-05]], requires_grad=True)

In [126]:
w1 =[(param*1) for param in nn_model.input.parameters()][0]
w1

tensor([[ 0.0041, -0.0050, -0.0018,  ..., -0.0038,  0.0002, -0.0071],
        [ 0.0012,  0.0040, -0.0033,  ..., -0.0005, -0.0047,  0.0017],
        [-0.0004, -0.0020, -0.0065,  ..., -0.0034, -0.0045, -0.0010],
        ...,
        [ 0.0014, -0.0013,  0.0063,  ...,  0.0027,  0.0009, -0.0031],
        [-0.0050,  0.0058, -0.0068,  ...,  0.0018, -0.0025, -0.0047],
        [-0.0033,  0.0009, -0.0057,  ..., -0.0042, -0.0026, -0.0016]],
       grad_fn=<MulBackward0>)

In [127]:
w1.is_leaf

False

In [128]:
w1.requires_grad

True

In [94]:
w1[0]

tensor([-0.0010,  0.0012,  0.0031,  ..., -0.0030,  0.0028, -0.0036],
       grad_fn=<SelectBackward>)

In [15]:
w1 = torch.t([param.data for param in nn_model.parameters()][0])
w1.requires_grad_()

tensor([[ 1.3624e-03,  5.8675e-03, -3.8278e-03,  ...,  2.5021e-03,
          5.1014e-03, -5.8364e-04],
        [ 1.3429e-03, -6.4820e-03,  6.3316e-03,  ...,  2.2729e-03,
          4.5435e-04, -1.6621e-03],
        [-4.2655e-03, -5.3550e-03,  7.1297e-04,  ..., -6.5988e-03,
          6.1779e-06,  5.6762e-03],
        ...,
        [-3.0516e-03, -6.2825e-03, -1.4608e-03,  ...,  1.2444e-03,
          5.8165e-03,  2.0040e-04],
        [ 4.0824e-03, -5.0410e-04,  2.2421e-03,  ..., -3.8134e-03,
         -3.4053e-03, -5.9445e-03],
        [-4.4921e-03, -4.7227e-03, -3.1155e-03,  ..., -4.8462e-03,
         -3.5674e-04, -2.1327e-03]], requires_grad=True)

In [22]:
w1.is_leaf_("False")

AttributeError: 'Tensor' object has no attribute 'is_leaf_'

In [20]:
w1.is_leaf

True

In [19]:
w1.retain_grad

<bound method Tensor.retain_grad of tensor([[ 1.3624e-03,  5.8675e-03, -3.8278e-03,  ...,  2.5021e-03,
          5.1014e-03, -5.8364e-04],
        [ 1.3429e-03, -6.4820e-03,  6.3316e-03,  ...,  2.2729e-03,
          4.5435e-04, -1.6621e-03],
        [-4.2655e-03, -5.3550e-03,  7.1297e-04,  ..., -6.5988e-03,
          6.1779e-06,  5.6762e-03],
        ...,
        [-3.0516e-03, -6.2825e-03, -1.4608e-03,  ...,  1.2444e-03,
          5.8165e-03,  2.0040e-04],
        [ 4.0824e-03, -5.0410e-04,  2.2421e-03,  ..., -3.8134e-03,
         -3.4053e-03, -5.9445e-03],
        [-4.4921e-03, -4.7227e-03, -3.1155e-03,  ..., -4.8462e-03,
         -3.5674e-04, -2.1327e-03]], requires_grad=True)>

In [98]:
gr = [param.grad for param in nn_model.input.parameters()]
gr  

[None]

In [287]:
m1 = w1[0].reshape(1, 10)
m1.requires_grad_()

tensor([[-0.0003,  0.0024, -0.0007, -0.0016, -0.0054, -0.0020,  0.0032, -0.0066,
          0.0017, -0.0051]], grad_fn=<ViewBackward>)

In [262]:
output_ind = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [32]:
[(param.data*1).is_leaf for param in nn_model.parameters()][0]

True

In [62]:
w2 = [param.data for param in nn_model.parameters()][1]


In [73]:
w2.is_leaf

True

In [77]:
w1.requires_grad_()

tensor([[ 1.3624e-03,  5.8675e-03, -3.8278e-03,  ...,  2.5021e-03,
          5.1014e-03, -5.8364e-04],
        [ 1.3429e-03, -6.4820e-03,  6.3316e-03,  ...,  2.2729e-03,
          4.5435e-04, -1.6621e-03],
        [-4.2655e-03, -5.3550e-03,  7.1297e-04,  ..., -6.5988e-03,
          6.1779e-06,  5.6762e-03],
        ...,
        [-3.0516e-03, -6.2825e-03, -1.4608e-03,  ...,  1.2444e-03,
          5.8165e-03,  2.0040e-04],
        [ 4.0824e-03, -5.0410e-04,  2.2421e-03,  ..., -3.8134e-03,
         -3.4053e-03, -5.9445e-03],
        [-4.4921e-03, -4.7227e-03, -3.1155e-03,  ..., -4.8462e-03,
         -3.5674e-04, -2.1327e-03]], requires_grad=True)

In [46]:
w2.requires_grad_()

tensor([[ 0.1110, -0.2169, -0.0652,  ...,  0.0942,  0.0219,  0.1264],
        [ 0.2457, -0.0487, -0.0121,  ...,  0.1763, -0.0029, -0.1987],
        [ 0.0368, -0.0141, -0.0812,  ...,  0.0100, -0.0989, -0.1545],
        ...,
        [ 0.1195,  0.0206, -0.0482,  ..., -0.2491,  0.2809, -0.1351],
        [ 0.0267, -0.1937,  0.2292,  ...,  0.0502,  0.1929, -0.2277],
        [ 0.2141,  0.1465, -0.2330,  ...,  0.0186, -0.2122,  0.2607]],
       requires_grad=True)

In [76]:
gr = [param for param in nn_model.input.parameters()]
gr    

[Parameter containing:
 tensor([[-0.0010,  0.0012,  0.0031,  ..., -0.0030,  0.0028, -0.0036],
         [ 0.0071, -0.0019, -0.0023,  ..., -0.0007,  0.0054,  0.0046],
         [-0.0023,  0.0058,  0.0054,  ...,  0.0067,  0.0019,  0.0042],
         ...,
         [-0.0049,  0.0022,  0.0042,  ...,  0.0026, -0.0062, -0.0048],
         [-0.0035, -0.0061,  0.0059,  ...,  0.0010, -0.0044, -0.0035],
         [-0.0066, -0.0056,  0.0057,  ...,  0.0058,  0.0051,  0.0066]],
        requires_grad=True)]

In [51]:
nn_model.input.parameters()

<generator object Module.parameters at 0x00000000056A25F0>

In [296]:
m2 = torch.t(w2[output_ind])
m2

tensor([[-0.1468,  0.1707,  0.1678,  0.0233, -0.1235, -0.1545, -0.0900, -0.2851,
          0.2585,  0.2899,  0.0768],
        [-0.2583,  0.2804, -0.1437,  0.0336, -0.2281,  0.2534,  0.0823,  0.0826,
          0.0594,  0.0964, -0.2174],
        [-0.1407, -0.0622,  0.0738,  0.2529, -0.2787,  0.0425,  0.2343,  0.1659,
          0.3078,  0.0120,  0.0084],
        [ 0.0897,  0.2059,  0.0433,  0.3120,  0.1637, -0.2694, -0.2415, -0.2841,
          0.2193,  0.1178,  0.1219],
        [ 0.0712, -0.2343, -0.2082,  0.1549,  0.1034,  0.2707,  0.2247,  0.2907,
         -0.3047, -0.1955,  0.2333],
        [-0.0445, -0.0904, -0.3132,  0.1190,  0.1084, -0.1875, -0.2409, -0.0674,
          0.2143, -0.1601, -0.1335],
        [ 0.1465, -0.2202,  0.3016,  0.1298,  0.2201,  0.0734,  0.0788,  0.3048,
         -0.0270,  0.1450, -0.0641],
        [ 0.1020, -0.2906,  0.2984, -0.2794,  0.1390, -0.1881,  0.1639, -0.1721,
         -0.2379, -0.0068,  0.2243],
        [ 0.2297,  0.2350, -0.0716, -0.1292,  0.1619,  0

In [290]:
pred = torch.matmul(m1, m2)
pred.requires_grad_()

tensor([[-8.7658e-04,  4.0723e-03, -9.3365e-04, -3.4686e-05, -1.2278e-03,
          3.5358e-03, -2.7866e-04,  1.8118e-03,  3.1768e-03,  9.8562e-04,
         -4.8958e-03]], grad_fn=<MmBackward>)

In [291]:
preds = torch.Tensor([]).float().requires_grad_()
preds.requires_grad_()

tensor([], requires_grad=True)

In [294]:
preds = torch.cat((preds, pred), 0)
preds.requires_grad_()

tensor([[-8.7658e-04,  4.0723e-03, -9.3365e-04, -3.4686e-05, -1.2278e-03,
          3.5358e-03, -2.7866e-04,  1.8118e-03,  3.1768e-03,  9.8562e-04,
         -4.8958e-03],
        [-8.7658e-04,  4.0723e-03, -9.3365e-04, -3.4686e-05, -1.2278e-03,
          3.5358e-03, -2.7866e-04,  1.8118e-03,  3.1768e-03,  9.8562e-04,
         -4.8958e-03]], grad_fn=<CatBackward>)

In [295]:
preds

tensor([[-8.7658e-04,  4.0723e-03, -9.3365e-04, -3.4686e-05, -1.2278e-03,
          3.5358e-03, -2.7866e-04,  1.8118e-03,  3.1768e-03,  9.8562e-04,
         -4.8958e-03],
        [-8.7658e-04,  4.0723e-03, -9.3365e-04, -3.4686e-05, -1.2278e-03,
          3.5358e-03, -2.7866e-04,  1.8118e-03,  3.1768e-03,  9.8562e-04,
         -4.8958e-03]], grad_fn=<CatBackward>)

In [239]:
output_indices_batch_shape_1 = 11
output_indices_batch_shape_0 = 3


In [None]:
w1 = torch.t([param.data for param in self.parameters()][0])
w2 = [param.data for param in self.parameters()][1]
for n_sample in range(output_indices_batch.shape[0]):
    
    m1 = w1[input_index_batch[n_sample]]
    output_ind = output_indices_batch[n_sample] 
    m2 = torch.t(w2[output_ind])
    pred = torch.matmul(m1,m2) 
    print(pred.shape)
    predictions = torch.cat((predictions, pred), 0)  

In [182]:
for n_sample in range(output_indices_batch_shape_0):
    m1 = torch.t([param.data for param in nn_model.parameters()][0])[n_sample]
    output_ind = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # output_indeces = output_indices_batch[n_sample]
    m2 = torch.t([param.data for param in nn_model.parameters()][1][output_ind])
    pred = torch.matmul(m1,m2)
    preds = torch.cat((preds, pred), 0)
    
    
    

In [183]:
preds

tensor([-0.0051, -0.0040, -0.0008, -0.0022, -0.0008,  0.0020,  0.0006,  0.0029,
        -0.0022,  0.0033, -0.0016, -0.0006,  0.0020,  0.0005, -0.0031, -0.0007,
         0.0005, -0.0006,  0.0044, -0.0015, -0.0002,  0.0011, -0.0072, -0.0026,
        -0.0010,  0.0012, -0.0032,  0.0008,  0.0031, -0.0007, -0.0023,  0.0043,
        -0.0015])

In [170]:
torch.cat((torch.Tensor([]).float(), pred), 0) 

tensor([-0.0072, -0.0026, -0.0010,  0.0012, -0.0032,  0.0008,  0.0031, -0.0007,
        -0.0023,  0.0043, -0.0015])

In [15]:
torch.t([param.data for param in nn_model.parameters()][1])[:, torch.tensor([0, 1, 3])].shape

torch.Size([10, 19538])

In [16]:
q = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
q

tensor([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [13, 14, 15, 16]])

In [146]:
f = torch.tensor([[0, 1], [1, 2], [2, 3], [0, 1]])
f.type()

'torch.LongTensor'

In [None]:
f.type()

In [26]:
pr = torch.zeros([4, 2])
pr

tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])

In [33]:
for row in range(4):
    pr[row] = q[row][f[row]]
pr    



tensor([[ 1.,  2.],
        [ 6.,  7.],
        [11., 12.],
        [13., 14.]])

In [171]:
[param for param in nn_model.input.parameters()][0].clone().detach()

tensor([[ 4.9088e-03, -6.4707e-04, -5.1057e-03,  ..., -2.3880e-03,
          4.3921e-03, -1.2205e-03],
        [-7.5033e-04, -1.4440e-03,  6.4289e-03,  ...,  2.0097e-04,
         -6.1794e-03, -1.9751e-03],
        [-4.3262e-03, -6.8158e-03, -3.4356e-03,  ..., -2.7210e-04,
         -2.1620e-03,  1.9391e-03],
        ...,
        [ 2.7808e-03,  3.4335e-03, -5.6131e-03,  ...,  4.6939e-06,
         -7.2159e-05,  5.1812e-03],
        [ 1.2654e-03, -2.8020e-03,  7.6238e-04,  ...,  3.4002e-03,
         -6.3169e-03, -3.6801e-03],
        [ 7.0225e-03, -5.1173e-03, -1.3122e-03,  ...,  2.3128e-03,
         -4.0132e-04,  7.1506e-03]])

In [173]:
[param for param in nn_model.input.parameters()][0].clone().detach()

tensor([[ 4.9088e-03, -6.4707e-04, -5.1057e-03,  ..., -2.3880e-03,
          4.3921e-03, -1.2205e-03],
        [-7.5033e-04, -1.4440e-03,  6.4289e-03,  ...,  2.0097e-04,
         -6.1794e-03, -1.9751e-03],
        [-4.3262e-03, -6.8158e-03, -3.4356e-03,  ..., -2.7210e-04,
         -2.1620e-03,  1.9391e-03],
        ...,
        [ 2.7808e-03,  3.4335e-03, -5.6131e-03,  ...,  4.6939e-06,
         -7.2159e-05,  5.1812e-03],
        [ 1.2654e-03, -2.8020e-03,  7.6238e-04,  ...,  3.4002e-03,
         -6.3169e-03, -3.6801e-03],
        [ 7.0225e-03, -5.1173e-03, -1.3122e-03,  ...,  2.3128e-03,
         -4.0132e-04,  7.1506e-03]])

In [9]:
def extract_word_vectors(nn_model):
    '''
    Extracts word vectors from the model
    
    Returns:
    input_vectors: torch.Tensor with dimensions (num_tokens, num_dimensions)
    output_vectors: torch.Tensor with dimensions (num_tokens, num_dimensions)
    '''
    # TODO: Implement extracting word vectors from param weights
    # return tuple of input vectors and output vectos
    input_vectors = torch.t([param.data for param in nn_model.parameters()][0].clone().detach())
    output_vectors = [param.data for param in nn_model.parameters()][1].clone().detach()
    return input_vectors, output_vectors

untrained_input_vectors, untrained_output_vectors = extract_word_vectors(nn_model)
assert untrained_input_vectors.shape == (data.num_tokens(), wordvec_dim)
assert untrained_output_vectors.shape == (data.num_tokens(), wordvec_dim)

In [215]:
def train_neg_sample(model, dataset, train_loader, optimizer, scheduler, num_epochs):    
    '''
    Trains word2vec with negative samples on and regenerating dataset every epoch
    
    Returns:
    loss_history, train_history
    '''
    loss = nn.BCEWithLogitsLoss().type(torch.FloatTensor)
    loss_history = []
    train_history = []
    for epoch in range(num_epochs):
        model.train() # Enter train mode
        
        dataset.generate_dataset()
        
        # TODO: Implement training using negative samples
        # You can estimate accuracy by comparing prediction values with 0
        # And don't forget to step the scheduler!
        loss_accum = 0
        correct_samples = 0
        total_samples = 0
        for i_step, (input_index_batch, output_indices_batch, output_target_batch) in enumerate(train_loader):
            prediction = model(input_index_batch, output_indices_batch)    
            loss_value = loss(prediction, output_target_batch.float())
            optimizer.zero_grad()
            loss_value.backward()
            
#             if i_step == 0:
            w_1=[param.data for param in nn_model.input.parameters()][0].clone().detach()
            print(w_1)
#                 
            
            optimizer.step()
#             if i_step == 0:
            w_2=[param.data for param in nn_model.input.parameters()][0].clone().detach()
            print(w_2)
            print(torch.equal(w_1, w_2))
            
        
            
            r_c = prediction[:, 0]>=0
            w_c = prediction[:, 1:]<0
            mask = torch.cat((r_c.reshape(input_index_batch.shape[0],1), w_c), 1)
            correct_samples += int(torch.sum(torch.sum(mask, 1) == mask.shape[1]))
            total_samples += mask.shape[0]
            loss_accum += loss_value

        
            if i_step == 3:
                break
        if epoch == 0:
            break
        
        
        ave_loss = loss_accum / i_step
        train_accuracy = float(correct_samples) / total_samples
        
        
        loss_history.append(float(ave_loss))
        train_history.append(train_accuracy)
        scheduler.step()      
        
        
        
        
        
        
        print("Average loss: %f, Train accuracy: %f" % (ave_loss, train_accuracy))
        
    return loss_history, train_history

In [70]:
len(dataset)

165774

In [217]:
w_1 = [param.data for param in nn_model.input.parameters()][0].clone().detach()

In [218]:
print(w_1)

tensor([[ 0.0050, -0.0017,  0.0066,  ..., -0.0041, -0.0013,  0.0042],
        [ 0.0059,  0.0061,  0.0060,  ..., -0.0035, -0.0029, -0.0054],
        [-0.0053,  0.0063, -0.0012,  ...,  0.0059, -0.0015,  0.0068],
        ...,
        [-0.0026,  0.0002,  0.0058,  ...,  0.0068, -0.0034,  0.0037],
        [ 0.0019,  0.0013, -0.0024,  ..., -0.0028, -0.0056, -0.0065],
        [ 0.0030,  0.0069, -0.0020,  ...,  0.0006, -0.0040,  0.0034]])


In [219]:
[param.data for param in nn_model.input.parameters()][0]

tensor([[ 0.0050, -0.0017,  0.0066,  ..., -0.0041, -0.0013,  0.0042],
        [ 0.0059,  0.0061,  0.0060,  ..., -0.0035, -0.0029, -0.0054],
        [-0.0053,  0.0063, -0.0012,  ...,  0.0059, -0.0015,  0.0068],
        ...,
        [-0.0026,  0.0002,  0.0058,  ...,  0.0068, -0.0034,  0.0037],
        [ 0.0019,  0.0013, -0.0024,  ..., -0.0028, -0.0056, -0.0065],
        [ 0.0030,  0.0069, -0.0020,  ...,  0.0006, -0.0040,  0.0034]])

# Ну и наконец тренировка!

Добейтесь значения ошибки меньше **0.25**.

In [220]:
# Finally, let's train the model!

# TODO: We use placeholder values for hyperparameters - you will need to find better values!
optimizer = optim.SGD(nn_model.parameters(), lr=2.4, weight_decay=0.000016)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=20)



In [221]:
loss_history, train_history = train_neg_sample(nn_model, dataset, train_loader, optimizer, scheduler, 7)

tensor([[ 0.0050, -0.0017,  0.0066,  ..., -0.0041, -0.0013,  0.0042],
        [ 0.0059,  0.0061,  0.0060,  ..., -0.0035, -0.0029, -0.0054],
        [-0.0053,  0.0063, -0.0012,  ...,  0.0059, -0.0015,  0.0068],
        ...,
        [-0.0026,  0.0002,  0.0058,  ...,  0.0068, -0.0034,  0.0037],
        [ 0.0019,  0.0013, -0.0024,  ..., -0.0028, -0.0056, -0.0065],
        [ 0.0030,  0.0069, -0.0020,  ...,  0.0006, -0.0040,  0.0034]])
tensor([[ 0.0050, -0.0017,  0.0066,  ..., -0.0041, -0.0013,  0.0042],
        [ 0.0059,  0.0061,  0.0060,  ..., -0.0035, -0.0029, -0.0054],
        [-0.0053,  0.0063, -0.0012,  ...,  0.0059, -0.0015,  0.0068],
        ...,
        [-0.0026,  0.0002,  0.0058,  ...,  0.0068, -0.0034,  0.0037],
        [ 0.0019,  0.0013, -0.0024,  ..., -0.0028, -0.0056, -0.0065],
        [ 0.0030,  0.0069, -0.0020,  ...,  0.0006, -0.0040,  0.0034]])
False
tensor([[ 0.0050, -0.0017,  0.0066,  ..., -0.0041, -0.0013,  0.0042],
        [ 0.0059,  0.0061,  0.0060,  ..., -0.0035, -0.00

In [None]:
## Visualize training graphs
plt.subplot(211)
plt.plot(train_history)
plt.subplot(212)
plt.plot(loss_history)

# Визуализируем вектора для разного вида слов до и после тренировки

Как и ранее, в случае успешной тренировки вы должны увидеть как вектора слов разных типов (например, знаков препинания, предлогов и остальных)

In [None]:
trained_input_vectors, trained_output_vectors = extract_word_vectors(nn_model)
assert trained_input_vectors.shape == (data.num_tokens(), wordvec_dim)
assert trained_output_vectors.shape == (data.num_tokens(), wordvec_dim)

def visualize_vectors(input_vectors, output_vectors, title=''):
    full_vectors = torch.cat((input_vectors, output_vectors), 0)
    wordvec_embedding = PCA(n_components=2).fit_transform(full_vectors)

    # Helpful words form CS244D example
    # http://cs224d.stanford.edu/assignment1/index.html
    visualize_words = {'green': ["the", "a", "an"], 
                      'blue': [",", ".", "?", "!", "``", "''", "--"], 
                      'brown': ["good", "great", "cool", "brilliant", "wonderful", 
                              "well", "amazing", "worth", "sweet", "enjoyable"],
                      'orange': ["boring", "bad", "waste", "dumb", "annoying", "stupid"],
                      'red': ['tell', 'told', 'said', 'say', 'says', 'tells', 'goes', 'go', 'went']
                     }

    plt.figure(figsize=(7,7))
    plt.suptitle(title)
    for color, words in visualize_words.items():
        points = np.array([wordvec_embedding[data.index_by_token[w]] for w in words])
        for i, word in enumerate(words):
            plt.text(points[i, 0], points[i, 1], word, color=color,horizontalalignment='center')
        plt.scatter(points[:, 0], points[:, 1], c=color, alpha=0.3, s=0.5)

visualize_vectors(untrained_input_vectors, untrained_output_vectors, "Untrained word vectors")
visualize_vectors(trained_input_vectors, trained_output_vectors, "Trained word vectors")