## CS310 Natural Language Processing
## Lab 4 (part 2): Data preparation for implementing word2vec

skipgram architecture and negative sampling method

In [69]:
from typing import List
from pprint import pprint
from utils import CorpusReader
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [70]:
# Load the corpus that the rest of this notebook works on.
# We set min_count=1 to include all words in the corpus

corpus = CorpusReader(inputFileName="lunyu_20chapters.txt", min_count=1)

Total vocabulary: 1352


In [71]:
# Sanity-check the vocabulary mappings exposed by the corpus object
print(corpus.word2id["子"])  # word -> id lookup
print(corpus.id2word[1])  # id -> word lookup
print(len(corpus.id2word))  # number of entries in id2word

1
子
1352


### Efficient way for negative sampling

In the `utils.CorpusReader` class, we have implemented a method `initTableNegatives`. It creates a list of word indices (`self.negatives`) with a size of about 1e8. The size is set to a large value so that the table scales up to very large corpora.

The list contains the index of each word in the vocabulary, repeated so that the probability of drawing a word from the list is proportional to its original frequency count raised to the power of 0.75.



In [72]:
# This is a simulation of how initTableNegatives works
# The impl. in utils.py is a bit different, but the idea is the same
# Toy vocabulary: word -> raw frequency count
word_frequency = {"a": 1, "b": 2, "c": 3, "d": 4}

# Worked example of the 0.75-power scaling applied to the counts above:
# the scaled sum of frequencies Z = 1**0.75 + 2**0.75 + 3**0.75 + 4**0.75 = 7.7897270
# then the scaled probability of a = 1**0.75 / Z = 0.12837420128374202
# the scaled probability of b = 2**0.75 / Z = 0.21589881215898812
# the scaled probability of c = 3**0.75 / Z = 0.29262990292629903
# the scaled probability of d = 4**0.75 / Z = 0.3630970836309708

def initTableNegatives(frequencies=None, table_size=None):
    """Build a shuffled table of word indices for negative sampling.

    Word index `i` (the position of the word in `frequencies`) appears in the
    table a number of times proportional to `count_i ** 0.75`, so drawing
    uniformly from the table samples words by their scaled frequency.

    Args:
        frequencies: mapping word -> raw frequency count. Defaults to the
            notebook-level `word_frequency` dict.
        table_size: target length of the table. Defaults to
            `CorpusReader.NEGATIVE_TABLE_SIZE`. Because each word's share is
            rounded independently, the actual length can differ from the
            target by a few entries.

    Returns:
        A 1-D numpy array of word indices in random order.
    """
    if frequencies is None:
        frequencies = word_frequency
    if table_size is None:
        table_size = CorpusReader.NEGATIVE_TABLE_SIZE

    pow_frequency = np.array(list(frequencies.values())) ** 0.75
    ratio = pow_frequency / pow_frequency.sum()
    count = np.round(ratio * table_size).astype(np.int64)
    # np.repeat builds the table directly as an array; growing a Python list
    # to ~1e8 entries first would cost far more time and memory.
    negatives = np.repeat(np.arange(len(count)), count)
    np.random.shuffle(negatives)
    return negatives

# Build the simulated table once; the next cell inspects `negatives`.
negatives = initTableNegatives()

In [73]:
# Table length and the distinct word indices it holds: a -> 0, b -> 1, c -> 2, d -> 3
print(len(negatives))
print(set(negatives))

# Share of the table taken by each index; these should be the scaled
# probabilities of a, b, c and d, in that order
table_len = len(negatives)
for wid in range(4):
    print(np.sum(negatives == wid) / table_len)

99999999
{0, 1, 2, 3}
0.12837420128374202
0.21589881215898812
0.29262990292629903
0.3630970836309708


Next, the `getNegatives` method returns the negative samples for a target word. The idea is to chop off a segment of given `size` from the `negatives` list. 

If the segment contains the target word, it is discarded and a new segment is taken. This is done to prevent the target word itself from being sampled as a negative.

In [74]:
# Test some examples
# Ask for 5 negative samples for target word id 1

corpus.getNegatives(target=1, size=5)

array([ 45, 210,  93,  27, 218])

### Generate data for training

Now we are going to implement the sliding window to generate center, outside, and negative words for each position in a sentence.

- It takes a list of words as input and goes through each word as a center word.
- For each center word, both the left and right `window_size` words are considered as outside words. This number is smaller near the two ends of the sentence.
- Call `corpus.getNegatives` to get negative samples for each center word.

In [75]:
def generate_data(words: List[str], window_size: int, k: int, corpus: "CorpusReader"):
    """ Generate the training data for word2vec skip-gram model

    Every word is used once as the center word. For each center word, up to
    `window_size` words on each side are taken as outside words (fewer near the
    two ends of the sentence), left neighbours first and then right neighbours.
    A fresh set of negative samples is drawn for every (center, outside) pair.

    Args:
        words: the input sentence as a list of words; each word is looked up in
            `corpus.word2id`, and a failed lookup propagates to the caller
        window_size: the number of words on each side of the center word that
            count as outside words
        k: the number of negative samples
        corpus: the corpus object, providing utilities such as word2id, getNegatives, etc.
    Yields:
        a tuple (center_id, outside_id, negative_samples), where
        negative_samples is the value returned by `corpus.getNegatives(center_id, k)`
    """
    ### YOUR CODE HERE ###
    word_ids = [corpus.word2id[word] for word in words]

    for center_index, center_id in enumerate(word_ids):
        # Clip the window to the sentence boundaries
        window_start = max(0, center_index - window_size)
        window_end = min(center_index + window_size + 1, len(word_ids))

        for outside_index in range(window_start, window_end):
            if outside_index == center_index:
                continue  # the center word is not its own outside word
            negative_samples = corpus.getNegatives(center_id, k)
            yield center_id, word_ids[outside_index], negative_samples

    ### END YOUR CODE ###

In [76]:
# Test generate_data
# Split the sample sentence into single characters; each character is one word
text = "学而时习之"
words = list(text)
print('words:', words)
print('word ids:', [corpus.word2id[word] for word in words])

# first center word is 学
# With a window of 3, its outside words are the three characters that follow it
print()
print(f'When window size is 3, for center word 学 -> {corpus.word2id["学"]}')
print(f'the outside words are: ')
print(f'而 -> {corpus.word2id["而"]}')
print(f'时 -> {corpus.word2id["时"]}')
print(f'习 -> {corpus.word2id["习"]}')

# Materialise the generator and show the first three tuples, all of which
# have 学 as the center word
print()
print('output from generate_data:')
data = list(generate_data(list(text), window_size=3, k=5, corpus=corpus))
print(data[:3])


### You are expected to see the following output:
### Note that the negative samples are random, so you may see different numbers
# words: ['学', '而', '时', '习', '之']
# word ids: [46, 8, 224, 544, 5]

# When window size is 3, for center word 学 -> 46
# the outside words are: 
# 而 -> 8
# 时 -> 224
# 习 -> 544

# output from generate_data:
# [(46, 8, array([354,   3, 831, 570,  27])),
#  (46, 224, array([1077, 1095,   89,  340,   92])),
#  (46, 544, array([ 49, 488,   4, 269,  30]))]

words: ['学', '而', '时', '习', '之']
word ids: [46, 8, 224, 544, 5]

When window size is 3, for center word 学 -> 46
the outside words are: 
而 -> 8
时 -> 224
习 -> 544

output from generate_data:
[(46, 8, array([294,  29, 665, 326, 510])),
 (46, 224, array([ 94, 712,  73, 374, 276])),
 (46, 544, array([ 33, 842,   0, 619, 123]))]


However, the above data are not batched. We want all center words to be batched into a tensor of dimension `batch_size`; the same goes for the outside words and negative samples.

For example, in "学而时习之", if `batch_size` is 4, then the returned batch[0] will contain three tensors. 
- The first tensor contains center words, i.e., 3 "学" plus 1 "而" => [46, 46, 46, 8]
- The second tensor contains the corresponding outside words, i.e., "而", "时", and "习" for "学"; "学" for "而" => [8, 224, 544,  46]
- The third tensor contains the negative samples, whose dimension is `batch_size` $\times$ `k`
  
The data type of the tensors is `torch.long`.

In [77]:
def batchify(data: List, batch_size: int):
    """ Group a list of training tuples into batches and yield them as torch tensors.

    The last batch is padded with tuples taken from the start of `data` when it
    would otherwise be shorter than `batch_size`, so every batch has exactly
    `batch_size` rows.

    Args:
        data: a list of (center, outside, negative) tuples; it must support
            `len()` and slicing, so pass a list rather than a generator
        batch_size: the batch size; must be smaller than `len(data)`
    Yields:
        a tuple of three torch tensors of dtype torch.long: center (batch_size,),
        outside (batch_size,), negative (batch_size, k)
    """
    assert batch_size < len(data) # data should be long enough
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        if len(batch) < batch_size: # if the last batch is smaller than batch_size, pad it with the first few data
            batch = batch + data[:batch_size - len(batch)]

        ### YOUR CODE HERE ###
        centers, outsides, negatives = zip(*batch)

        center_tensor = torch.tensor(centers, dtype=torch.long)
        outside_tensor = torch.tensor(outsides, dtype=torch.long)
        # Stack the per-example negative samples into one ndarray first:
        # torch.tensor() on a list of ndarrays is very slow and emits a UserWarning.
        negative_tensor = torch.tensor(np.array(negatives), dtype=torch.long)

        yield center_tensor, outside_tensor, negative_tensor

        ### END YOUR CODE ###


### You are expected to see the following output:
### Note that the negative samples are random, so you may see different numbers
# (tensor([46, 46, 46,  8]), tensor([  8, 224, 544,  46]), tensor([[  85,    3,   72,   26,   35],
#         [   7,    1,  487,   20,    4],
#         [  12,  227,    2,   25,  639],
#         [ 582,  148,   15, 1203,   85]]))

In [78]:
# Test batchify

# Rebuild the (center, outside, negatives) tuples for the sample sentence
text = "学而时习之"
words = list(text)
data = list(generate_data(words, window_size=3, k=5, corpus=corpus))

# Materialise every batch and print the first one
batches = list(batchify(data, batch_size=4))
print(batches[0])

(tensor([46, 46, 46,  8]), tensor([  8, 224, 544,  46]), tensor([[  89,  181,   19,   40, 1219],
        [ 505,    1,   70,  228,   35],
        [ 485,  382,   56,   78,    1],
        [   1,   51,  396,  208,  521]]))


### Implement the SkipGram class

`SkipGram` is a subclass of `nn.Module`. The two key components are:
- `__init__`: initialize the embeddings
  - Two `nn.Embedding` objects are created: `self.emb_v` for center words; `self.emb_u` for outside words and negative samples.
  - Each `nn.Embedding` is created with `vocab_size` and `emb_size` as input arguments. 
  - `self.emb_v` is initialized with uniform distribution; `self.emb_u` is initialized with zeros.
- `forward`: given input tensors, return the loss of the model
  - Takes three tensors as input: center words, outside words, and negative samples. They are the output from the previously defined `batchify` function.
  - Compute the loss using the formula: $-\log\sigma(v_c \cdot u_o) - \sum_{k=1}^K \log\sigma(-v_c \cdot u_k)$

*Hint*:
- For the $\log\sigma$ function, you can use `F.logsigmoid` in PyTorch. See the imported module: `import torch.nn.functional as F`
- If the input to `F.logsigmoid` is too large, it will return 0, which is not good for training. You can use `torch.clamp` to limit the input to a certain range. For example, `torch.clamp(x, min=-10, max=10)` will limit the input to be in the range of $[-10, 10]$.

In [79]:
class SkipGram(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two embedding tables of shape (vocab_size, emb_size): `emb_v` for
    center words and `emb_u` for outside words and negative samples.
    """

    def __init__(self, vocab_size, emb_size):
        """
        Args:
            vocab_size: the number of rows in each embedding table
            emb_size: the dimension of each embedding vector
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        # sparse=True makes the gradients of the embedding weights sparse tensors,
        # which only some optimizers accept (see the nn.Embedding documentation).
        self.emb_v = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.emb_u = nn.Embedding(vocab_size, emb_size, sparse=True)

        initrange = 1.0 / self.emb_size # some experience passed down from generation to generation
        nn.init.uniform_(self.emb_v.weight, -initrange, initrange)
        nn.init.constant_(self.emb_u.weight, 0)

    def forward(self, center, outside, negative):
        """ Compute the negative-sampling loss for one batch.

        Args:
            center: the center word indices (B, )
            outside: the outside word indices (B, )
            negative: the negative word indices (B, k)
        Returns:
            a scalar tensor: -log sigmoid(v_c . u_o) - sum_k log sigmoid(-v_c . u_k),
            summed over all B examples in the batch
        """
        v_c = self.emb_v(center)  # (B, D)
        u_o = self.emb_u(outside)  # (B, D)
        u_n = self.emb_u(negative)  # (B, k, D)
        #
        # ### YOUR CODE HERE ###
        # Positive sample score: dot product of each center/outside pair.
        # Scores are clamped to [-10, 10] before logsigmoid.
        pos_score = torch.sum(v_c * u_o, dim=1)  # (B,)
        pos_loss = F.logsigmoid(torch.clamp(pos_score, min=-10, max=10))  # (B,)

        # Negative sample scores: dot product of the center vector with each of its k negatives
        neg_score = torch.bmm(u_n, v_c.unsqueeze(2)).squeeze(2)  # (B, k)
        neg_loss = F.logsigmoid(torch.clamp(-neg_score, min=-10, max=10))  # (B, k)

        # Combine losses
        loss = -torch.sum(pos_loss + torch.sum(neg_loss, dim=1))  # Scalar

        # ### END YOUR CODE ###

        return loss

    def save_embedding(self, id2word, file_name):
        """ Write the center-word embeddings (`emb_v`) to a text file.

        The first line is "<len(id2word)> <emb_size>"; each following line is a
        word followed by its space-separated embedding values.

        Args:
            id2word: mapping word id -> word; each id indexes a row of `emb_v`
            file_name: path of the output file; it is overwritten if it exists
        """
        embedding = self.emb_v.weight.detach().cpu().numpy()
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII words such as Chinese characters.
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write('%d %d\n' % (len(id2word), self.emb_size))
            for wid, w in id2word.items():
                e = ' '.join(map(str, embedding[wid]))
                f.write('%s %s\n' % (w, e))

In [80]:
# Test the model
vacob_size = len(corpus.id2word)
emb_size = 32
model = SkipGram(vacob_size, emb_size)

# Deterministic weights: every entry of row i is start_value + i * 0.01
start_value = 0.01
weight = torch.empty(vacob_size, emb_size)
for i, row in enumerate(weight):
    row.fill_(start_value + i * 0.01)  # rows are views, so this writes into `weight`

# Load the same weights into both tables and print the loss of every batch
with torch.no_grad():
    model.emb_v.weight.copy_(weight)
    model.emb_u.weight.copy_(weight)

    for batch in batches:
        center, outside, negative = batch
        loss = model(center, outside, negative)
        print(loss)

### You are expected to see the following output:
### Note that the negative samples are random, so you may see different numbers
# tensor([25.4124, 36.0555, 31.7922, 22.9003])
# tensor([30.0502, 14.1560, 26.8572, 45.7634])
# tensor([45.7649, 50.0003, 50.0134, 50.0003])
# tensor([45.2376, 40.6255, 41.9053,  9.2085])
# tensor([34.3999, 26.6441, 25.4124, 36.0555])

tensor(143.8297)
tensor(92.2233)
tensor(180.2658)
tensor(158.9765)
tensor(137.2470)


In [81]:
# Test the loss on a hand-made example: one center word, embedding size 5, two negatives
v_c = torch.FloatTensor([[0.1, 0.2, 0.3, 0.4, 0.5]])
u_o = torch.FloatTensor([[0.5, 0.4, 0.3, 0.2, 0.1]])
u_n = torch.FloatTensor([[[0.2, 0.2, 0.2, 0.2, 0.2],
                          [0.3, 0.3, 0.3, 0.3, 0.3]]])

### YOUR CODE HERE ###
# Positive term: log sigmoid of the clamped center/outside dot product
pos_score = (v_c * u_o).sum(dim=1)  # (B,)
pos_loss = F.logsigmoid(pos_score.clamp(min=-10, max=10))  # (B,)

# Negative term: log sigmoid of the negated, clamped center/negative dot products
neg_score = torch.bmm(u_n, v_c.unsqueeze(2)).squeeze(2)  # (B, k)
neg_loss = F.logsigmoid((-neg_score).clamp(min=-10, max=10))  # (B, k)

# Total loss: negated sum of both terms over the batch
loss = -(pos_loss + neg_loss.sum(dim=1)).sum()  # Scalar

print(f"Loss: {loss.item()}")

### You are expected to see the following output:
# Loss: 2.330986261367798

Loss: 2.330986499786377
