# Project: Detect Similar Words Using the Word2Vec Technique - NLP Word Embeddings

### Goal: find top 15 words that are similar to "two", "america", and "computer", respectively.

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random

import scipy
from sklearn.metrics.pairwise import cosine_similarity

random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x7f96f639f390>

In [23]:
M = 3                # context window
K = 15               # number of negative samples
epochs = 2
MAX_VOCAB_SIZE = 10000
EMBEDDING_SIZE = 100  
batch_size = 32
lr = 0.2

In [25]:
# open the file and read the texts
with open('/Users/wushuangyan/Desktop/STAT classes/Data Science/text8.train.txt') as f:
    text = f.read()

text = text.lower().split()                                         # simple pre-processing
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))    # extract a vocabulary, each vocabulary item is (word, word_index)
vocab_dict['<UNK>'] = len(text) - np.sum(list(vocab_dict.values())) # encode the uncommon terms as "<UNK>"

word2idx = {word:i for i, word in enumerate(vocab_dict.keys())}     # index
idx2word = {i:word for i, word in enumerate(vocab_dict.keys())}
word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)                      # freq or probability
word_freqs = word_freqs ** (3./4.)    

In [None]:
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word2idx, word_freqs):
        ''' text: a list of words, all text from the training dataset
            word2idx: the dictionary from word to index
            word_freqs: the frequency of each word
        '''
        super(WordEmbeddingDataset, self).__init__()                                    # next line: unknown works are given index of UNK's index
        self.text_encoded = [word2idx.get(word, word2idx['<UNK>']) for word in text]    # map each word to its index
        self.text_encoded = torch.LongTensor(self.text_encoded)                         # turn index to pytorch LongTensor type
        self.word2idx = word2idx
        self.word_freqs = torch.Tensor(word_freqs)
        

    def __len__(self):
        return len(self.text_encoded)                                                 # return the total number of words in the texts
    

    def __getitem__(self, idx):
        ''' Prepare training batches. Each mini-batch contains the following:
            - idx: the index of the center word
            - Get the neighbor words of the center word
            - Randomly sample K words as negative sampling
        '''
        center_words = self.text_encoded[idx]                                         # center word by index
        pos_indices = list(range(idx - M, idx)) + list(range(idx + 1, idx + M + 1))   # neighbor words
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]               # avoid index going beyond 0 or vocabulary size
        pos_words = self.text_encoded[pos_indices]                                    # index for context words
        
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)  # 15 * 6 = 90, sampling freq for each context, negative sampling
                                                                                      # show the position index
                                                                                      # neg_words should not contain neighbor words
        while len(set(pos_indices) & set(neg_words)) > 0:                             # pos =6 # neg =90, if contain, resampling
            neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)

        return center_words, pos_words, neg_words

In [None]:
dataset = WordEmbeddingDataset(text, word2idx, word_freqs)
dataloader = tud.DataLoader(dataset, batch_size, shuffle=True)

In [None]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)
        
    def forward(self, input_labels, pos_labels, neg_labels):  # pos: Neignbor, neg: negative sampling 
        input_embedding = self.in_embed(input_labels)         # size:   [batch_size, EMBEDDING_SIZE]                            
        pos_embedding = self.out_embed(pos_labels)            # size:   [batch_size, 2 * M * EMBEDDING_SIZE]
        neg_embedding = self.out_embed(neg_labels)            # size:   [batch_size, 2 * M * K * EMBEDDING_SIZE]
        
        input_embedding = input_embedding.unsqueeze(2)        # size:   [batch_size, EMBEDDING_SIZE * 1]
        
        pos_dot = torch.bmm(pos_embedding, input_embedding)   # size:   [batch_size, 2*M * 1]
        pos_dot = pos_dot.squeeze(2)                          # size:   [batch_size, 2*M]
        
        neg_dot = torch.bmm(neg_embedding, -input_embedding)  # size:   [batch_size, 2 * M * K * 1]
        neg_dot = neg_dot.squeeze(2)                          # size:   [batch_size, 2 * M * K]
        
        log_pos = F.logsigmoid(pos_dot).sum(1)                # size:    batch_size
        log_neg = F.logsigmoid(neg_dot).sum(1)              
        
        loss = log_pos + log_neg
        
        return -loss
    
    def input_embedding(self):
        return self.in_embed.weight.detach().numpy()

model = EmbeddingModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
for e in range(1):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()

        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()

        optimizer.step()

        if i % 200 == 0:
            print('epoch', e, 'iteration', i, loss.item())

        if i % 1000 == 0:
            embedding_weights = model.input_embedding()

embedding_weights = model.input_embedding()
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

In [None]:
def find_nearest(word):
    index = word2idx[word]
    embedding = embedding_weights[index]
    cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
    return [idx2word[i] for i in cos_dis.argsort()[:15]]

for word in ["two", "america", "computer"]:
    print(word, find_nearest(word))

two ['two', 'three', 'four', 'five', 'six', 'zero', 'one', 'seven', 'eight', 'nine', 'april', 'january', 'march', 'm', 'october']
america ['america', 'europe', 'africa', 'asia', 'australia', 'south', 'north', 'western', 'african', 'india', 'southern', 'zealand', 'china', 'central', 'canada']
computer ['computer', 'software', 'hardware', 'design', 'digital', 'computers', 'video', 'program', 'electronic', 'programs', 'technology', 'basic', 'programming', 'advanced', 'engineering']
