# Word2Vec (Negative Sampling)

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from string import punctuation
import time

In [2]:
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
from collections import Counter
import matplotlib
nltk.download('stopwords')
nltk.download('brown')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\swara\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

## 1. Load data

In [3]:
corpus = brown.sents()

In [4]:
stop_words = set(stopwords.words('english'))
corpus = [[word for word in sent if word.lower() not in stop_words] for sent in corpus]

# Remove punctuation from corpus
corpus = [[word for word in sent if word not in punctuation] for sent in corpus]

# Remove empty sentences
corpus = [sent for sent in corpus if len(sent) > 0]

# Remove sentences with less than 5 words
corpus = [sent for sent in corpus if len(sent) >= 5]

# Remove sentences with more than 20 words
corpus = [sent for sent in corpus if len(sent) <= 20]

# Remove rare words
word_freq = Counter([word for sent in corpus for word in sent])
corpus = [[word for word in sent if word_freq[word] > 5] for sent in corpus]

In [5]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [6]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['dog']

9954

In [7]:
vocabs.append('<UNK>')
word2index['<UNK>'] = len(vocabs)

In [8]:
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'twenty'

## 2. Prepare train data

In [9]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, window_size=2):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(window_size, len(doc) - window_size):
            #center word
            center = word2index[doc[i]]
            #outside words = rest of the words
            outside_start =  i - window_size
            outside_end =  i + window_size + 1

            for j in range(outside_start, outside_end):
                if i != j:  # Skip the center word
                    outside = word2index[doc[j]]
                    skipgrams.append([center, outside])
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus)

In [10]:
x.shape  #batch_size, 1

(2, 1)

In [11]:
x

array([[1754],
       [8987]])

In [12]:
y.shape  #batch_size 1

(2, 1)

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [13]:
z = 0.001

In [14]:
#count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

356098

In [15]:
vocabs

['startled',
 'brow',
 'optimistic',
 'sentence',
 'peering',
 'twenty',
 'winding',
 'five',
 'thickness',
 'numbered',
 'Shu',
 'reliable',
 'structure',
 'bidding',
 'atmospheric',
 'midnight',
 'talk',
 'full-time',
 'separation',
 'lead',
 'attached',
 'explode',
 'Seeds',
 'Howe',
 'Elec',
 'interest',
 'dedicated',
 'whites',
 'Gene',
 'Thank',
 'hatred',
 'recognition',
 'convey',
 'doubted',
 'eighth',
 'countries',
 'hollow',
 'linked',
 'early',
 'wiser',
 'railroads',
 'suggested',
 'burning',
 'native',
 'poetic',
 'leaf',
 'sympathy',
 'candidate',
 'detective',
 'phosphate',
 'Italians',
 'cycle',
 'agony',
 'capillary',
 'orthodontist',
 'Fosdick',
 'explains',
 'whisper',
 'Marshal',
 'increasing',
 'judged',
 'liquid',
 'Similarly',
 'characterization',
 'town',
 '$300',
 'exhaust',
 'enter',
 'viewing',
 'patent',
 'Supper',
 'Manhattan',
 'avoiding',
 'multiplicity',
 'Two',
 'constantly',
 'closing',
 'Anthony',
 'swell',
 'deck',
 'screws',
 'nerve',
 'collage',
 

$$P(w)=U(w)^{3/4}/Z$$

In [16]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'``': 45,
         "''": 45,
         '--': 22,
         'one': 22,
         'would': 20,
         'said': 17,
         'could': 14,
         'time': 14,
         'may': 12,
         'two': 12,
         'like': 11,
         'man': 11,
         'first': 11,
         'also': 10,
         'Af': 10,
         'must': 10,
         'new': 10,
         'made': 10,
         'back': 10,
         'even': 9,
         'much': 9,
         'way': 9,
         'years': 9,
         'many': 9,
         'see': 8,
         'Mr.': 8,
         'well': 8,
         'little': 8,
         'people': 8,
         'good': 8,
         'get': 8,
         'work': 8,
         'make': 8,
         'still': 8,
         'last': 7,
         'us': 7,
         'life': 7,
         'never': 7,
         'might': 7,
         'long': 7,
         'came': 7,
         'world': 7,
         'know': 7,
         'used': 7,
         'year': 7,
         'men': 7,
         'place': 6,
         'home': 6,
         'went': 6,
        

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [17]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [18]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [19]:
batch_size = 2
x, y = random_batch(batch_size, corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [20]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [21]:
y_tensor[1]

tensor([8603])

In [22]:
neg_samples[1]

tensor([ 4639,  9434, 10296,  7102,  7638])

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [23]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

In [24]:
#test your model
emb_size = 2
voc_size = len(vocabs)
model = SkipgramNeg(voc_size, emb_size)

In [25]:
loss = model(x_tensor, y_tensor, neg_samples)

In [26]:
loss

tensor(0.8802, grad_fn=<NegBackward0>)

## 5. Training

In [27]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [28]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
start = time.time()
num_epochs = 1000
window_size = 5

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, window_size)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()

    epoch_mins, epoch_secs = epoch_time(start, time.time())
    
    #print the loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | Loss: {loss:.6f} | Time: {epoch_mins}m {epoch_secs}s")

Epoch: 100 | Loss: 0.878421 | Time: 1m 25s
Epoch: 200 | Loss: 2.960189 | Time: 2m 42s
Epoch: 300 | Loss: 2.288852 | Time: 4m 1s
Epoch: 400 | Loss: 3.368248 | Time: 5m 21s
Epoch: 500 | Loss: 7.616087 | Time: 6m 42s
Epoch: 600 | Loss: 0.342476 | Time: 8m 2s
Epoch: 700 | Loss: 1.514150 | Time: 9m 22s
Epoch: 800 | Loss: 1.494411 | Time: 10m 38s
Epoch: 900 | Loss: 1.516802 | Time: 11m 54s
Epoch: 1000 | Loss: 1.920947 | Time: 13m 8s


## 6. Plot the embeddings

In [30]:
vocabs

['startled',
 'brow',
 'optimistic',
 'sentence',
 'peering',
 'twenty',
 'winding',
 'five',
 'thickness',
 'numbered',
 'Shu',
 'reliable',
 'structure',
 'bidding',
 'atmospheric',
 'midnight',
 'talk',
 'full-time',
 'separation',
 'lead',
 'attached',
 'explode',
 'Seeds',
 'Howe',
 'Elec',
 'interest',
 'dedicated',
 'whites',
 'Gene',
 'Thank',
 'hatred',
 'recognition',
 'convey',
 'doubted',
 'eighth',
 'countries',
 'hollow',
 'linked',
 'early',
 'wiser',
 'railroads',
 'suggested',
 'burning',
 'native',
 'poetic',
 'leaf',
 'sympathy',
 'candidate',
 'detective',
 'phosphate',
 'Italians',
 'cycle',
 'agony',
 'capillary',
 'orthodontist',
 'Fosdick',
 'explains',
 'whisper',
 'Marshal',
 'increasing',
 'judged',
 'liquid',
 'Similarly',
 'characterization',
 'town',
 '$300',
 'exhaust',
 'enter',
 'viewing',
 'patent',
 'Supper',
 'Manhattan',
 'avoiding',
 'multiplicity',
 'Two',
 'constantly',
 'closing',
 'Anthony',
 'swell',
 'deck',
 'screws',
 'nerve',
 'collage',
 

In [31]:
fish = torch.LongTensor([word2index['fish']])
fish

tensor([8214])

In [32]:
fish_embed_c = model.embedding_center(fish)
fish_embed_o = model.embedding_outside(fish)
fish_embed   = (fish_embed_c + fish_embed_o) / 2
fish_embed

tensor([[ 0.0139, -0.7598]], grad_fn=<DivBackward0>)

In [33]:
fish_embed_o

tensor([[-0.5562, -0.8370]], grad_fn=<EmbeddingBackward0>)

In [34]:
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
        
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    return embed[0][0].item(), embed[0][1].item()

In [35]:
get_embed('fruit')

(-0.03139428794384003, 0.0495094358921051)

In [36]:
get_embed('cat')

(-0.20721715688705444, -0.3159995675086975)

In [37]:
get_embed('dog')

(-0.04596894979476929, 0.18158817291259766)

In [38]:
get_embed('fish')

(0.013891816139221191, -0.7598029375076294)

## 7. Cosine similarity

In [39]:
fish = get_embed('fish')
fish

(0.013891816139221191, -0.7598029375076294)

In [40]:
fruit = get_embed('fruit')
fruit

(-0.03139428794384003, 0.0495094358921051)

In [41]:
cat = get_embed('cat')
cat

(-0.20721715688705444, -0.3159995675086975)

In [42]:
np.array(fish) @ np.array(cat)

0.2372187769998817

In [43]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

print(cosine_similarity(np.array(fish), np.array(cat)))
print(cosine_similarity(np.array(fish), np.array(fruit)))

0.8260750562076561
-0.8541719084896701


In [44]:
# Create a pickle of the model
import pickle

with open('skipgram_neg.pkl', 'wb') as f:
    pickle.dump(model, f)