Source: [xssChauhan/word2vec] (https://github.com/xssChauhan/word2vec/blob/master/pytorch/CBOW.ipynb)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torch.nn.functional as F
from torch.autograd import Variable

In [3]:
from nltk.tokenize import word_tokenize

# Source for text excerpt: 
# https://www.advancedwriters.com/blog/descriptive-essay-on-nature/

text = '''Man has appreciated nature and still does. He is both challenged 
and pacified by it. Not only is nature beautiful, it is every changing through 
different seasons, or timelessly unchanged in it fixed elements such as its 
great mountain ranges. It has a wild beauty to it. There is a valley in central 
Africa that when you are there it seems as if you went back in time. This is 
the Zambezi river valley that starts in the wetlands of the Okavango swamps. 
The valley is 1500 miles of wilderness, totally unspoiled by man’s encroachment. 
You see only the wildness of nature. The river flows proudly through the valley. 
It is a surging force as it goes through rocky rapids, or wide and tranquil where 
it finds space. On its banks are mud flats and reeds, where crocodiles lie in the sun, 
and further away dense trees and forests of Mopani trees, interspersed with huge grey 
prehistoric baobab trees with branches that look like roots. In the day, the sun is a 
burning yellow fire, and everything wilts under it. Even the wild life finds shade and 
lies down. As the evening comes the setting sun paints the sky with streaks of pink and 
orange, and the animals emerge.

They come individually or in groups. In the water large hippopotamus frolic, 
not intimidated by the presence of crocodiles. Nervous buck come dancing to 
the river.

Large tan colored kudu, as tall as a horse, with their white flashes and meter 
long spiral horns, smaller dark brown impala with short spiked horns, tiny 
brown duiker.

They carefully approach; stopping to be sure, no predators are near. They dip 
their heads gracefully to drink. Some suddenly will jump and struggle as a 
crocodile grabs it and drags it under the water. Elephants come and splash 
around squirting water over themselves with their long trunks, or rolling in 
the mud, which is to them a treat.

Lions eventually arrive in a pride, causing the buck to move nervously away. 
The dusk gives way to the sudden blackness of the night sky studded with silver 
stars and a huge silver moon. Soon the animals were gone; the river flows on 
into the night.

Not far away there was a noise like thunder that sounded constantly. In the 
early morning, flowing the river alive and sparkling in the sun, crocodiles 
basking in the warmth, animals drinking while it was still cool, the river 
broadened and flowed in channels around green islands. Then it fell down a 
100-meter chasm as a magnificent waterfall, 1708 meters wide. As the river 
fell down the chasm the sound was as thunder, and water spray rose high in 
the sky, white like the smoke of a bush fire. The bush is like a tropical 
forest as the spray rains down on it continually, and it is untouched by man. 
From here, it flows into a great lake and thence to the Indian Ocean.''' \
    .lower() # note no splitting here like in CBOW


# NOTE: must download nltk's punkt tokenizer (technicalities evernote) for this to work. 
words = word_tokenize(text)

vocabulary = set(words)
wordToIndex = {w:i for i, w in enumerate(vocabulary)}
indexToWord = {i:w for i, w in enumerate(vocabulary)}

In [10]:
print(len(wordToIndex))
print(wordToIndex)

262
{'seasons': 0, 'appreciated': 1, '’': 2, 'roots': 3, 'spiral': 4, 'approach': 5, 'magnificent': 6, 'surging': 7, 'both': 8, 'tan': 9, 'noise': 10, 'into': 11, 'will': 12, 'splash': 13, 'rapids': 14, 'burning': 15, 'flows': 16, 'presence': 17, 'flashes': 18, 'be': 19, 'see': 20, 'kudu': 21, 'drinking': 22, 'wild': 23, 'squirting': 24, 'back': 25, 'totally': 26, '.': 27, 'day': 28, 'he': 29, 'they': 30, 'dark': 31, 'tiny': 32, ';': 33, 'studded': 34, 'some': 35, 'alive': 36, 'is': 37, 'spray': 38, 'rose': 39, 'lake': 40, 'meter': 41, 'sudden': 42, 'flats': 43, 'forests': 44, 'mopani': 45, 'challenged': 46, 'water': 47, 'smoke': 48, 'central': 49, 'meters': 50, 'mountain': 51, '100-meter': 52, 'banks': 53, 'even': 54, 'river': 55, 'dusk': 56, 'islands': 57, 'unspoiled': 58, 'wildness': 59, 'grabs': 60, 'thunder': 61, 'fire': 62, 'or': 63, 'individually': 64, 'dancing': 65, 'baobab': 66, 'basking': 67, 'yellow': 68, 'thence': 69, 'branches': 70, 'predators': 71, 'the': 72, 'down': 73, 

In [11]:
from types import SimpleNamespace
import random
random.seed(42)

In [12]:
def generateNegativeSamples(targetIndex, indexRange, k):
    """
    
    :param targetIndex: 
    :param indexRange: ranges of index to select from
    :param k: 
    :return: 
    """
    
    randomIndicesSample = random.sample(population=indexRange, k=6)
    
    return SimpleNamespace(
        target=wordToIndex[words[targetIndex]], 
        context=[wordToIndex[word] for word in [words[index] for index in randomIndicesSample]],
        label = 0
    )

In [13]:
def textToTrain(words, contextWindowSize=2, k=6):
    """
    Make training data from words. 
    For 1 positive sample, generate `k` negative samples
    
    :param words: 
    :param contextWindowSize: 
    :param k: 
    :return: 
    """
    # TODO: are these samples words / tensors??
    posSamples = []
    negSamples = []
    
    contextRange = range(-contextWindowSize, contextWindowSize + 1)
    
    for currIndex in range(contextWindowSize, len(words) - contextWindowSize):
        
        # Create positive samples
        for relativeIndex in contextRange:
            if currIndex + relativeIndex != currIndex:
                posSamples.append(SimpleNamespace(
                    target=wordToIndex[words[currIndex]],
                    context=wordToIndex[words[currIndex + relativeIndex]],
                    label = 1
                ))
                
        # Create negative samples
        for _ in contextRange:
            
            randNum = random.random()
            
            leftSideIndexRange = None
            rightSideIndexRange = None 
            
            # Select from left hand side of target
            if (currIndex - contextWindowSize - 2*k) > 0:
                # This also accounts for the fact that there should be
                # enough samples on the LHS to select from
                leftSideIndexRange = range(0, currIndex - contextWindowSize)
                
            if (currIndex + contextWindowSize + 2*k) < len(words):
                # If random value is >= 0.5 or there are not enough samples
                # on the LHS, then ...
                rightSideIndexRange = range(currIndex + contextWindowSize, len(words))
                
            if leftSideIndexRange and rightSideIndexRange:
                # pick the left or right arbitrarily
                indexRange = random.choice([leftSideIndexRange, rightSideIndexRange])
            elif leftSideIndexRange:
                indexRange = leftSideIndexRange
            else:
                indexRange = rightSideIndexRange
                
            negSamples.append(
                generateNegativeSamples(
                    targetIndex=currIndex, 
                    indexRange=indexRange, 
                    k=k
                )
            )
            
    return posSamples, negSamples

In [15]:
posData, negData = textToTrain(words)

print(posData[:10])
print("\n")
print(negData[:10])

[namespace(context=143, label=1, target=1), namespace(context=195, label=1, target=1), namespace(context=110, label=1, target=1), namespace(context=229, label=1, target=1), namespace(context=195, label=1, target=110), namespace(context=1, label=1, target=110), namespace(context=229, label=1, target=110), namespace(context=132, label=1, target=110), namespace(context=1, label=1, target=229), namespace(context=110, label=1, target=229)]


[namespace(context=[229, 24, 204, 215, 37, 63], label=0, target=1), namespace(context=[160, 229, 133, 243, 234, 55], label=0, target=1), namespace(context=[209, 162, 201, 220, 220, 57], label=0, target=1), namespace(context=[228, 27, 220, 27, 175, 234], label=0, target=1), namespace(context=[133, 150, 224, 117, 72, 89], label=0, target=1), namespace(context=[44, 224, 37, 30, 226, 35], label=0, target=110), namespace(context=[140, 57, 227, 229, 73, 185], label=0, target=110), namespace(context=[133, 133, 87, 133, 168, 40], label=0, target=110), namespace

In [17]:
def unpackDataPoint(dataPoint):
    return dataPoint.target, dataPoint.context, dataPoint.label 

def dataToVariable(data, dtype=torch.LongTensor):
    tensor = Variable(dtype(data))
    return tensor 

In [18]:
class SkipGram(nn.Module):
    
    def __init__(self, vocabSize, embeddingSize):
        super().__init__()
        self.targetEmbedding = nn.Embedding(vocabSize, embeddingSize)
        self.contextEmbedding = nn.Embedding(vocabSize, embeddingSize)
        
        
    def forward(self, target, positiveContext, negativeContext):
        targetTensor = dataToVariable([target])
        posContextTensor = dataToVariable([positiveContext])
        negContextTensor = dataToVariable([negativeContext])
        
        posEmbedding = self.contextEmbedding(posContextTensor)
        negEmbedding = self.contextEmbedding(negContextTensor)
        targetEmbedding = self.targetEmbedding(targetTensor)
        
        posDot = torch.matmul(posEmbedding, torch.t(targetEmbedding))
        negDot = torch.matmul(targetEmbedding, torch.t(-negEmbedding.squeeze()))
        
        # Calculate the loss
        loss = -(F.logsigmoid(posDot) + F.logsigmoid(negDot).sum())
        
        # Maximize the `loss`, hence, minimize the `negative loss`
        return loss 
        

In [19]:
# Testin the moddel with dummy data

from torch.autograd import Variable 

In [20]:
posSample = 1
negSample = [10,11,12]
target = 0

model = SkipGram(vocabSize=20, embeddingSize=10)
loss = model(target, posSample, negSample)

In [21]:
print(model)
print(loss)

SkipGram(
  (targetEmbedding): Embedding(20, 10)
  (contextEmbedding): Embedding(20, 10)
)
tensor([[3.6973]], grad_fn=<NegBackward>)


In [22]:
del model 
del loss

In [23]:
## Train the model
learningRate = 0.001
epochs = 100

In [25]:
model = SkipGram(vocabSize=len(vocabulary), embeddingSize=300)
optimizer = optim.SGD(model.parameters(), lr = learningRate)

In [26]:
print(negData[1])

namespace(context=[160, 229, 133, 243, 234, 55], label=0, target=1)
