## CBOW - Word2Vec Implementation Pytorch

In [1]:
import re
import nltk
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from nltk.corpus import webtext
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 200
%matplotlib inline

Using TensorFlow backend.


In [2]:
import nltk
nltk.download('stopwords')
nltk.download('webtext')
from nltk.corpus import brown

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sajid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\Sajid\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


## Pre-Processing text Code

In [12]:
wordpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wordpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [13]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']

corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beautiful today,weather
7,The dog is lazy but the brown fox is quick!,animals


In [14]:
# build a sample vocab
vocab = []
print(webtext.fileids())
print(len(webtext.raw('firefox.txt'))) 
for fileid in webtext.fileids():
    vocab.append(webtext.raw('firefox.txt'))

    #print(brown.raw('cb01').strip()[:1000])  

['firefox.txt', 'grail.txt', 'overheard.txt', 'pirates.txt', 'singles.txt', 'wine.txt']
564601


### text preprocessing (Remove tags e.g HTML,Remove special characters, Remove stopwords) === Clean data

In [15]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(corpus)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in corpus]

vocab_size = len(word2id)
embed_size = 100
window_size = 2

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 31
Vocabulary Sample: [('the', 1), ('is', 2), ('and', 3), ('sky', 4), ('blue', 5), ('beautiful', 6), ('quick', 7), ('brown', 8), ('fox', 9), ('lazy', 10)]


### [context_words, target_word] pairs

In [16]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    X = []
    Y = []
    context_length = window_size*2
    for words in wids:
        sentence_length = len(words)
        for index, word in enumerate(words):           
            start = index - window_size
            end = index + window_size + 1
            context = [words[i] for i in range(start, end)if 0 <= i < sentence_length and i != index]
            x = sequence.pad_sequences([context], maxlen=context_length)
            X.append(x)
            Y.append(word)
    return X,Y

## CBOW (Contineous bag of Words Model architecture)

In [38]:
import torch
import torch.nn as nn
import numpy as np

class CBOW(torch.nn.Module):

    def __init__(self, inp_size , vocab_size, embedding_dim=100):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 100)
        self.activation_function1 = nn.ReLU()        
        self.linear2 = nn.Linear(100, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        
    def forward(self, inputs):
        embeds = sum(self.embeddings(torch.from_numpy(inputs).long().cuda())).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    
model = CBOW(window_size*2,vocab_size).cuda()
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

torch.save(model.state_dict(), "Cbow_Weights")

# model = TheModelClass(*args, **kwargs)
# model.load_state_dict(torch.load(PATH))
# model.eval()

Model's state_dict:
embeddings.weight 	 torch.Size([31, 100])
linear1.weight 	 torch.Size([100, 100])
linear1.bias 	 torch.Size([100])
linear2.weight 	 torch.Size([31, 100])
linear2.bias 	 torch.Size([31])
Optimizer's state_dict:
state 	 {}
param_groups 	 [{'lr': 0.001, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [2698026040488, 2698026040416, 2698026040560, 2698026040632, 2698026040704]}]


In [43]:
for epoch in range(1, 100):
    loss = 0
    i = 0
    X,Y = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for x, y in zip(X,Y):
        i += 1
        optimizer.zero_grad()
        log_probs = model(x[0])
        loss = loss_function(log_probs,torch.Tensor([y]).long().cuda())
        loss.backward()
        optimizer.step()
        loss += loss.data
    print('Epoch:', epoch, '\tLoss:', loss)

Epoch: 1 	Loss: tensor(9.2254, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 2 	Loss: tensor(8.5779, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 3 	Loss: tensor(8.2109, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 4 	Loss: tensor(7.9904, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 5 	Loss: tensor(7.7918, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 6 	Loss: tensor(7.5436, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 7 	Loss: tensor(7.2815, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 8 	Loss: tensor(7.0081, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 9 	Loss: tensor(6.7421, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 10 	Loss: tensor(6.4857, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 11 	Loss: tensor(6.2545, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 12 	Loss: tensor(6.0239, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 13 	Loss: tensor(5.8107, device='cuda:0', grad_fn=<AddBackward0>)
Epoch: 14 	Loss: tensor(5.6046, device='cuda:0', grad_fn=<Ad

In [44]:
weights = model.embeddings(torch.Tensor([list(range(0,vocab_size))]).long().cuda())

pd.DataFrame(weights.view(-1,100).tolist(), index=list(id2word.values())[0:]).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,0.246579,-0.562335,0.043182,1.90539,2.897158,-0.94587,-1.171224,-0.234848,-0.303074,-1.106695,...,0.398234,2.262729,-1.240966,-1.615071,0.430559,1.204429,1.962709,-0.705347,2.702054,0.223823
is,-0.128564,-0.437242,-0.335543,-0.279052,-1.862769,-1.401087,2.30153,0.005477,0.660442,0.530043,...,0.830228,1.139872,1.804603,1.048361,0.612678,0.142685,-0.640898,0.269062,-1.383707,0.493586
and,0.147716,1.590648,-0.531452,0.364238,-2.081264,1.953997,0.312316,0.132126,0.342545,1.614952,...,0.516661,1.344398,0.010004,-0.000935,-0.911071,-1.500323,0.006096,-0.90833,0.094563,0.560542
sky,0.798209,0.580648,-0.940805,1.174904,0.018244,0.258995,0.891437,-0.153361,-0.161141,1.60359,...,1.090361,0.094175,-1.023437,-0.071496,0.39797,-0.808378,-0.582422,0.041686,0.695446,-0.152871
blue,0.167987,1.054565,0.945732,1.07018,-0.046181,0.503633,0.014728,-1.5162,0.631779,0.07318,...,0.178417,1.100883,-0.255432,0.796993,0.631346,-1.623453,0.385617,-0.030791,1.651315,-1.189646
beautiful,-0.910202,-0.996826,0.262043,-0.553207,1.099605,0.841212,-0.795247,-0.425355,-0.631225,-1.262379,...,0.956158,-1.592699,0.07486,-0.156407,-0.830934,0.879292,1.34959,0.840808,-0.751444,1.213342
quick,1.481157,0.320704,-0.216832,-0.109891,-0.586104,0.447652,-0.05632,-1.332508,1.544559,-0.488736,...,0.380646,0.985778,-0.064054,-1.262825,-1.319752,-0.027141,1.897707,-0.377066,-0.986022,-0.377845
brown,0.343019,0.48202,1.034114,-0.191746,1.860478,0.331806,-1.634989,-0.210357,-0.323028,0.536377,...,0.15001,0.171598,0.327116,0.859752,-0.426213,0.081196,0.082185,0.586825,0.126417,1.557284
fox,1.301593,-0.507659,-1.305631,-0.958802,0.064477,-0.490074,-0.459826,-0.253821,-1.890259,0.171855,...,0.673046,1.608326,0.475421,-0.593568,0.390986,1.883014,-1.988344,-1.608087,0.96023,0.491502
lazy,-0.576814,0.395293,-1.719962,-0.775848,1.042978,-0.482781,1.346323,1.020422,0.363774,-1.230762,...,1.426381,-1.102519,0.683267,-0.122452,0.89115,-0.426539,-0.195252,0.478174,-0.834393,0.212006


In [45]:
from sklearn.metrics.pairwise import euclidean_distances

weights = weights.view(-1,100)
distance_matrix = euclidean_distances(weights.cpu().detach().numpy())
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:4]+1] 
                 for search_term in ['the', 'fox', 'beautiful','brown','lazy']}

similar_words

{'the': ['quick', 'blue', 'bacon'],
 'fox': ['eggs', 'i', 'this'],
 'beautiful': ['lazy', 'this', 'a'],
 'brown': ['green', 'very', 'ham'],
 'lazy': ['beautiful', 'i', 'fox']}