## Skip-Gram - Word2Vec implementation in PyTorch

In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import torch as nn
from string import punctuation
from nltk.corpus import webtext
from nltk.corpus import gutenberg
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import text
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
import nltk
# Download the NLTK data packages named 'stopwords' and 'webtext'.
nltk.download('stopwords')
nltk.download('webtext')

In [None]:
# Toy corpus: eight short sentences, stored as a NumPy array of strings.
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
    'I love green eggs, ham, sausages and bacon!',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
])
# One category label per sentence, in the same order as `corpus`.
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']

# Two-column table of the corpus; `columns` pins the column order.
corpus_df = pd.DataFrame(
    {'Document': corpus, 'Category': labels},
    columns=['Document', 'Category'],
)
corpus_df

## Preprocessing Code

In [None]:
wordpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')


def normalize_document(doc):
    """Return `doc` reduced to lower-case, stop-word-free, letter-only tokens.

    Steps: delete every character that is neither an ASCII letter nor
    whitespace, lower-case and strip the result, tokenize it with `wordpt`,
    drop tokens found in `stop_words`, and join what is left with single
    spaces.

    Args:
        doc: the text to normalize.

    Returns:
        The normalized text as a single string.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I|re.A` would be taken as a cap
    # on the number of substitutions and the flags would never be applied.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = wordpt.tokenize(doc)
    # Keep only the tokens that are not stop words.
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)


# Element-wise version of normalize_document for arrays of documents.
normalize_corpus = np.vectorize(normalize_document)

## Vocabulary 

In [None]:
# Collect webtext.raw(fileid) for every file id in the webtext corpus,
# one list entry per file.
vocab = [webtext.raw(fileid) for fileid in webtext.fileids()]

In [None]:
# Fit a Keras tokenizer on the corpus and take its word -> integer id map.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(corpus)
word2id = tokenizer.word_index

# Add an extra 'PAD' entry with id 0, then build the reverse id -> word map.
word2id['PAD'] = 0
id2word = {index: word for word, index in word2id.items()}

# Every document as a list of word ids.
wids = [
    [word2id[token] for token in text.text_to_word_sequence(document)]
    for document in corpus
]

vocab_size = len(word2id)  # number of entries in word2id, 'PAD' included
embed_size = 100
window_size = 2

## Target-word to context-word pairs and labels

In [None]:
# One skipgrams() result per document; element 0 holds the word-id pairs and
# element 1 the matching labels.
# NOTE(review): the window is hard-coded to 10 here although a `window_size`
# variable is set to 2 elsewhere in the notebook - confirm which is intended.
skip_grams = [
    skipgrams(word_ids, vocabulary_size=vocab_size, window_size=10)
    for word_ids in wids
]
# Pairs and labels generated for the first document.
pairs, labels = skip_grams[0][0], skip_grams[0][1]

# Skip Gram Model Architecture 

In [None]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

class skipgram(nn.Module):
    """Score a (u, v) pair of word ids with two embedding tables.

    The u-embedding and the v-embedding of the two ids are multiplied
    element-wise, passed through a linear layer with a single output, and
    squashed with a sigmoid.
    """

    def __init__(self, vocab_size, embedding_dim=100):
        """Create the two embedding tables and the scoring layer.

        Args:
            vocab_size: number of rows in each embedding table.
            embedding_dim: width of every embedding vector.
        """
        super().__init__()
        self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.lin = nn.Linear(embedding_dim, 1)

    def forward(self, u_pos, v_pos):
        """Return the sigmoid score for the pair (u_pos, v_pos).

        Args:
            u_pos: word id looked up in `u_embeddings`.
            v_pos: word id looked up in `v_embeddings`.

        Returns:
            The squeezed sigmoid output; a 0-dim tensor for scalar ids.
        """
        # Build the indices directly as int64 rather than via a float32
        # tensor, which cannot represent every large integer exactly.
        u_index = torch.tensor([u_pos], dtype=torch.long)
        v_index = torch.tensor([v_pos], dtype=torch.long)
        embed_u = self.u_embeddings(u_index)
        embed_v = self.v_embeddings(v_index)
        score = self.lin(torch.mul(embed_u, embed_v))
        # torch.sigmoid replaces the deprecated F.sigmoid. The debug prints
        # that used to sit here ran on every single forward pass.
        return torch.sigmoid(score).squeeze()

# Binary cross-entropy criterion, the skip-gram model sized to the
# vocabulary, and a plain SGD optimizer over the model's parameters.
loss_function = nn.BCELoss()
model = skipgram(vocab_size)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

## Train Model

In [None]:
# Train with one SGD step per (target, context) pair and report the summed
# loss once per epoch.
for epoch in range(1, 10):
    tloss = 0.0
    for elem in skip_grams:
        # elem[0] is the list of word-id pairs, elem[1] the matching labels.
        # Zipping them directly also copes with a document that has no pairs.
        for (target, context), label in zip(elem[0], elem[1]):
            # Clear the gradients of the previous step so that every update
            # uses only the gradient of the current pair.
            optimizer.zero_grad()
            p = model(int(target), int(context)).unsqueeze(-1)
            # Train against the pair's own label rather than a constant 1,
            # so that pairs labelled 0 are not treated as positives.
            loss = loss_function(p, torch.tensor([float(label)]))
            loss.backward()
            optimizer.step()
            # .item() yields a plain float, so the running total does not
            # hold on to the autograd graph of every step.
            tloss += loss.item()
    print('Epoch:', epoch, '\tLoss:', tloss)

In [None]:
# Look up the u-embedding of every word id 0..vocab_size-1 in a single call;
# the (1, vocab_size) index tensor gives a (1, vocab_size, dim) result.
weights = model.u_embeddings(torch.tensor([list(range(vocab_size))], dtype=torch.long))
# Label every row with the word belonging to that row's id. id2word holds
# 'PAD' (id 0) as its last entry, so its values cannot be used positionally.
pd.DataFrame(weights.view(-1, weights.shape[-1]).tolist(),
             index=[id2word[idx] for idx in range(vocab_size)]).head(10)

In [None]:
# Flatten to one embedding per row, whatever the embedding width is.
weights = weights.view(-1, weights.shape[-1])
# Drop row 0 (the 'PAD' entry) so that row r holds the vector of word id
# r + 1; the -1 / +1 shifts below rely on exactly that layout.
word_vectors = weights.detach().numpy()[1:]
distance_matrix = euclidean_distances(word_vectors)

# For each search term, the five nearest words by Euclidean distance; the
# first argsort entry is skipped as it is the term itself at distance 0.
similar_words = {search_term: [id2word[int(idx)] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                   for search_term in ['is', 'fox', 'and', 'brown', 'lazy']}

similar_words