In [2]:
from gensim.test.utils import common_texts

from gensim.models import Word2Vec

model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model

<gensim.models.word2vec.Word2Vec at 0x1c8f34948e0>

<gensim.models.word2vec.Word2Vec at 0x1c891d48b50>

## Word Embeddings

https://neptune.ai/blog/word-embeddings-guide

In [9]:
raw_sentence = "This introduced a neural network architecture approach that laid the foundation for many current approaches. "
#making a list of all characters without the spaces between
word_list = " ".join(raw_sentence).split()
#making the list elements unique
word_list = list(set(word_list))
#iterate the word_list, to make a dictionary as map
word2id = {w: i for i, w in enumerate(word_list)}
id2word = {i: w for i, w in enumerate(word_list)}
n_class = len(word2id)


{'h': 0,
 'k': 1,
 'c': 2,
 'a': 3,
 '.': 4,
 'u': 5,
 'l': 6,
 'p': 7,
 'w': 8,
 'm': 9,
 'i': 10,
 't': 11,
 'T': 12,
 's': 13,
 'n': 14,
 'd': 15,
 'y': 16,
 'o': 17,
 'e': 18,
 'f': 19,
 'r': 20}

In [4]:
import torch
import torch.nn as nn
m = 0
n_step = 0
n_hidden = 0

class NNLM(nn.Module):
   def __init__(self):
       super(NNLM, self).__init__()
       self.embeddings = nn.Embedding(n_class, m) #embedding layer or look up table

       self.hidden1 = nn.Linear(n_step * m, n_hidden, bias=False)
       self.ones = nn.Parameter(torch.ones(n_hidden))

       self.hidden2 = nn.Linear(n_hidden, n_class, bias=False)
       self.hidden3 = nn.Linear(n_step * m, n_class, bias=False) #final layer

       self.bias = nn.Parameter(torch.ones(n_class))

   def forward(self, X):
       X = self.embeddings(X) # embeddings
       X = X.view(-1, n_step * m) # first layer
       tanh = torch.tanh(self.d + self.hidden1(X)) # tanh layer
       output = self.b + self.hidden3(X) + self.hidden2(tanh) # summing up all the layers with bias
       return output

['h',
 'k',
 'c',
 'a',
 '.',
 'u',
 'l',
 'p',
 'w',
 'm',
 'i',
 't',
 'T',
 's',
 'n',
 'd',
 'y',
 'o',
 'e',
 'f',
 'r']

## CBOW

In [10]:
def CBOW(raw_text, window_size=2):
   data = []
   for i in range(window_size, len(raw_text) - window_size):
       context = [raw_text[i - window_size], raw_text[i - (window_size - 1)], raw_text[i + (window_size - 1)], raw_text[i + window_size]]
       target = raw_text[i]
       data.append((context, target))

   return data

In [11]:
CBOW(raw_sentence)

[(['T', 'h', 's', ' '], 'i'),
 (['h', 'i', ' ', 'i'], 's'),
 (['i', 's', 'i', 'n'], ' '),
 (['s', ' ', 'n', 't'], 'i'),
 ([' ', 'i', 't', 'r'], 'n'),
 (['i', 'n', 'r', 'o'], 't'),
 (['n', 't', 'o', 'd'], 'r'),
 (['t', 'r', 'd', 'u'], 'o'),
 (['r', 'o', 'u', 'c'], 'd'),
 (['o', 'd', 'c', 'e'], 'u'),
 (['d', 'u', 'e', 'd'], 'c'),
 (['u', 'c', 'd', ' '], 'e'),
 (['c', 'e', ' ', 'a'], 'd'),
 (['e', 'd', 'a', ' '], ' '),
 (['d', ' ', ' ', 'n'], 'a'),
 ([' ', 'a', 'n', 'e'], ' '),
 (['a', ' ', 'e', 'u'], 'n'),
 ([' ', 'n', 'u', 'r'], 'e'),
 (['n', 'e', 'r', 'a'], 'u'),
 (['e', 'u', 'a', 'l'], 'r'),
 (['u', 'r', 'l', ' '], 'a'),
 (['r', 'a', ' ', 'n'], 'l'),
 (['a', 'l', 'n', 'e'], ' '),
 (['l', ' ', 'e', 't'], 'n'),
 ([' ', 'n', 't', 'w'], 'e'),
 (['n', 'e', 'w', 'o'], 't'),
 (['e', 't', 'o', 'r'], 'w'),
 (['t', 'w', 'r', 'k'], 'o'),
 (['w', 'o', 'k', ' '], 'r'),
 (['o', 'r', ' ', 'a'], 'k'),
 (['r', 'k', 'a', 'r'], ' '),
 (['k', ' ', 'r', 'c'], 'a'),
 ([' ', 'a', 'c', 'h'], 'r'),
 (['a', 'r

In [None]:
class CBOW_Model(torch.nn.Module):
   def __init__(self, vocab_size, embedding_dim):
       super(CBOW_Model, self).__init__()

       self.embeddings = nn.Embedding(vocab_size, embedding_dim)
       self.linear1 = nn.Linear(embedding_dim, 128)
       self.activation_function1 = nn.ReLU()

       self.linear2 = nn.Linear(128, vocab_size)


   def forward(self, inputs):
       embeds = sum(self.embeddings(inputs)).view(1,-1)
       out = self.linear1(embeds)
       out = self.activation_function1(out)
       out = self.linear2(out)
       return out