### Building CBOW Model

In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
text = """How that personage haunted my dreams, I need scarcely tell you. On
stormy nights, when the wind shook the four corners of the house and
the surf roared along the cove and up the cliffs, I would see him in a
thousand forms, and with a thousand diabolical expressions. Now the leg
would be cut off at the knee, now at the hip, now he was a monstrous
kind of a creature who had never had but the one leg, and that in the
middle of his body. To see him leap and run and pursue me over hedge and
ditch was the worst of nightmares. And altogether I paid pretty dear for
my monthly fourpenny piece, in the shape of these abominable fancies"""

In [3]:
text = text.replace(',','').replace('.','').lower().split()

In [4]:
corpus = set(text)
corpus_length = len(corpus)

In [5]:
word_dict = {}
inverse_word_dict = {}
for i ,word in enumerate(corpus):
    word_dict[word] = i
    inverse_word_dict[i] = word

In [6]:
word_dict

{'corners': 0,
 'pretty': 1,
 'at': 2,
 'had': 3,
 'house': 4,
 'never': 5,
 'four': 6,
 'scarcely': 7,
 'fancies': 8,
 'nightmares': 9,
 'pursue': 10,
 'diabolical': 11,
 'who': 12,
 'tell': 13,
 'now': 14,
 'me': 15,
 'need': 16,
 'kind': 17,
 'nights': 18,
 'piece': 19,
 'his': 20,
 'of': 21,
 'monthly': 22,
 'over': 23,
 'cliffs': 24,
 'thousand': 25,
 'and': 26,
 'altogether': 27,
 'body': 28,
 'run': 29,
 'when': 30,
 'middle': 31,
 'on': 32,
 'one': 33,
 'off': 34,
 'to': 35,
 'wind': 36,
 'shape': 37,
 'cut': 38,
 'these': 39,
 'for': 40,
 'along': 41,
 'knee': 42,
 'hip': 43,
 'dear': 44,
 'creature': 45,
 'ditch': 46,
 'he': 47,
 'haunted': 48,
 'leg': 49,
 'leap': 50,
 'personage': 51,
 'i': 52,
 'be': 53,
 'my': 54,
 'surf': 55,
 'forms': 56,
 'you': 57,
 'hedge': 58,
 'how': 59,
 'was': 60,
 'paid': 61,
 'with': 62,
 'in': 63,
 'stormy': 64,
 'worst': 65,
 'a': 66,
 'would': 67,
 'fourpenny': 68,
 'dreams': 69,
 'the': 70,
 'cove': 71,
 'that': 72,
 'shook': 73,
 'roared':

In [7]:
inverse_word_dict

{0: 'corners',
 1: 'pretty',
 2: 'at',
 3: 'had',
 4: 'house',
 5: 'never',
 6: 'four',
 7: 'scarcely',
 8: 'fancies',
 9: 'nightmares',
 10: 'pursue',
 11: 'diabolical',
 12: 'who',
 13: 'tell',
 14: 'now',
 15: 'me',
 16: 'need',
 17: 'kind',
 18: 'nights',
 19: 'piece',
 20: 'his',
 21: 'of',
 22: 'monthly',
 23: 'over',
 24: 'cliffs',
 25: 'thousand',
 26: 'and',
 27: 'altogether',
 28: 'body',
 29: 'run',
 30: 'when',
 31: 'middle',
 32: 'on',
 33: 'one',
 34: 'off',
 35: 'to',
 36: 'wind',
 37: 'shape',
 38: 'cut',
 39: 'these',
 40: 'for',
 41: 'along',
 42: 'knee',
 43: 'hip',
 44: 'dear',
 45: 'creature',
 46: 'ditch',
 47: 'he',
 48: 'haunted',
 49: 'leg',
 50: 'leap',
 51: 'personage',
 52: 'i',
 53: 'be',
 54: 'my',
 55: 'surf',
 56: 'forms',
 57: 'you',
 58: 'hedge',
 59: 'how',
 60: 'was',
 61: 'paid',
 62: 'with',
 63: 'in',
 64: 'stormy',
 65: 'worst',
 66: 'a',
 67: 'would',
 68: 'fourpenny',
 69: 'dreams',
 70: 'the',
 71: 'cove',
 72: 'that',
 73: 'shook',
 74: 'roar

In [8]:
data = []
for i in range(2,len(text)-2):
    sentence = [text[i-2],text[i-1],text[i+1],text[i+2]]
    target = text[i]
    data.append((sentence,target))
data[0]

(['how', 'that', 'haunted', 'my'], 'personage')

In [9]:
embedding_length = 20

In [19]:
class CBOW(torch.nn.Module):
    def __init__(self, corpus_length, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(corpus_length,
        embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 64)
        self.linear2 = nn.Linear(64, corpus_length)
        self.activation_function1 = nn.ReLU()
        self.activation_function2 = nn.LogSoftmax(dim = -1)
    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out
    def get_word_emdedding(self, word):
        word = torch.LongTensor([word_dict[word]])
        return self.embeddings(word).view(1,-1)

In [20]:
model = CBOW(corpus_length, embedding_length)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
def make_sentence_vector(sentence, word_dict):
    idxs = [word_dict[w] for w in sentence]
    return torch.tensor(idxs, dtype=torch.long)
print(make_sentence_vector(['stormy','nights','when','the'], word_dict))

tensor([64, 18, 30, 70])


In [21]:
for epoch in range(100):
    epoch_loss = 0
    for sentence, target in data:
        model.zero_grad()
        sentence_vector = make_sentence_vector(sentence, word_dict)
        log_probs = model(sentence_vector)
        loss = loss_function(log_probs, torch.tensor([word_dict[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.data
    print('Epoch: '+str(epoch)+', Loss: ' + str(epoch_loss.item()))

Epoch: 0, Loss: 529.4085693359375
Epoch: 1, Loss: 475.45147705078125
Epoch: 2, Loss: 435.5121154785156
Epoch: 3, Loss: 398.7772216796875
Epoch: 4, Loss: 362.43096923828125
Epoch: 5, Loss: 325.6566162109375
Epoch: 6, Loss: 288.5445556640625
Epoch: 7, Loss: 251.77365112304688
Epoch: 8, Loss: 216.1964111328125
Epoch: 9, Loss: 182.65499877929688
Epoch: 10, Loss: 152.080810546875
Epoch: 11, Loss: 125.14311981201172
Epoch: 12, Loss: 101.84101104736328
Epoch: 13, Loss: 82.38603973388672
Epoch: 14, Loss: 66.56572723388672
Epoch: 15, Loss: 53.88785934448242
Epoch: 16, Loss: 43.94063949584961
Epoch: 17, Loss: 36.15391159057617
Epoch: 18, Loss: 30.191171646118164
Epoch: 19, Loss: 25.556697845458984
Epoch: 20, Loss: 21.962766647338867
Epoch: 21, Loss: 19.133319854736328
Epoch: 22, Loss: 16.87236785888672
Epoch: 23, Loss: 15.041474342346191
Epoch: 24, Loss: 13.523797988891602
Epoch: 25, Loss: 12.269386291503906
Epoch: 26, Loss: 11.197855949401855
Epoch: 27, Loss: 10.292583465576172
Epoch: 28, Loss:

In [22]:
def get_predicted_result(input, inverse_word_dict):
    index = np.argmax(input)
    return inverse_word_dict[index]
def predict_sentence(sentence):
    sentence_split = sentence.replace('.','').lower().split()
    sentence_vector = make_sentence_vector(sentence_split, word_dict)
    prediction_array = model(sentence_vector).data.numpy()
    print('Preceding Words: {}\n'.format(sentence_split[:2]))
    print('Predicted Word: {}\n'.format(get_predicted_result(prediction_array[0], inverse_word_dict)))
    print('Following Words: {}\n'.format(sentence_split[2:]))
predict_sentence('to see leap and')

Preceding Words: ['to', 'see']

Predicted Word: him

Following Words: ['leap', 'and']



In [23]:
print(model.get_word_emdedding('leap'))

tensor([[ 1.8213, -1.1196,  0.6845,  0.8898,  1.5934,  1.3590, -2.8483, -0.2130,
          0.9974, -0.5111, -0.2763, -1.2185,  0.0490,  1.4361, -0.7034, -0.3596,
         -1.4976,  1.0774,  0.0057,  0.4883]], grad_fn=<ViewBackward0>)
