## Problem Statement 7
### Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 2) using the below steps:
    a. Data preparation
    b. Generate training data
    c. Train model
    d. Output

In [1]:
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical  # instead of np_utils


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras import backend as K

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd

In [2]:
data="""But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?"""

dl_data=data.split()

## a. Data preparation

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dl_data)

words2id = tokenizer.word_index
words2id['PAD']=0

id2words = {v:k for k,v in words2id.items()}

wids = [[words2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size=len(words2id)
embed_size=100
window_size=2

In [4]:
print("Vocabulary size: ", vocab_size)
print("Vocabulary items: ", list(words2id.items())[:10])

Vocabulary size:  102
Vocabulary items:  [('to', 1), ('of', 2), ('pleasure', 3), ('pain', 4), ('a', 5), ('the', 6), ('who', 7), ('but', 8), ('and', 9), ('or', 10)]


## b. Generating training data

In [5]:
def pairwise(corpus, window_size, vocab_size):
    context_length=window_size*2

    for words in corpus:
        sentence_length=len(words)
        for index, word in enumerate(words):
            context_words=[]
            label_word=[]
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <= i < sentence_length
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield(x,y)

## c. Training the model

In [6]:
cbow = Sequential()

cbow.add(Embedding(input_dim = vocab_size, output_dim = embed_size, input_length = window_size * 2))
cbow.add(Lambda(lambda x:K.mean(x, axis = 1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))

cbow.compile(loss="categorical_crossentropy", optimizer="rmsprop")



In [7]:
print(cbow.summary())

None


In [8]:
for epoch in range(1,3):
    loss=0

    for x,y in pairwise(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        loss += cbow.train_on_batch(x, y)
    print("Epoch: {} Loss: {}".format(epoch,loss))
    print()

Epoch: 1 Loss: 772.9935841560364

Epoch: 2 Loss: 769.6444368362427



In [9]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2words.values())[1:]).head()

(101, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,0.040641,0.011549,0.045073,-0.042532,0.039772,0.040936,0.038031,-0.022964,-0.024844,-0.035216,...,-0.039629,0.033493,-0.039055,0.049727,-0.045233,-0.01163,0.041958,-0.039132,-0.031351,-0.027502
pleasure,0.023873,0.033676,-0.034842,0.013925,-0.031948,-0.024719,-0.026599,-0.028809,0.015129,0.037716,...,-0.039123,0.039247,0.007857,0.036693,-0.024683,-0.006513,-0.00865,0.010335,0.040352,0.016129
pain,0.040286,0.011894,0.038814,0.047646,0.001465,-0.025471,0.001414,0.029952,0.018181,0.020947,...,0.007485,0.037355,0.003129,-0.033676,-0.009741,0.021619,-0.019041,-0.043634,0.037323,0.012385
a,-0.040206,0.035498,0.027908,0.017308,0.048137,0.023147,-0.016166,-0.042422,0.007062,0.039826,...,0.04583,0.037929,-0.047885,-0.029244,-0.022886,-0.040069,-0.004594,-0.000144,0.049407,-0.010357
the,-0.047279,0.022679,-0.021031,-0.029547,-0.024062,0.049753,-0.017282,0.038298,-0.006096,0.027492,...,-0.025104,-0.011023,0.00373,-0.032624,0.021004,-0.017754,0.014316,-0.026512,0.024815,0.000801


## d. Output

In [10]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

(101, 101)


In [12]:
inwords = input()

similar_words={ search_term: [id2words[idx] for idx in distance_matrix[words2id[search_term]-1].argsort()[0:6]]
              for search_term in {inwords}}
similar_words

 mistaken


{'mistaken': ['this', 'denouncing', 'avoids', 'nor', 'no', 'painful']}