Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 1) using the below steps:

a. Data preparation

b. Generate training data

c. Train model

d. Output

In [49]:
#imports

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [50]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.optimizers import SGD
from keras.utils import to_categorical
from sklearn.metrics.pairwise import euclidean_distances

In [51]:
# data = 'One morning as Gregor Samsa was waking up from his anxious dreams, he discovered that in his bed, he had been changed into a monsterous verminous bug. '

In [52]:
import pandas as pd
data=pd.read_csv('C:\\Users\\kshit\\Downloads\\file.txt',sep=" ",header=None)
final_data=""
print(data.shape)
for i in range(data.shape[1]):
    final_data+=data.iloc[0][i]+" "
print(final_data)

(1, 27)
One morning as Gregor Samsa was waking up from his anxious dreams, he discovered that in his bed, he had been changed into a monsterous verminous bug. 


In [53]:
words = final_data.split()

In [54]:
words = [x.lower().replace(",","").replace(".","") for x in words]

In [55]:
words

['one',
 'morning',
 'as',
 'gregor',
 'samsa',
 'was',
 'waking',
 'up',
 'from',
 'his',
 'anxious',
 'dreams',
 'he',
 'discovered',
 'that',
 'in',
 'his',
 'bed',
 'he',
 'had',
 'been',
 'changed',
 'into',
 'a',
 'monsterous',
 'verminous',
 'bug']

In [56]:
vocab = set(words)

In [57]:
vocab_size = len(vocab)
vocab_size

25

In [58]:
word_to_idx = {word : i for i, word in enumerate(vocab)}
idx_to_word = {i : word for i, word in enumerate(vocab)}

In [59]:
word_to_idx

{'as': 0,
 'he': 1,
 'dreams': 2,
 'bug': 3,
 'verminous': 4,
 'was': 5,
 'into': 6,
 'samsa': 7,
 'gregor': 8,
 'changed': 9,
 'monsterous': 10,
 'a': 11,
 'from': 12,
 'anxious': 13,
 'bed': 14,
 'his': 15,
 'been': 16,
 'discovered': 17,
 'in': 18,
 'waking': 19,
 'one': 20,
 'up': 21,
 'had': 22,
 'morning': 23,
 'that': 24}

In [60]:
idx_to_word

{0: 'as',
 1: 'he',
 2: 'dreams',
 3: 'bug',
 4: 'verminous',
 5: 'was',
 6: 'into',
 7: 'samsa',
 8: 'gregor',
 9: 'changed',
 10: 'monsterous',
 11: 'a',
 12: 'from',
 13: 'anxious',
 14: 'bed',
 15: 'his',
 16: 'been',
 17: 'discovered',
 18: 'in',
 19: 'waking',
 20: 'one',
 21: 'up',
 22: 'had',
 23: 'morning',
 24: 'that'}

In [61]:
# generate context-word pairs
context_window = 2
embed_dims = 100

In [62]:
x_train=[]
y_train=[]

for index, i in enumerate(words):
    
    print("INDEX and i is",index,i)
    # edge case
    if index <=context_window-1 or index >= len(words)-context_window:
        continue
        
    #sliding context window
    start = index - context_window
    end = index + context_window +1
    
    #here, 2 words left and 2 words right of target word
    context = words[start:end]
    context.remove(i)
    target=i
    
    #data
    x_train.append([word_to_idx[w] for w in context])
    y_train.append([word_to_idx[target]])
print(x_train)
print(y_train)

INDEX and i is 0 one
INDEX and i is 1 morning
INDEX and i is 2 as
INDEX and i is 3 gregor
INDEX and i is 4 samsa
INDEX and i is 5 was
INDEX and i is 6 waking
INDEX and i is 7 up
INDEX and i is 8 from
INDEX and i is 9 his
INDEX and i is 10 anxious
INDEX and i is 11 dreams
INDEX and i is 12 he
INDEX and i is 13 discovered
INDEX and i is 14 that
INDEX and i is 15 in
INDEX and i is 16 his
INDEX and i is 17 bed
INDEX and i is 18 he
INDEX and i is 19 had
INDEX and i is 20 been
INDEX and i is 21 changed
INDEX and i is 22 into
INDEX and i is 23 a
INDEX and i is 24 monsterous
INDEX and i is 25 verminous
INDEX and i is 26 bug
[[20, 23, 8, 7], [23, 0, 7, 5], [0, 8, 5, 19], [8, 7, 19, 21], [7, 5, 21, 12], [5, 19, 12, 15], [19, 21, 15, 13], [21, 12, 13, 2], [12, 15, 2, 1], [15, 13, 1, 17], [13, 2, 17, 24], [2, 1, 24, 18], [1, 17, 18, 15], [17, 24, 15, 14], [24, 18, 14, 1], [18, 15, 1, 22], [15, 14, 22, 16], [14, 1, 16, 9], [1, 22, 9, 6], [22, 16, 6, 11], [16, 9, 11, 10], [9, 6, 10, 4], [6, 11, 4, 3

In [63]:
x_train = np.array(x_train)
y_train = np.array(y_train)

In [64]:
y_train = to_categorical(y_train, num_classes=vocab_size)

In [65]:
# model building
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_dims, input_length=context_window*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_dims,)))
cbow.add(Dense(vocab_size, activation='softmax'))

In [66]:
cbow.compile(loss='categorical_crossentropy', optimizer=SGD(), metrics=['accuracy'])

In [67]:
cbow.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 100)            2500      
                                                                 
 lambda_2 (Lambda)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 25)                2525      
                                                                 
Total params: 5,025
Trainable params: 5,025
Non-trainable params: 0
_________________________________________________________________


In [68]:
cbow.fit(x_train, y_train, epochs=5000, batch_size=256)

Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000
Epoch 72/5000
E

<keras.callbacks.History at 0x29f61431640>

In [69]:
# preds = cbow.predict(x_train)



In [70]:
# preds.shape

(23, 25)

In [71]:

# pred = preds.argmax(axis=1) 
# pred.shape
# print(pred)

[ 0  8  7  5 19 21 12 15 13  2  1 15 24  1 15 14  1 22 16  9  6 11 10]


In [82]:
# i=0
# print(x_train)
context_words=input("give existing words for context").split()
#convert words to index

index_words=[]
for i in context_words:
    index_words.append(word_to_idx[i])
pred2=cbow.predict([index_words])
# print(pred2)
pred_index=pred2.argmax(axis=1)
# print(pred_index)
print(idx_to_word[pred_index[0]])

# for context in x_train:
#     print("...............................")
# print("===========Context :===========")
# for w in context_words:
# #     print(idx_to_word[w])
# print("=========Predicted Target :====")
# print(idx_to_word[pred[i]])
#     i+=1

give existing words for contextone morning gregor samsa
as
