
## Implement the Continuous Bag of Words (CBOW) Model for the given (textual document) using the below steps:
    a. Data preparation
    b. Generate training data
    c. Train model
    d. Output

In [1]:
data ="""But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain 
was born and I will give you a complete account of the system, and expound the actual teachings 
of the great explorer of the truth, the master-builder of human happiness. No one rejects, 
dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know 
how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again 
is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, 
but because occasionally circumstances occur in which toil and pain can procure him some great 
pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, 
except to obtain some advantage from it? But who has any right to find fault with a man who 
chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that 
produces no resultant pleasure?"""

data = data.split()

In [2]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)

word2id = tokenizer.word_index
word2id['PAD'] = 0

id2word = {v:k for k,v in word2id.items()}
wids = tokenizer.texts_to_sequences(data)

emb_size = 100
window_size = 2
vocab_size = len(word2id)

In [25]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


In [26]:
def cbow_model(corpus,vocab_size, window_size):
    context_length = window_size*2
    for words in corpus:
        sequences_size = len(words)
        for index,word in enumerate(words):
            context_word = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1
            context_word.append([words[i]
                               for i in range(start,end)
                               if 0<=i <sequences_size
                               and i!=index])
            label_word.append(word)
            
            x = pad_sequences(context_word,context_length)
            y = to_categorical(label_word,vocab_size)
            yield(x,y)
            
 

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Lambda
import keras.backend as K

In [28]:
cbow = Sequential([
    Embedding(vocab_size,emb_size,input_length = window_size*2),
    Lambda(lambda x:K.mean(x,axis=1)),
    Dense(vocab_size,activation = 'softmax')
])

cbow.compile(loss='categorical_crossentropy', optimizer='adam')
cbow.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 4, 100)            10200     
                                                                 
 lambda_3 (Lambda)           (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 102)               10302     
                                                                 
Total params: 20502 (80.09 KB)
Trainable params: 20502 (80.09 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
for epochs in range(5):
    loss  = 0
    for x,y in cbow_model(corpus=wids,vocab_size = vocab_size,window_size=window_size):
        loss += cbow.train_on_batch(x,y)
    print("Epochs {} - Loss -> {}".format(epochs,loss))

2023-11-09 23:06:45.484826: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epochs 0 - Loss -> 776.0845975875854
Epochs 1 - Loss -> 762.3211803436279
Epochs 2 - Loss -> 751.9022114276886
Epochs 3 - Loss -> 743.8112494945526
Epochs 4 - Loss -> 739.9156000614166


In [30]:
import pandas as pd
weights = cbow.get_weights()[0][:]
# pd.DataFrame(weights,index=word2id.keys())

In [31]:
### from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
data = pd.DataFrame(distance_matrix,index=word2id.keys())
data.columns = word2id.keys()

data

Unnamed: 0,to,of,pleasure,pain,a,the,who,but,and,or,...,find,fault,with,man,chooses,enjoy,annoying,produces,resultant,PAD
to,0.000000,0.807442,0.771928,0.793368,0.810595,0.788211,0.822126,0.831010,0.783233,0.807467,...,0.786745,0.806996,0.813065,0.825786,0.842597,0.795304,0.831060,0.838407,0.821537,0.812710
of,0.807442,0.000000,0.371872,0.384705,0.397729,0.372196,0.430555,0.394984,0.383982,0.382689,...,0.423424,0.413547,0.416344,0.426392,0.404667,0.397039,0.404023,0.383512,0.406284,0.354068
pleasure,0.771928,0.371872,0.000000,0.402437,0.453952,0.387958,0.434515,0.397301,0.405038,0.425939,...,0.415794,0.401628,0.429888,0.423150,0.434478,0.390972,0.412803,0.426508,0.434454,0.375738
pain,0.793368,0.384705,0.402437,0.000000,0.386233,0.423150,0.408687,0.392622,0.404882,0.423993,...,0.431200,0.418583,0.365058,0.427527,0.366061,0.387408,0.389884,0.429357,0.409372,0.390907
a,0.810595,0.397729,0.453952,0.386233,0.000000,0.436693,0.389560,0.419061,0.371066,0.443719,...,0.454170,0.409596,0.373956,0.427422,0.432662,0.402466,0.431611,0.432856,0.438219,0.416665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
enjoy,0.795304,0.397039,0.390972,0.387408,0.402466,0.364225,0.401218,0.375372,0.364863,0.409672,...,0.389308,0.413979,0.406275,0.357450,0.409172,0.000000,0.374033,0.387794,0.380821,0.408744
annoying,0.831060,0.404023,0.412803,0.389884,0.431611,0.406568,0.435757,0.343474,0.374915,0.401215,...,0.426050,0.372907,0.388981,0.414288,0.383504,0.374033,0.000000,0.404567,0.377813,0.385093
produces,0.838407,0.383512,0.426508,0.429357,0.432856,0.425198,0.429420,0.428291,0.427918,0.449116,...,0.408207,0.436525,0.410676,0.418588,0.401932,0.387794,0.404567,0.000000,0.417281,0.409681
resultant,0.821537,0.406284,0.434454,0.409372,0.438219,0.420402,0.417078,0.416733,0.383024,0.390505,...,0.434330,0.432945,0.393465,0.397161,0.407885,0.380821,0.377813,0.417281,0.000000,0.398061


In [32]:
def SearchWord(WordList):
    similar_words ={}
    for search_term in WordList:
        if(search_term in word2id.keys()):
            similar_words[search_term]=[id2word[idx] for idx in 
                                        distance_matrix[word2id[search_term]-1].argsort()[0:5]+1] 
    return similar_words



In [33]:
SearchWord(['enjoy'])

{'enjoy': ['enjoy', 'us', 'laborious', 'all', 'teachings']}