**Name** : Rasika Rajendra Ghadge

**Roll No** : BEITB91

**PRN** : 72139663B

**Class** : BE IT B

### Aim
Implement the Continuous Bag of Words (CBOW) Model. Stages can be: 
1. Data preparation 
2. Generate training data 
3. Train model 
4. Output  

### Import required libraries

In [None]:
# import os
# os.environ["PATH"] += os.pathsep + "/usr/bin/dot"

In [19]:
from keras.preprocessing import text
from keras.src.utils.np_utils import to_categorical
from keras.preprocessing import sequence

data=open('.\corona.txt','r')

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
word2id = tokenizer.word_index

# build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

print("wids", wids)

Vocabulary Size: 103
Vocabulary Sample: [('the', 1), ('of', 2), ('influenza', 3), ('covid', 4), ('19', 5), ('virus', 6), ('for', 7), ('transmission', 8), ('is', 9), ('to', 10)]
wids []


#### Function that generates pairs of the context words and the target words

In [14]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x, y)
            
            
# Test this out for some samples
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    print(wids)
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1
        

### Build the model

In [4]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# build CBOW architecture
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            10300     
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 103)               10403     
                                                                 
Total params: 20703 (80.87 KB)
Trainable params: 20703 (80.87 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


### Train the model

In [17]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    print("wids", wids)
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        print("working")
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

wids []
wids []
wids []
wids []
wids []


### Get word embeddings

In [6]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

import pandas as pd
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(102, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,-0.012557,-0.000489,-0.007461,-0.027918,0.030727,0.042697,-0.012129,-0.034516,-0.016602,-0.042653,...,0.038842,-0.044343,0.023791,0.012583,-0.033218,0.025297,-0.032556,0.005456,0.009999,0.027172
influenza,-0.007105,-0.040333,-0.024819,0.004046,-0.03186,0.00546,-0.034681,-0.043269,0.020203,0.016686,...,0.029064,0.036673,-0.009119,-0.00132,-0.011682,0.00655,0.044756,-0.049734,0.010074,0.049638
covid,0.046671,-0.029772,-0.017857,-0.047143,0.034568,-0.001688,0.033959,0.026106,0.045899,0.041789,...,0.035479,0.006482,0.018229,0.003817,-0.009278,-0.035821,-0.049479,0.036283,-0.023617,0.02172
19,-0.020916,-0.014346,0.014118,-0.025752,0.002652,-0.016447,0.047481,-0.027271,0.035732,0.031675,...,0.026499,-0.035445,0.016975,0.028401,-0.045361,0.041227,0.028149,-0.01644,-0.027975,0.049865
virus,0.04484,-0.029251,-0.014245,-0.018827,0.038892,0.022432,-0.006766,0.014069,-0.016519,-0.017933,...,-0.00668,0.026598,0.00094,0.022429,0.020573,-0.049636,-0.049444,-0.041092,0.027222,0.030606


### Similar words

In [7]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['virus', 'influenza', 'covid']}

similar_words

(102, 102)


{'virus': ['be', 'hours', 'an', '6', 'appear'],
 'influenza': ['6', 'there', 'this', 'secondary', '19'],
 'covid': ['two', 'there', 'cases', 'driver', 'not']}