In [2]:
!pip install np_utils



You should consider upgrading via the 'c:\users\saura\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [3]:
from keras.preprocessing import text
from keras.utils import to_categorical  # Fix the import statement
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [4]:
# Load the textual document
with open('CBOW.txt', 'r') as file:
    data = file.read()

In [5]:
dl_data = data.split()

In [11]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100  # Adjust the embedding size if needed
window_size = 2  # Adjust the window size if needed

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 103
Vocabulary Sample: [('the', 1), ('of', 2), ('influenza', 3), ('covid', 4), ('19', 5), ('virus', 6), ('for', 7), ('transmission', 8), ('is', 9), ('to', 10)]


In [13]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <= i < sentence_length
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x, y)

i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])

        if i == 10:
            break
        i += 1

In [14]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print(cbow.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            10300     
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 103)               10403     
                                                                 
Total params: 20703 (80.87 KB)
Trainable params: 20703 (80.87 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [15]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()


Epoch: 1 	Loss: 915.5708479881287

Epoch: 2 	Loss: 893.9244341850281

Epoch: 3 	Loss: 878.7238686084747

Epoch: 4 	Loss: 872.4465320110321

Epoch: 5 	Loss: 868.1396679878235



In [16]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(102, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,-0.034204,0.009385,0.009265,0.035797,-0.04574,-0.046577,-0.025,0.022489,0.002309,-0.00266,...,0.039882,0.00909,0.02917,0.014789,0.028542,0.049953,-0.012223,0.015091,-0.008936,-0.007734
influenza,-0.02486,-0.016607,0.015554,0.039862,-0.044827,0.024425,-0.04303,-0.025791,0.01591,-0.01601,...,-0.024955,0.018732,0.045781,-0.01628,-0.001793,-0.006586,-0.020353,-0.038376,0.007897,-0.010935
covid,-0.042248,-0.019678,0.020309,0.021845,0.017347,-0.009131,-0.005761,0.020527,0.015631,0.031679,...,-0.04562,0.034201,-0.001299,0.000574,0.031591,0.032885,0.020088,0.012573,-0.006289,-0.025572
19,-0.066115,0.123826,0.113197,-0.127321,-0.07526,0.116692,0.075387,0.124175,0.096808,-0.101364,...,0.047573,-0.123332,-0.084015,-0.106566,0.051843,-0.129446,-0.133348,-0.09453,0.062494,0.124289
virus,-0.052228,0.103197,0.121841,-0.076764,0.066333,0.101783,0.08414,-0.080516,0.109187,0.042763,...,0.057668,-0.101084,0.086959,0.067987,-0.064813,-0.043414,-0.129842,-0.134527,-0.097154,-0.06261


In [18]:
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                   for search_term in ['of']}

similar_words

(102, 102)


{'of': ['appear', 'spread', 'one', 'symptoms', 'context']}