In [2]:
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import pandas as pd

In [3]:
#with open("datasets/CBOW.txt", "r", encoding="utf-8") as f:
#    data = f.read()
data = """But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?"""

dl_data = data.split()

In [4]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(dl_data)

words2id=tokenizer.word_index
words2id['PAD']=0

id2words={v:k for k,v in words2id.items()}

wids=[[words2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size=len(words2id)
embed_size=100
window_size=2

print("Vocabulary size: ", vocab_size)
print("Vocabulary Sample: ", list(words2id.items())[:10])

Vocabulary size:  102
Vocabulary Sample:  [('to', 1), ('of', 2), ('pleasure', 3), ('pain', 4), ('a', 5), ('the', 6), ('who', 7), ('but', 8), ('and', 9), ('or', 10)]


In [5]:
def generate_context_word_pair(corpus, window_size, vocab_size):
    context_length=window_size*2

    for words in corpus:
        sentence_length = len(words)

        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                 for i in range(start, end)
                                 if 0 <=i <sentence_length
                                 and i!= index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen = context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x,y)

In [6]:
cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x:K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))


cbow.compile(loss="categorical_crossentropy", optimizer="rmsprop")
print(cbow.summary())



None


In [7]:
for epoch in range(1,6):
    loss=0
    i=0

    for x,y in generate_context_word_pair(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i+=1
        loss+=cbow.train_on_batch(x,y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:',epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 773.108389377594

Epoch: 2 	Loss: 769.4349451065063

Epoch: 3 	Loss: 763.7685389518738

Epoch: 4 	Loss: 759.4288697242737

Epoch: 5 	Loss: 756.4103488922119



In [8]:
weights=cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2words.values())[1:]).head()

(101, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,0.000945,0.007332,-0.017109,-0.008793,0.0422,0.006562,0.047575,-0.04892,-0.001053,0.021806,...,-0.021545,0.018038,-0.003941,0.049054,-0.044174,-0.044231,-0.000124,0.036273,0.016982,-0.034461
pleasure,0.048054,0.012615,-0.035738,-0.025877,0.031358,-0.023437,-0.028893,0.03314,-0.019365,-0.009109,...,-0.024537,-0.005974,-0.006251,0.030114,0.048228,0.004865,-0.010876,-0.026926,0.022205,-0.024876
pain,-0.038475,0.038592,0.041136,-0.012256,-0.002713,-0.047016,0.024464,-0.030772,0.028342,-0.011487,...,-0.039035,-0.015259,0.013181,-0.021164,-0.011017,-0.014501,0.044841,0.034023,0.030266,-0.04178
a,-0.043775,0.013458,0.02986,0.048538,0.017152,0.020422,0.029465,-0.020121,-0.019172,-0.013137,...,-0.021189,-0.04751,-0.047732,0.007861,-0.044197,-0.037552,-0.022611,0.024863,-0.008149,-0.02317
the,-0.025888,0.00298,0.043403,0.001816,-0.009291,-0.019799,0.042928,0.038877,-0.034306,-0.043,...,0.004512,-0.043576,0.005758,0.048588,-0.016311,-0.036759,-0.020628,-0.0272,-0.045592,-0.04256


In [9]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

(101, 101)


In [11]:
inwords = input()

similar_words={ search_term: [id2words[idx] for idx in distance_matrix[words2id[search_term]-1].argsort()[0:6]]
              for search_term in {inwords}}
print(similar_words)

 teachings


{'teachings': ['actual', 'dislikes', 'not', 'rationally', 'nor', 'which']}
