In [73]:
#importing libraries
# importing libraries
from tensorflow.keras.preprocessing import text
from tensorflow.keras.utils import to_categorical  # replaces np_utils
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import pad_sequences
import numpy as np
import pandas as pd


In [75]:
ata = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. 
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
"""

# Tokenization
dl_data = data.split()


In [77]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 75
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [79]:
# Generating (context word, target/label word) pairs
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x, y)
            
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        #print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argmax(y[0])])
    
        if i == 10:
            break
        i += 1

In [81]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
import tensorflow as tf  # Import TensorFlow for reduce_mean

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(embed_size,)))  # Corrected line
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.build((None, window_size * 2))  # Explicitly build the model with the expected input shape
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print(cbow.summary())


None


In [89]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()


Epoch: 1 	Loss: 431.1688380241394

Epoch: 2 	Loss: 430.8469548225403

Epoch: 3 	Loss: 428.9790954589844

Epoch: 4 	Loss: 427.1851363182068

Epoch: 5 	Loss: 425.669237613678



In [90]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,0.029655,0.022873,-0.015262,-0.023544,-0.054531,0.005922,0.02601,0.022788,-0.063148,0.02118,...,-0.041188,0.027907,0.007141,-0.028975,0.025704,0.03939,-0.016344,-0.042272,0.011556,0.021207
networks,0.041737,0.026283,0.004711,0.058627,0.033102,-0.000229,-0.022815,-0.007544,0.019047,0.048183,...,-0.053341,0.026997,-0.030934,-0.056822,-0.033432,0.028579,0.041693,-0.011602,-0.045532,-0.038753
neural,0.006106,0.030483,-0.0095,0.042318,0.048228,0.045328,0.036498,0.004088,-0.040926,0.033485,...,-0.014414,0.022468,-0.006091,0.006012,0.0274,0.034097,0.00519,-0.046414,-0.005619,0.019854
and,-0.012251,0.034479,-0.033184,0.039768,0.004223,0.022133,-0.022103,-0.027806,-0.031485,0.036963,...,0.04404,-0.008037,0.002038,0.006562,0.040219,-0.021924,-0.018883,0.04244,-0.041223,0.017965
as,-0.007139,0.007471,-0.021031,0.012144,0.047927,-0.020941,-0.004091,0.013818,-0.042243,0.021575,...,0.032584,-0.00747,-0.025956,-0.004092,-0.023335,0.031788,0.025074,-0.012865,-0.043434,0.000664


In [91]:
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['deep']}

similar_words

(74, 74)


{'deep': ['representation', 'reinforcement', 'on', 'vision', 'speech']}