In [1]:
from keras.preprocessing import text
from keras.src.utils import np_utils
from keras.preprocessing  import sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [2]:
data = """Deep learning (also known as deep structured learning) is part of a
broader family of machine learning methods based on artificial neural networks
with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks,
deep reinforcement learning, recurrent neural networks, convolutional neural networks and
Transformers have been applied to fields including computer vision, speech recognition,
natural language processing, machine translation, bioinformatics, drug design,
medical image analysis, climate science, material inspection and board game programs,
where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data=data.split()

In [5]:
tokenizer =text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id=tokenizer.word_index

word2id['PAD']=0
id2word={v:k for k,v in word2id.items()}
wids=[[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size=len(word2id)
embed_size=100
window_size=2

print('Vocablury_size:',vocab_size)
print('Vocablury Sample:',list(word2id.items())[:10])

Vocablury_size: 75
Vocablury Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [20]:
def generate_context_word_pairs(corpus,window_size,vocab_size):
    context_length=window_size*2
    
    for words in corpus:
        sentence_length=len(words)
        for index,word in enumerate(words):
            context_words=[]
            label_words=[]
            start=index-window_size
            end=index+window_size+1
            
        context_words.append([
            words[i]
            for i in range(start,end)
            if 0 <= i <sentence_length
            and i!=index]
        )
        
    label_words.append(words)
    
    x=pad_sequences(context_words,maxlen=context_length)
    y=np_utils.to_categorical(label_words,vocab_size)
    yield(x,y)
    
i=0
for x,y in generate_context_word_pairs(corpus=wids,window_size=window_size,vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
     
#       print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    if i==10:
        break
    i+=1
    
    
    

In [25]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense,Embedding,Lambda

cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size,output_dim=embed_size,input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x,axis=1),output_shape=(embed_size,)))
cbow.add(Dense(vocab_size,activation='softmax'))
cbow.compile(loss='categorical_crossentropy',optimizer='rmsprop')

cbow.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 100)            7500      
                                                                 
 lambda_2 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 75)                7575      
                                                                 
Total params: 15075 (58.89 KB)
Trainable params: 15075 (58.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
for epoch in range(1,6):
    loss=0.
    i=0
    
    for x,y in generate_context_word_pairs(corpus=wids,window_size=window_size,vocab_size=vocab_size):
        i+=1
        loss+=cbow.train_on_batch(x,y)
        if i % 100000 ==0:
            print('Processed {} (context_words) pairs',format(i))
    
    print('Epoch:',epoch, '\tLoss:',loss)
    

Epoch: 1 	Loss: 4.323254108428955
Epoch: 2 	Loss: 4.279575347900391
Epoch: 3 	Loss: 4.246826171875
Epoch: 4 	Loss: 4.218571662902832
Epoch: 5 	Loss: 4.192775249481201


In [29]:
weights=cbow.get_weights()[0]
weights=weights[1:]
print(weights.shape)

pd.DataFrame(weights,index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
deep,0.027598,0.025616,-0.019534,-0.027853,-0.011096,-0.041121,0.016038,-0.022138,-0.032354,-0.022345,0.017147,-0.045077,-0.042347,0.017261,-0.007474,0.039669,0.029458,0.015225,0.042418,-0.007902,-0.005385,-0.012055,-0.024799,0.028562,-0.000986,-0.002144,-0.008116,-0.037265,-0.022356,0.032604,0.0278,-0.040006,0.030309,-0.020866,0.034473,-0.021448,0.007463,-0.004708,0.008387,0.035009,0.017402,-0.047634,0.034564,-0.023868,0.039767,0.011631,-0.021325,-0.019272,-0.0201,0.019799,-0.045309,-0.027782,-0.014469,0.011189,-0.018399,0.01696,-0.026299,0.027685,0.013918,0.016753,-0.035251,0.033823,0.025375,-0.046113,0.022424,0.008341,0.044617,0.037197,0.004919,-0.019985,0.033839,0.015802,-0.022419,0.028654,0.033786,-0.014357,0.027045,0.025523,0.045973,0.014387,-0.003611,0.003881,-0.048849,0.034216,-0.033115,0.025962,0.035385,-0.048719,-0.014709,-0.047929,0.045986,0.004856,-0.029819,-0.0275,-0.039103,0.020287,-0.001972,-0.018669,-0.048891,0.005336
networks,-0.021143,0.034413,0.023518,0.018158,0.029001,-0.04018,0.00822,-0.03412,0.013153,-0.012267,-0.044774,-0.005408,0.005398,-0.026861,0.023174,-0.038788,0.03284,0.043487,-0.043686,0.040516,0.013629,-0.022286,-0.024876,0.044458,0.023752,-0.018657,-0.00678,-0.035632,-0.004316,-0.023746,-0.010749,-0.00391,0.038316,0.016347,0.009068,0.016674,-0.010182,-0.005328,0.026933,-0.044618,-0.021981,0.019567,0.027864,0.002257,-0.012498,-0.023915,0.034041,0.006925,-0.045906,-0.011711,0.018082,-0.014443,0.027114,0.016944,0.022856,-0.040289,0.042153,-0.020912,0.000492,0.046719,0.009897,-0.034905,0.007009,0.024478,0.047617,-0.012916,-0.01107,-0.002731,0.048119,0.022313,0.009407,-0.04151,-0.010531,0.014759,-0.029879,0.010744,0.037148,0.044667,-0.046681,-0.04288,-0.012753,-0.035796,-0.017842,-0.025125,0.025954,0.047032,0.004037,0.020972,-0.018513,-0.012364,-0.041676,0.005962,-0.011412,-0.034275,0.03494,-0.020806,0.03942,-0.041171,0.019655,0.0491
neural,-0.006308,0.048166,-0.049243,-0.026706,-0.025683,-0.008417,-0.041196,0.039105,0.042512,0.026013,-0.03112,0.04922,-0.009368,0.010416,-0.03646,-0.008143,-0.022974,-0.038073,-0.045128,-0.039188,0.006305,0.047492,-0.014406,0.016801,-0.022442,-0.032547,0.00919,-0.021149,0.012899,0.020178,0.015686,-0.011128,-0.001215,0.021564,0.049293,0.048292,0.004422,-0.037697,0.031497,0.001866,-0.008429,0.015932,0.04231,0.018364,-0.033493,-0.0437,-0.014271,-0.000804,-0.008042,-0.026013,-0.009877,-0.038834,0.041119,0.044256,-0.044802,-0.00616,0.00138,0.000455,-0.04381,-0.011931,0.015713,-0.030288,0.015932,0.041038,-0.011373,0.003866,-0.048982,0.004547,-0.041228,0.024221,-0.034774,-0.038031,0.041526,0.011007,-0.04025,0.004281,-0.02418,0.038996,-0.033406,0.002894,-0.046947,-0.03614,-0.008591,0.020722,0.003987,-0.012003,-0.000523,-0.00563,0.038407,-0.030189,-0.01247,-0.031172,0.007356,0.020743,-0.048718,-0.00801,-0.002326,0.045148,-0.035585,0.006936
and,0.033406,0.007621,-0.033398,0.034437,0.034295,0.037554,-0.024445,0.034618,0.014313,0.037232,0.011763,0.043804,-0.033539,0.017179,0.009803,0.03364,0.041534,0.028493,-0.008497,0.017475,0.012769,0.032182,0.003994,-0.033022,0.018197,0.034944,-0.005193,-0.012367,-0.010196,-0.012087,0.043449,-0.045064,0.009535,0.010337,0.001436,0.021369,-0.01801,0.034441,-0.022676,-0.000191,0.028924,0.037757,0.043106,0.012056,0.043003,0.049594,0.004212,0.036473,0.04896,0.043888,0.016139,0.025027,-0.002773,-0.041604,-0.04785,0.017779,0.049163,-0.021325,-0.021487,-0.032288,-0.000897,0.039794,0.021943,-0.017102,0.040937,-0.010501,0.039564,0.013987,0.017321,0.021,-0.041007,0.037816,0.010417,-0.008848,0.002212,-0.006911,-0.017929,0.038528,-0.004046,-0.022257,-0.043152,-0.021433,-0.029238,0.00534,0.006018,-0.025428,0.042561,0.001026,-0.0074,-0.004099,-0.043067,-0.017311,0.014901,-0.029218,0.008923,0.039222,0.009595,0.040217,0.04709,0.003496
as,0.026456,-0.00938,-0.007531,0.049292,0.028448,0.025297,0.010735,-0.034378,0.012799,-0.012042,0.031363,-0.020835,-0.033664,0.024213,0.027116,-0.041964,-0.035758,0.034098,0.041815,-0.002341,0.02609,0.038108,0.027559,0.001145,0.037158,0.04318,-0.013027,0.031433,0.002387,0.017673,-0.034465,-0.021738,0.037242,-0.042194,-0.034069,0.048242,0.001004,0.021714,0.030948,0.02498,-0.030603,0.000873,-0.04371,0.020196,-0.011339,0.0231,-0.010103,-0.032175,0.044398,-0.04096,0.01413,0.011797,0.020635,-0.002169,-0.046147,0.027577,-0.028249,-0.042248,-0.039442,0.030436,0.035144,0.002603,-0.009521,0.002839,0.044811,-0.049434,-0.049056,0.037574,0.031684,-0.033714,-0.00592,0.024901,0.014483,-0.041641,-0.049367,-0.000613,0.004838,-0.024374,-0.033999,0.029219,0.034063,0.037006,0.009846,0.030278,0.015962,-0.012003,-0.039093,0.038928,-0.037269,0.022601,-0.036158,0.040125,0.020603,0.04766,-0.024149,-0.018883,-0.008196,0.034286,0.041437,-0.038135


In [30]:
from sklearn.metrics.pairwise import euclidean_distances
distance_matrix=euclidean_distances(weights)
print(distance_matrix.shape)

similiar_wprds={search_term :[id2word[idx]for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
               for search_term in ['deep']
               }
similiar_wprds


(74, 74)


{'deep': ['vision', 'programs', 'broader', 'known', 'family']}