In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences
from keras.src.utils import np_utils
import numpy as np
import pandas as pd

In [2]:
data = """Deep learning (also known as deep structured learning) is part of a
broader family of machine learning methods based on artificial neural networks
with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks,
deep reinforcement learning, recurrent neural networks, convolutional neural networks and
Transformers have been applied to fields including computer vision, speech recognition,
natural language processing, machine translation, bioinformatics, drug design,
medical image analysis, climate science, material inspection and board game programs,
where they have produced results comparable to and in some cases surpassing human expert performance.
"""

dl_data=data.split()

In [14]:
tokenizer =text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id=tokenizer.word_index

word2id['PAD']=0
id2word={v:k for k,v in word2id.items()}
wids=[[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]
         
vocab_size=len(word2id)
embed_size=100
window_size=2

print('Vovablury Size: ',vocab_size)
print('Vocablury Sample: ',list(word2id.items())[:10])

Vovablury Size:  75
Vocablury Sample:  [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [18]:
def generate_context_words(corpus,window_size,vocab_size):
    context_length=window_size*2
    
    for words in corpus:
        sentence_length=len(words)
        for word,index in enumerate(words):
            context_words=[]
            label_words=[]
            start=index-window_size
            end=index+window_size+1
            context_words.append([
                words[i]
                for i in range(start,end)
                if 0 <= i <sentence_length
                and i!=index
]            )
            label_words.append(words)

            x=pad_sequences(context_words,maxlen=context_length)
            y=np_utils.to_categorical(label_words,vocab_size)
            yield(x,y)

i=0

for x,y in generate_context_words(corpus=wids,window_size=window_size,vocab_size=vocab_size):
    if 0 not in x[0]:
        
        if i==10:
            break
        i+=1
        

In [22]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense,Flatten,Embedding,Lambda

cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size,output_dim=embed_size,input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x,axis=1),output_shape=(embed_size,)))
cbow.add(Dense(vocab_size,activation='relu'))
cbow.compile(loss='categorical_crossentropy',optimizer='rmsprop')

cbow.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 100)            7500      
                                                                 
 lambda_2 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 75)                7575      
                                                                 
Total params: 15075 (58.89 KB)
Trainable params: 15075 (58.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
for epoch in range(1,6):
    loss=0.
    i=0
    
    for x, y in generate_context_words(corpus=wids,window_size=window_size,vocab_size=vocab_size):
        i+=1
        loss+=cbow.train_on_batch(x,y)
        
        if i % 100000 ==0:
            print('Processed {} (context_word) pairs',format(i))
    print('Epoch:',epoch,'\tLoss:',loss)
#     print()

InvalidArgumentError: Graph execution error:

Detected at node categorical_crossentropy/cond/remove_squeezable_dimensions/cond/Squeeze defined at (most recent call last):
  File "C:\Users\Dell\anaconda3\lib\runpy.py", line 197, in _run_module_as_main

  File "C:\Users\Dell\anaconda3\lib\runpy.py", line 87, in _run_code

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>

  File "C:\Users\Dell\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start

  File "C:\Users\Dell\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start

  File "C:\Users\Dell\anaconda3\lib\asyncio\base_events.py", line 601, in run_forever

  File "C:\Users\Dell\anaconda3\lib\asyncio\base_events.py", line 1905, in _run_once

  File "C:\Users\Dell\anaconda3\lib\asyncio\events.py", line 80, in _run

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute

  File "C:\Users\Dell\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell

  File "C:\Users\Dell\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2863, in run_cell

  File "C:\Users\Dell\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in _run_cell

  File "C:\Users\Dell\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\Dell\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in run_cell_async

  File "C:\Users\Dell\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3309, in run_ast_nodes

  File "C:\Users\Dell\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code

  File "C:\Users\Dell\AppData\Local\Temp\ipykernel_21444\3055759913.py", line 7, in <cell line: 1>

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\engine\training.py", line 2763, in train_on_batch

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1377, in train_function

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1360, in step_function

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1349, in run_step

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1127, in train_step

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1185, in compute_loss

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\losses.py", line 143, in __call__

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\losses.py", line 263, in call

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\utils\losses_utils.py", line 209, in squeeze_or_expand_dimensions

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\utils\losses_utils.py", line 204, in <lambda>

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\utils\losses_utils.py", line 155, in remove_squeezable_dimensions

  File "C:\Users\Dell\anaconda3\lib\site-packages\keras\src\utils\losses_utils.py", line 157, in <lambda>

Can not squeeze dim[2], expected a dimension of 1, got 75
	 [[{{node categorical_crossentropy/cond/remove_squeezable_dimensions/cond/Squeeze}}]] [Op:__inference_train_function_1293]

In [32]:
weights=cbow.get_weights()[0]
weights=weights[1:]
print(weights.shape)

pd.DataFrame(weights,index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
deep,-0.019836,0.035901,-0.037689,-0.043634,-0.003655,0.03884,0.027283,-0.013693,-0.010249,0.001092,0.043523,-0.049723,0.025804,0.042881,0.032975,-0.022084,0.033041,-0.003166,0.030577,-0.009308,0.014399,0.018277,0.024595,-0.041799,-0.008205,-0.009811,-0.045999,0.003028,-0.048329,0.009343,0.034637,0.00293,0.039431,-0.034458,-0.039591,0.014165,0.006138,0.035324,0.040022,0.018841,-0.049872,0.038758,-0.037766,0.01749,0.034356,-0.002149,-0.040055,-0.048888,-0.049978,0.033784,-0.001637,-0.018569,-0.028933,0.044266,-0.02896,0.047482,-0.028468,-0.048813,0.044663,0.024354,-0.045253,-0.049627,0.01272,-0.048894,-0.016077,0.016247,0.031659,0.023376,-0.003065,0.041534,-0.040173,0.01152,0.01082,-0.016647,0.035602,0.037392,-0.016338,-0.046125,0.030527,0.0484,-0.040301,0.045313,-0.021557,0.026521,-7.5e-05,0.03481,-0.015468,-0.004885,0.000473,-0.049026,-0.037189,-0.035134,-0.017806,0.041377,0.049775,-0.035659,0.028839,-0.030406,-0.025793,0.018365
networks,0.00572,0.016231,0.049594,-0.028568,-0.042277,0.000279,-0.051359,0.047919,0.01603,-0.028038,-0.012239,0.022803,0.004344,-0.003842,-0.056391,0.006244,-0.002506,-0.032568,-0.033393,0.059245,0.057607,0.022649,0.015031,-0.050173,-0.015436,0.063541,0.029025,-0.018873,-0.024845,0.021696,-0.021525,-0.023892,-0.013428,-0.02873,0.024717,0.021778,-0.052787,-0.011139,-0.004403,0.016791,0.038254,0.013332,0.007147,-0.025811,-0.03899,-0.056724,0.040159,-0.051886,0.025847,-0.044599,-0.065946,0.001907,0.039436,0.017643,-0.045387,-0.031101,0.003298,0.044277,0.018405,-0.025416,-0.011625,0.011224,-0.048548,0.065258,0.020879,-0.041625,0.000127,-0.064158,0.038533,0.011017,-0.046571,-0.059844,-0.021362,0.044651,0.037217,-0.007762,-0.006987,-0.019348,0.034827,-0.068607,-0.006009,0.052724,-0.017666,0.027001,0.044444,-0.008179,-0.057694,0.03973,0.025904,0.007628,-0.033238,0.010173,-0.007707,0.036942,0.037498,0.030019,0.016355,-0.021899,-0.028389,-0.042821
neural,0.002713,-0.032696,0.039049,0.0391,0.013145,0.032703,-0.002355,0.020846,-0.027544,-0.022652,-0.039298,0.021408,-0.001152,0.000627,-0.040411,-0.027999,-0.04809,0.044496,-0.001525,-0.027806,-0.003458,-0.036425,0.048567,0.012877,-0.028299,0.028496,-0.017757,-0.021154,0.048874,-0.008383,0.014118,0.04967,0.039892,-0.041856,0.046396,0.040845,0.003689,-0.020222,-0.046909,-0.015277,0.0342,0.027597,0.027587,0.015738,0.046009,0.012805,0.043899,-0.039498,0.004138,-0.03555,0.015528,-0.015486,0.010307,-0.008183,0.018442,0.008571,0.039896,0.01237,-0.005839,-0.009616,-0.0449,-0.025565,0.020509,0.04535,0.036147,-0.008209,0.009581,0.024765,-0.042853,-0.015782,-0.021512,0.047695,-0.047484,0.048873,-0.037857,-0.018315,0.018537,0.048381,-0.028365,0.013377,0.019258,-0.008675,0.014591,-0.013699,0.003204,-0.027527,0.009591,0.044377,-0.008335,0.031633,-0.003067,-0.006781,0.036851,-0.042338,-0.037236,-0.019051,-0.02343,-0.000844,-0.033484,-0.045682
and,-0.012596,0.02434,0.031935,-0.009512,-0.040552,0.042698,0.02398,-0.033016,-0.044623,0.028805,-0.000516,0.026436,-0.046534,-0.023277,-0.049617,0.003876,0.009252,0.005645,0.049066,-0.006778,-0.037316,0.038057,0.032607,-0.001337,0.045195,0.044072,0.013456,-0.008257,-0.001914,0.010204,-0.006449,0.035152,0.037239,-0.007844,0.044627,-0.005111,0.045143,-0.005084,-0.04003,-0.001087,-0.031373,0.013372,0.032305,0.00258,-0.00428,-0.033212,-0.023419,-0.006482,0.023295,0.033789,-0.032961,0.028044,-0.005653,0.003819,-0.020265,-0.032478,-0.048869,0.045396,-0.019904,0.019718,-0.037766,0.018224,0.043657,0.039685,-0.001934,-0.034051,0.028093,-0.005655,0.045471,-0.040398,0.026695,-0.020202,-0.037254,-0.045913,0.047823,-0.02613,0.018752,0.031545,-0.011799,-0.014549,-0.009017,-0.016945,-0.0286,-0.007899,-0.030165,0.033877,0.020627,-0.042876,0.046135,0.009899,-0.024637,0.027692,0.011174,-0.027279,0.046649,-0.018214,0.030586,0.04467,-0.002029,0.030041
as,0.036352,-0.012028,-0.011811,0.022789,0.02755,-0.035723,-0.023171,0.016023,0.031478,0.044546,-0.013018,0.03513,0.029794,-0.045795,-0.015462,-0.03313,-0.043919,-0.028301,-0.002372,-0.031995,0.017874,0.02553,0.040708,-0.005505,0.00575,0.014674,-0.044473,-0.03708,-0.046647,-0.00993,0.028568,0.038796,-0.044831,-0.031606,-0.019775,-0.019183,-0.035083,0.028777,-0.002709,-0.003444,0.036109,0.009713,0.016128,-0.028429,-0.028146,-0.034147,-0.023825,-0.006465,0.022248,0.015696,-0.021688,0.009493,-0.001289,0.022878,-0.018367,-0.003684,-0.016164,-0.042252,-0.004893,-0.026732,-0.02966,0.015896,0.006497,-0.007862,-0.021772,-0.011312,-0.040107,0.045146,-0.001495,-0.017547,-0.026699,-0.034726,0.001247,-0.028771,-0.023828,-0.002326,0.002531,-0.017407,0.033132,0.017024,0.014378,0.003083,-0.032275,0.036595,-0.04484,-0.030885,0.023727,-0.018164,-0.041512,0.006804,0.02538,0.034046,0.044578,-0.049485,0.045974,0.03297,-0.004853,0.011186,0.017504,-0.01358


In [33]:
from sklearn.metrics.pairwise import euclidean_distances
distance_matrix=euclidean_distances(weights)
print(distance_matrix.shape)

similiar_words={search_term:[id2word[idx]for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
               for search_term in ['deep']
               }

similiar_words

(74, 74)


{'deep': ['belief', 'have', 'family', 'inspection', 'semi']}