In [1]:
from keras.preprocessing import text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
import pandas as pd
import numpy as np

In [2]:
# --- Corpus ---------------------------------------------------------------
corpus = ['Today is a good day for taking a walk']
print(f'Corpus Length: {len(corpus)}')

# --- Vocabulary -----------------------------------------------------------
# Fit a Keras tokeniser on the corpus (a list of strings).
tokeniser = Tokenizer()
tokeniser.fit_on_texts(corpus)

# API reminders:
#   tokeniser.word_index          -> {word: id} dictionary of the unique words
#   text.text_to_word_sequence    -> split one string into its tokens
#   tokeniser.texts_to_sequences  -> turn texts into lists of token ids

word2id = tokeniser.word_index   # token -> id
print(word2id)
id2word = dict((idx, word) for word, idx in word2id.items())   # id -> token

# One slot more than the number of distinct words (the printed ids start at 1).
vocab_size = len(word2id) + 1

# --- Hyper-parameters -----------------------------------------------------
embed_size = 10   # dimensionality of each embedding vector
epochs = 40       # number of passes over the corpus

# --- Tokenised / numeric views of the corpus ------------------------------
# Split every document into tokens, then look each token up in word2id.
tokens = [list(text.text_to_word_sequence(doc)) for doc in corpus]
numerical_id = [[word2id[tok] for tok in doc_tokens] for doc_tokens in tokens]

print('\ntokens', tokens, sep='\n')
print('\nnumerical representation of tokens', numerical_id, sep='\n')

Corpus Length: 1
{'a': 1, 'today': 2, 'is': 3, 'good': 4, 'day': 5, 'for': 6, 'taking': 7, 'walk': 8}

tokens
[['today', 'is', 'a', 'good', 'day', 'for', 'taking', 'a', 'walk']]

numerical representation of tokens
[[2, 3, 1, 4, 5, 6, 7, 1, 8]]


In [3]:
''' Define SG Model '''
# The network has two separate integer inputs: a word id and a context id.
# Each goes through its own embedding table; the two vectors are then compared.

from tensorflow.keras.layers import Dot, Dense, Reshape, Embedding
from tensorflow.keras.models import Sequential, Model

# Word branch: id -> embedding row, then flatten [1, embed_size] -> [embed_size]
word_model = Sequential([
    Embedding(vocab_size, embed_size, input_length=1),
    Reshape((embed_size,)),
])

# Context branch: same shape as the word branch, but with its own weights
context_model = Sequential([
    Embedding(vocab_size, embed_size, input_length=1),
    Reshape((embed_size,)),
])

pair_inputs = [word_model.input, context_model.input]
pair_vectors = [word_model.output, context_model.output]

# Dot product of the two embedding vectors, passed through a single
# sigmoid unit to give one score per (word, context) pair.
similarity = Dot(axes=1)(pair_vectors)
model_arch = Dense(1, activation="sigmoid")(similarity)

model = Model(pair_inputs, model_arch)
model.compile(optimizer="rmsprop", loss="mean_squared_error")

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 embedding_input (InputLayer)   [(None, 1)]          0           []                               
                                                                                                  
 embedding_1_input (InputLayer)  [(None, 1)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 10)        90          ['embedding_input[0][0]']        
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 10)        90          ['embedding_1_input[0][0]']      
                                                                                              

In [4]:
lst_loss = []  # summed training loss, one entry per epoch

# Tokenising the corpus does not depend on the epoch, so do it once
# instead of repeating it on every pass.
sequences = tokeniser.texts_to_sequences(corpus)

for epoch in range(epochs):

    loss = 0.0  # loss accumulated over all documents in this epoch

    for doc in sequences:

        # Build the training samples for this document.
        #   data   - list of [word_id, context_id] pairs
        #   labels - per the Keras skipgrams docs: 1 for a pair taken from the
        #            same window, 0 for a randomly drawn negative pair
        # skipgrams is called inside the epoch loop, so the pairs are
        # re-drawn (and re-shuffled) on every epoch.
        data, labels = skipgrams(sequence=doc,
                                 vocabulary_size=vocab_size,
                                 window_size=2,
                                 shuffle=True)

        # Transpose the pair list into two parallel arrays:
        # x[0] holds the word ids, x[1] the context ids.
        x = [np.array(column) for column in zip(*data)]
        y = np.array(labels, dtype=np.int32)

        # x is an empty list when no pairs were generated; skip such documents.
        if x:
            loss += model.train_on_batch(x, y)

    lst_loss.append(loss)

In [5]:
print('Last Iteration Word Embedding:')

# Take the embedding layer directly from the word branch. This is the same
# layer that sits at model.layers[2], but it does not depend on the order in
# which the combined model happens to list its layers.
word_embed_layer = word_model.layers[0]
print('vocab size:', vocab_size)

# Full embedding table: one row per id in [0, vocab_size).
embedding_matrix = word_embed_layer.get_weights()[0]
# A bare `.shape` expression in the middle of a cell is never displayed,
# so print it explicitly.
print('embedding matrix shape:', embedding_matrix.shape)

# Row 0 belongs to the id that no word uses (word ids start at 1), so drop it.
weights = embedding_matrix[1:]

# Label each remaining row by looking its id up explicitly, so that row k
# is always paired with the word whose id is k + 1.
pd.DataFrame(weights, index=[id2word[i] for i in range(1, vocab_size)]).head()

Last Iteration Word Embedding:
vocab size: 9


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
a,-0.011746,0.006677,-0.013652,0.00986,-0.039301,-0.041331,0.010881,-0.011189,0.014499,0.026029
today,0.015985,0.009962,0.034722,-0.03414,-0.011411,-0.050048,0.052077,-0.06704,-0.00349,0.021565
is,0.06974,-0.066058,-0.012894,0.023048,0.006557,0.048443,-0.070513,0.004748,-0.017071,-0.018088
good,-0.019644,-0.045336,-0.011896,-0.058437,0.046689,-0.043507,0.011198,0.026951,-0.003062,0.0198
day,-0.04694,-0.071056,-0.0058,-0.033562,-0.058224,0.01128,0.023172,0.018498,-0.012863,-0.001334
