In [16]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import numpy as np
import pandas as pd

In [17]:
data=open('sample.txt','r')


In [19]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 9092
Vocabulary Sample: [('0', 1), ('05', 2), ('2', 3), ('4', 4), ('026785005', 5), ('5', 6), ('7', 7), ('014616322', 8), ('8', 9), ('3', 10)]


In [20]:
#generating (context word, target/label word) pairs
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)
            
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        # print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

In [21]:
#model building
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print(cbow.summary())

# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot

# SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, rankdir='TB').create(prog='dot', format='svg'))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 100)            909200    
                                                                 
 lambda_1 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 9092)              918292    
                                                                 
Total params: 1,827,492
Trainable params: 1,827,492
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 0.0

Epoch: 2 	Loss: 0.0

Epoch: 3 	Loss: 0.0

Epoch: 4 	Loss: 0.0

Epoch: 5 	Loss: 0.0



In [23]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(9091, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5,-0.048595,0.042882,0.001355,-0.00998,0.011347,-0.03004,-0.030531,0.017747,-0.004096,0.029786,...,0.031022,0.006113,-0.027673,0.047356,0.045592,-0.020552,0.018189,0.000852,-0.020196,-0.032795
2,-0.01057,0.003634,0.042399,0.0301,-0.000139,0.013415,0.034569,-0.043842,0.01281,-0.013149,...,0.000379,0.02233,0.028164,-0.036036,0.0017,0.034555,-0.000623,-0.025676,0.024497,0.017256
4,0.034295,-0.004926,-0.048317,-0.021204,0.021309,-0.01219,0.041694,0.038737,-0.03095,-0.040524,...,0.011588,0.003402,-0.037186,-0.042222,0.026007,-0.009855,0.015427,-0.048022,-0.04021,-0.045412
26785005,0.048453,0.04882,-0.01388,-0.048218,-0.021308,-0.000733,0.040685,0.022525,-0.026965,0.025442,...,0.038422,0.036993,-0.027438,0.012715,-0.003953,0.028972,0.005908,0.039267,0.032524,-0.048374
5,-0.020131,-0.037853,0.049698,0.033552,0.040174,-0.004757,-0.01089,0.034913,0.045555,-0.007623,...,0.009905,0.007745,-0.03632,0.023387,0.040451,-0.022704,-0.006798,0.019903,-0.018671,-0.029075
