# Skipgram Implementation in Keras

<b>Author Name: Hongxiang Yang</b>

<b>Email: hongxiangy@student.unimelb.edu.au</b>

<b>Python version used: Python 3.5.4 :: Anaconda, Inc.</b>

In [2]:
import string
import time
import keras
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.corpus import brown
from collections import Counter
from keras.models import Model
from keras.layers import Input, Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams
from tensorflow.python.client import device_lib
import warnings

Using TensorFlow backend.


## Training the model with GPU
(Environment setup: https://github.com/senior88oqz/dlwin.git)

In [3]:
# Suppress every Python warning for the rest of the session so that cell
# output stays readable.
warnings.filterwarnings("ignore")
# Print the compute devices TensorFlow reports, to check whether a GPU is listed.
print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5485759225754551552
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 1557040332
locality {
  bus_id: 1
}
incarnation: 13003114433564137188
physical_device_desc: "device: 0, name: GeForce GTX 950, pci bus id: 0000:01:00.0"
]


## Functions to clean and preprocess raw text data for training

In [3]:

def get_unk(word, vocab):
    """
    Map an out-of-vocabulary word to the 'UNK' placeholder.

    :param word: token to look up
    :param vocab: container supporting ``in`` membership tests
    :return: ``word`` unchanged when it is in ``vocab``, otherwise 'UNK'
    """
    if word in vocab:
        return word
    return 'UNK'


# Requires the NLTK data packages 'stopwords' and 'punkt' (nltk.download()).
def preprocessing(text, vocab_size):
    """
    Tokenize, lowercase and filter raw text, then cap the vocabulary size.

    English stop words and single punctuation characters are dropped. The
    ``vocab_size - 1`` most frequent remaining words are kept; every other
    word is replaced by the placeholder 'UNK'.

    :param text: string
    :param vocab_size: int, maximum vocabulary size including 'UNK'
    :return: processed token(list), word_count(Counter), vocab(list)
             ``vocab`` is ordered by decreasing frequency, with 'UNK'
             inserted at position 0 when it is present.
    """
    start_time = time.time()
    stop_words = set(stopwords.words('english'))
    punctuations = set(string.punctuation)
    exclusions = stop_words.union(punctuations)
    lowered = (word.lower() for word in word_tokenize(text))
    processed = [word for word in lowered if word not in exclusions]
    frequencies = Counter(processed)
    vocab = [word for word, _ in frequencies.most_common(max(vocab_size - 1, 0))]
    # 'UNK' must be part of the vocabulary whenever at least one token will be
    # replaced by it; otherwise the later word -> index lookup raises KeyError.
    # The original token-count condition is kept so existing indices are unchanged.
    has_unknown_words = len(frequencies) > len(vocab)
    if vocab_size < len(processed) or has_unknown_words:
        vocab.insert(0, 'UNK')
    # Set membership keeps the replacement pass linear in the number of tokens.
    known_words = set(vocab)
    processed = [get_unk(word, known_words) for word in processed]
    word_count = Counter(processed)
    print('Preprocessing done in %s seconds' % ((time.time() - start_time)))
    return processed, word_count, vocab


def get_lookup_tables(vocab):
    """
    Build the two-way mapping between vocabulary words and their positions.

    :param vocab: iterable of words; a word's position is its index
    :return: word_index(dict), index_word(dict)
             If a word occurs more than once, ``word_index`` keeps its first
             position while ``index_word`` still has an entry for every position.
    """
    entries = list(enumerate(vocab))
    index_word = dict(entries)
    word_index = {}
    for position, token in entries:
        if token not in word_index:
            word_index[token] = position
    return word_index, index_word


def get_indexed_text(tokens, word_index):
    """
    Translate a token sequence into the matching sequence of integer indices.

    :param tokens: text tokens obtained from the preprocessing step
    :param word_index: mapping from word to integer index
    :return: list with one index per token, in the original order
    :raises KeyError: if a token has no entry in ``word_index``
    """
    indexed_text = []
    for token in tokens:
        indexed_text.append(word_index[token])
    return indexed_text


def build_data_set(text, vocab_size):
    """
    Run the full preparation pipeline: preprocess the text, build the lookup
    tables and convert the tokens to indices.

    :param text: string
    :param vocab_size: int
    :return: indexed_text(list), word_count(Counter), word_index(dict), index_word(dict)
    """
    tokens, word_count, vocab = preprocessing(text, vocab_size)
    lookups = get_lookup_tables(vocab)
    word_index, index_word = lookups
    return get_indexed_text(tokens, word_index), word_count, word_index, index_word

## Hyper Parameter Setup

In [4]:
# Vocabulary size; also used below as the sampling-table size and the
# embedding layer's input dimension.
vocab_size = 3000
# Context window size passed to keras' skipgrams().
window_size = 5
# Dimensionality of each word embedding vector.
vec_dim = 300
# Number of training epochs passed to model.fit().
epochs = 10
# Mini-batch size passed to model.fit().
batch_size = 512

In [7]:
# Join the Brown corpus words into one space-separated string, because the
# preprocessing step takes raw text and tokenizes it itself.
raw_text = ' '.join(brown.words()[:])
# Clean and index the text, keeping the token counts and both lookup tables.
indexed_text, word_count, word_index, index_word = build_data_set(raw_text, vocab_size)

Preprocessing done in 23.17365074157715 seconds


## Data setup for keras

In [8]:
# Subsampling words from the text:
# word_sample_table[i] is the probability of sampling the i-th most common word
# (more common -> lower probability).
word_sample_table = sequence.make_sampling_table(vocab_size)

# Generate (target, context) index pairs with a 0/1 label each;
# negative_samples=2 requests two negative pairs per positive pair.
word_pairs, labels = skipgrams(indexed_text, vocab_size, shuffle=True, negative_samples=2,
                               window_size=window_size, sampling_table=word_sample_table)

# Convert the pair list in a single pass instead of unpacking every pair as a
# separate argument via zip(*word_pairs); reshape(-1, 2) also keeps an empty
# pair list from failing.
pair_array = np.asarray(word_pairs, dtype="int32").reshape(-1, 2)
word_target = np.ascontiguousarray(pair_array[:, 0])
word_context = np.ascontiguousarray(pair_array[:, 1])
del pair_array
# Labels are handed to model.fit as an explicit array rather than a Python list.
labels = np.asarray(labels, dtype="int32")

## Building the training models with Keras

In [9]:
# Input layers: one word index for the target, one for the context.
input_target = Input((1,), name='target_input')
input_context = Input((1,), name='context_input')

# Embedding layer shared by both inputs; each index becomes a vec_dim vector,
# reshaped to a (vec_dim, 1) column.
embedding = Embedding(vocab_size, vec_dim, input_length=1, name='embedding')
target = embedding(input_target)
# The layer name keeps its original spelling so it still matches the saved summary.
target = Reshape((vec_dim, 1), name='taget_reshape')(target)
context = embedding(input_context)
context = Reshape((vec_dim, 1), name='context_reshape')(context)

# Dot product of the two vectors; normalize=True makes it a cosine similarity.
dot_product = keras.layers.dot([target, context], axes=1,
                               name='dot_product', normalize=True)
dot_product = Reshape((1,), name='dot_product_reshape')(dot_product)
# Sigmoid output layer: the similarity becomes a probability for the 0/1 pair label.
output = Dense(1, activation='sigmoid', name='sigmoid')(dot_product)
# Primary training model. Keras 2 expects `inputs=` / `outputs=`; the singular
# `input=` / `output=` keywords are deprecated Keras 1 spellings.
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()
# Intermediate model returning the cosine similarity between two words.
validation_model = Model(inputs=[input_target, input_context], outputs=dot_product)
# Intermediate model mapping a word index to its dense embedding,
# which can be used in many other NLP tasks.
vec_model = Model(inputs=input_target, outputs=target)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
target_input (InputLayer)        (None, 1)             0                                            
____________________________________________________________________________________________________
context_input (InputLayer)       (None, 1)             0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 1, 300)        900000      target_input[0][0]               
                                                                   context_input[0][0]              
____________________________________________________________________________________________________
taget_reshape (Reshape)          (None, 300, 1)        0           embedding[0][0]         

## Training the model

In [10]:
# Train on every generated pair: validation_split=0 holds nothing out, and
# verbose=2 logs one line per epoch.
model.fit([word_target, word_context], labels,
          validation_split=0,
          batch_size=batch_size, epochs=epochs, verbose=2)

Epoch 1/10
36s - loss: 0.6022 - acc: 0.6938
Epoch 2/10
35s - loss: 0.5140 - acc: 0.7469
Epoch 3/10
35s - loss: 0.4652 - acc: 0.7830
Epoch 4/10
35s - loss: 0.4276 - acc: 0.8103
Epoch 5/10
35s - loss: 0.3961 - acc: 0.8307
Epoch 6/10
35s - loss: 0.3690 - acc: 0.8464
Epoch 7/10
35s - loss: 0.3453 - acc: 0.8594
Epoch 8/10
35s - loss: 0.3242 - acc: 0.8701
Epoch 9/10
35s - loss: 0.3050 - acc: 0.8792
Epoch 10/10
35s - loss: 0.2883 - acc: 0.8868


<keras.callbacks.History at 0x25c97d389b0>

## Application of the model
- word2vec: produce the word embedding for a given word
- skipgram: given a target word, return the top k most likely words within a predefined window

In [11]:
def word2vec(word, vec_model, word_index=word_index):
    """
    Look up the learned embedding of a single word.

    :param word: vocabulary word
    :param vec_model: model mapping a word index to its embedding
    :param word_index: word -> index mapping (defaults to the notebook's table)
    :return: flattened 1-D array holding the word's embedding
    :raises KeyError: if ``word`` is not a key of ``word_index``
    """
    token_id = word_index[word]
    batch = np.array([token_id])
    embedded = vec_model.predict_on_batch(batch)
    return embedded.flatten()



def get_k_most_common_context(target, validation_model, top_k,
                              lookups=(word_index, index_word), vocab_size=vocab_size):
    """
    Return the ``top_k`` vocabulary words most similar to ``target``.

    :param target: vocabulary word to compare against every other word
    :param validation_model: model returning the similarity of a
                             (target index, context index) pair
    :param top_k: number of words to return
    :param lookups: tuple of (word_index, index_word) dictionaries
    :param vocab_size: number of vocabulary indices to score (0 .. vocab_size - 1)
    :return: list of the ``top_k`` most similar words, best first,
             never including ``target`` itself
    :raises KeyError: if ``target`` is not a key of ``word_index``
    """
    word_index, index_word = lookups
    target_indx = word_index[target]
    # Score the target against the whole vocabulary in one batched call rather
    # than one model call per word.
    context_indx = np.arange(vocab_size).reshape(-1, 1)
    target_batch = np.full_like(context_indx, target_indx)
    similarities = np.asarray(
        validation_model.predict_on_batch([target_batch, context_indx])).reshape(-1)
    ranked = (-similarities).argsort()
    # Exclude the target by index instead of assuming it is ranked first.
    nearest = ranked[ranked != target_indx][:top_k]
    out = [index_word[word] for word in nearest]
    print("%d Nearest to %s :" % (top_k, target), out)
    return out

In [18]:
# Fetch the learned embedding for 'milk' and show its length and values.
vec = word2vec('milk', vec_model)
print("dimension: ", len(vec))
print(vec)

dimension:  300
[ 0.0023099  -0.0469316   0.26281762  0.04457152  0.15659074 -0.19045512
 -0.14653884 -0.16654626 -0.16516408  0.05719168  0.0065063  -0.18198052
 -0.03592599 -0.04285887  0.04855128 -0.07246569 -0.14164248  0.00507005
 -0.06912898 -0.25567499 -0.08014426  0.13297312 -0.06913213 -0.13081819
 -0.15918715  0.15340981 -0.23020993  0.17676426  0.02816044 -0.29095277
  0.14475267  0.01372892  0.06072622  0.13990726 -0.36064938 -0.19122009
  0.05285533  0.26722857  0.17350371 -0.28393173  0.08968197 -0.3094438
  0.33192924  0.02098076  0.0808959  -0.20219873 -0.00732423 -0.2924397
  0.10726909  0.06473857 -0.12077001  0.40374848  0.12536813 -0.08836307
  0.48754469  0.03483732 -0.0346945   0.03701563  0.04345041  0.0367719
  0.10236261 -0.25089979 -0.04918829  0.04756913 -0.06396046 -0.26823333
  0.11190939 -0.05368825 -0.05699518  0.14960416  0.1264948   0.15372215
  0.05643407  0.30918989  0.08186588  0.37794787  0.05138092  0.21816196
 -0.1275795   0.07869629 -0.05768437  

In [17]:
# Print and return the 8 vocabulary words ranked most similar to 'milk'.
get_k_most_common_context('milk', validation_model, top_k=8)

8 Nearest to white milk : ["'s", 'breakfast', '``', 'providing', 'put', 'production', 'feed', 'foods']


["'s", 'breakfast', '``', 'providing', 'put', 'production', 'feed', 'foods']