## Skipgrams in Keras

- In this lecture, we will implement Skipgrams in `Keras`.

#### Loading in and preprocessing data
- Load the Alice in Wonderland data in Corpus using Keras utility
- `Keras` has some nice text preprocessing features too!
- Split the text into sentences.
- Use `Keras`' `Tokenizer` to tokenize sentences into words.

In [51]:
# Imports
# Basics
from __future__ import print_function, division
import pandas as pd 
import numpy as np
import random
from IPython.display import SVG
%matplotlib inline

# nltk
from nltk import sent_tokenize

# keras
np.random.seed(13)
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Activation
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text
from keras.utils.vis_utils import model_to_dot 
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# We'll use Alice in Wonderland
path = get_file('carrol-alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
corpus = open(path, encoding='utf-8').read()

In [3]:
# Split document into sentences first
corpus = corpus[corpus.index('\n\n')+2:]
sentences = sent_tokenize(corpus)

base_filter = '!"#$%&()*+,-./:;`<=>?@[\\]^_{|}~\t\n' + "'"
tokenizer = Tokenizer(filters=base_filter)
tokenizer.fit_on_texts(sentences)

sequences = tokenizer.texts_to_sequences(sentences)
nb_samples = sum(len(s) for s in corpus)

print(len(sequences), tokenizer.document_count)

1104 1104


In [4]:
# To understand what is happening;
print(sentences[324])
print(sequences[324])

“Keep your temper,” said the Caterpillar.
[2354, 66, 769, 2, 9, 1, 166]


#### Skipgrams: Generating Input and Output Labels
- Now that we have sentences, and word tokenization, we are in good position to create our training set for skipgrams.
- Now we need to generate our `X_train` and `y_train`

In [5]:
# Let's first see how Keras' skipgrams function works.
couples, labels = skipgrams(sequences[324],
                            len(tokenizer.word_index) + 1,
                            window_size=2, 
                            negative_samples=0, 
                            shuffle=True, 
                            categorical=False, 
                            sampling_table=None)

index_2_word = {val : key for key, val in tokenizer.word_index.items()}

for w1, w2 in couples:
    if w1 == 13:
        print(index_2_word[w1], index_2_word[w2])


In [6]:
# Function to generate the inputs and outputs for all windows
vocab_size = len(tokenizer.word_index) + 1
dim = 100
window_size = 2

def generate_data(sequences, window_size, vocab_size):
    for seq in sequences:
        X, y = [], []
        couples, _ = skipgrams(seq, 
                              vocab_size, 
                              window_size=window_size, 
                              negative_samples=0, 
                              shuffle=True, 
                              categorical=False, 
                              sampling_table=None)
        
        if not couples:
            continue
        
        for in_word, out_word in couples:
            X.append(in_word)
            y.append(np_utils.to_categorical(out_word, vocab_size))
            
        X, y = np.array(X), np.array(y)
        X = X.reshape(len(X), 1)
        y = y.reshape(len(X), vocab_size)
        yield X, y
        
data_generator = generate_data(sequences, window_size, vocab_size)

### Skipgrams: Creating the Model
- Lastly, we create the (shallow) network!

In [7]:
# Create the Keras model and view it 
skipgram = Sequential()
skipgram.add(Embedding(input_dim=vocab_size, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1))
skipgram.add(Reshape((dim,)))
skipgram.add(Dense(input_dim=dim, units=vocab_size, activation='softmax'))
#SVG(model_to_dot(skipgram, show_shapes=True).create(prog='dot', format='svg'))

### Skipgrams: Compiling and Training
- Time to compile and train
- We use crossentropy, common loss for classification

In [8]:
# Compile the Keras Model
from keras.optimizers import SGD

sgd = SGD(lr=1e-4, decay=1e-6, momentum=0.9)

skipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')

for iteration in range(10):
    loss = 0
    for x, y in generate_data(sequences, window_size, vocab_size):
        loss = loss + skipgram.train_on_batch(x, y)
        
    print('iteration {}, loss is {}'.format(iteration, loss))

iteration 0, loss is 8942.210822105408
iteration 1, loss is 8941.840028762817
iteration 2, loss is 8941.469831466675
iteration 3, loss is 8941.099581718445
iteration 4, loss is 8940.72947883606
iteration 5, loss is 8940.359350204468
iteration 6, loss is 8939.989276885986
iteration 7, loss is 8939.618958473206
iteration 8, loss is 8939.248700141907
iteration 9, loss is 8938.878479003906


### Skipgrams: Looking at the vectors

To get word_vectors now, we look at the weights of the first layer.

Let's also write functions giving us similarity of two words.

In [9]:
word_vectors = skipgram.get_weights()[0]

from scipy.spatial.distance import cosine

def get_dist(w1, w2):
    i1, i2 = tokenizer.word_index[w1], tokenizer.word_index[w2]
    v1, v2 = word_vectors[i1], word_vectors[i2]
    return cosine(v1, v2)

def get_similarity(w1, w2):
    return get_dist(w1, w2)

def get_most_similar(w1, n=10):
    sims = {word : get_similarity(w1, word) for word in tokenizer.word_index.keys() if word != w1}
    sims = pd.Series(sims)
    sims.sort_values(inplace=True, ascending=False)
    return sims.iloc[:n]


print(get_similarity('king', 'queen'))
print('')
print(get_most_similar('queen'))

1.0213647037744522

shakespeare    1.322887
they’d         1.313449
shelves        1.306200
came           1.293437
contents       1.291765
alteration     1.286259
game           1.285576
are            1.280995
sat            1.280045
hung           1.269913
dtype: float64


## Your turn -- Modify the code above to create a CBOW Model

In [48]:
corpus = corpus[corpus.index('\n\n')+2:]
sentences = sent_tokenize(corpus)

base_filter = '!"#$%&()*+,-./:;`<=>?@[\\]^_{|}~\t\n' + "'"

tokenizer_cbow = Tokenizer(filters=base_filter)

tokenizer_cbow.fit_on_texts(sentences)
sequences = tokenizer_cbow.texts_to_sequences(sentences)

In [57]:
vocab_size_cbow = len(tokenizer.word_index)
embed_size_cbow = 100
window_size_cbow = 2

def generate_data_cbow(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        
        for ix, word in enumerate(words):
            context_words = []
            label_word = []
            
            # set max and min range to say the number of words 
            # to get around (before/after) the target word
            start_ix = ix - window_size
            end_ix = ix + window_size + 1
            
            context_words.append([words[i] 
                                  for i in range(start_ix, end_ix) 
                                  if (0 <= i < sentence_length) and (i != ix)
                                 ])
            
            label_word.append(word)
            
            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield x, y
        
generate_data_cbow(sequences, window_size_cbow, vocab_size_cbow)

<generator object generate_data_cbow at 0x000002A6288D01A8>

In [58]:
i = 0
for x, y in generate_data_cbow(corpus=sequences, window_size=window_size_cbow, vocab_size=vocab_size_cbow):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['the', 'never', 'herself', 'remarkable'] -> Target (Y): limitation
Context (X): ['never', 'limitation', 'remarkable', 'on'] -> Target (Y): herself
Context (X): ['limitation', 'herself', 'on', 'like'] -> Target (Y): remarkable
Context (X): ['herself', 'remarkable', 'like', 'a'] -> Target (Y): on
Context (X): ['remarkable', 'on', 'a', 'federal'] -> Target (Y): like
Context (X): ['on', 'like', 'federal', 'for'] -> Target (Y): a
Context (X): ['like', 'a', 'for', 'much'] -> Target (Y): federal
Context (X): ['a', 'federal', 'much', 'turtle'] -> Target (Y): for
Context (X): ['federal', 'for', 'turtle', 'and'] -> Target (Y): much
Context (X): ['for', 'much', 'and', 'by'] -> Target (Y): turtle
Context (X): ['much', 'turtle', 'by', 'nor'] -> Target (Y): and


In [59]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda


cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size_cbow, output_dim=embed_size_cbow, input_length=window_size_cbow*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size_cbow,)))
cbow.add(Dense(vocab_size_cbow, activation='softmax'))

# view model summary
print(cbow.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            354700    
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3547)              358247    
Total params: 712,947
Trainable params: 712,947
Non-trainable params: 0
_________________________________________________________________
None


In [61]:
from keras.optimizers import SGD

cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

for iteration in range(5):
    loss = 0
    for x, y in generate_data_cbow(corpus=sequences, window_size=window_size_cbow, vocab_size=vocab_size_cbow):
        loss = loss + cbow.train_on_batch(x, y)
        
    print('iteration {}, loss is {}'.format(iteration, loss))

iteration 0, loss is 249324.6297082901
iteration 1, loss is 249203.98899269104
iteration 2, loss is 249083.53575706482
iteration 3, loss is 248963.21166706085
iteration 4, loss is 248843.08365631104


In [63]:
word_vectors_cbow = cbow.get_weights()[0]

from scipy.spatial.distance import cosine

def get_dist_cbow(w1, w2):
    i1, i2 = tokenizer_cbow.word_index[w1], tokenizer_cbow.word_index[w2]
    v1, v2 = word_vectors_cbow[i1], word_vectors_cbow[i2]
    return cosine(v1, v2)

def get_similarity_cbow(w1, w2):
    return get_dist_cbow(w1, w2)

def get_most_similar_cbow(w1, n=10):
    sims = {word : get_similarity_cbow(w1, word) for word in tokenizer_cbow.word_index.keys() if word != w1}
    sims = pd.Series(sims)
    sims.sort_values(inplace=True, ascending=False)
    return sims.iloc[:n]


print(get_similarity_cbow('king', 'queen'))
print('')
print(get_most_similar_cbow('king'))

1.0695239901542664

sister’s     1.406187
unusually    1.318744
clock        1.307960
viewing      1.284597
flamingo     1.282939
or           1.279009
dinn         1.277009
david        1.276449
flustered    1.266452
earls        1.263939
dtype: float64
