In [1]:
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from math import ceil
import re

import keras
from keras.layers import Dense, Activation, Input, Dropout
from keras.models import Model

Using Theano backend.


In [2]:
f = open('../datasets/sherlock.txt')
text = f.read()
f.close()

In [3]:
data = text[3433:]

data = data.lower()
data = re.sub('[^A-Za-z]+', ' ', data)
data = data.split()

stop_words = stopwords.words('english')
data = [word for word in data if word not in stop_words] # To drastically reduce the size of X

In [4]:
len(data)

261745

In [5]:
vocabulary, counts = np.unique(data, return_counts=True)
vocabulary.shape, counts.shape

((17628,), (17628,))

In [6]:
vocabulary, counts = vocabulary[counts > 1], counts[counts > 1]
vocabulary.shape, counts.shape

((11369,), (11369,))

In [7]:
data = [w for w in data if w in vocabulary]
len(data)

255486

In [8]:
(vocabulary == "bag").argmax()

743

In [9]:
def get_one_hot_vector(word):
    vec = np.zeros((vocabulary.shape[0], ))
    index = (vocabulary == word).argmax()
    vec[index] = 1
    return vec

In [10]:
get_one_hot_vector("bag").argmax()

743

In [11]:
window = 10
n_epochs = 10
batch_size = 128
N = len(data) - 2*window
n_batches = ceil(N / batch_size)
n_context = vocabulary.shape[0] * 2 * window
n_embedding = 300

In [12]:
def batch_generator(epochs, with_y=True):
    
    for e in range(epochs):
        start = 0
        
        for b in range(n_batches):
            if start + batch_size <= N:
                X_batch = np.zeros((batch_size, n_context))
            else:
                X_batch = np.zeros((N - start, n_context))
                
            y_batch = []

            for i in range(start+window, start+X_batch.shape[0]+window):
                context = []
                for j in range(-window,window+1):
                    if j == 0:
                        if with_y:
                            y_batch.append(get_one_hot_vector(data[i]))
                        continue
                    context.append(get_one_hot_vector(data[i+j]))
                X_batch[i-window-start] = np.hstack(context)
                
            start += batch_size
            yield (X_batch if not with_y else (X_batch, np.array(y_batch)))
    
        
make_batch = batch_generator(n_epochs)

In [13]:
# Word Embedding

inp = Input(shape=(n_context,))
emb = Dense(n_embedding, activation='tanh')(inp)
emb = Dropout(0.4)(emb)
out = Dense(len(vocabulary), activation='softmax')(emb)

model = Model(inputs=inp, outputs=out)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

encoder = Model(inputs=inp, outputs=emb)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 227380)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               68214300  
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 11369)             3422069   
Total params: 71,636,369
Trainable params: 71,636,369
Non-trainable params: 0
_________________________________________________________________


In [14]:
checkpointer = keras.callbacks.ModelCheckpoint('./cbow.h5',
                                              save_best_only=False,
                                              save_weights_only=False,
                                              period=1,
                                            )

In [15]:
hist = model.fit_generator(
                make_batch,
                steps_per_epoch=n_batches,
                epochs=n_epochs,
                shuffle=True,
                callbacks=[checkpointer]
            )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 300/1996 [===>..........................] - ETA: 1:25:45 - loss: 0.1378 - acc: 0.9911

KeyboardInterrupt: 

In [16]:
plt.figure()
plt.plot(hist.history['loss'], 'b')
# plt.plot(hist.history['val_loss'], 'g')
plt.show()

plt.figure()
plt.plot(hist.history['acc'], 'b')
# plt.plot(hist.history['val_acc'], 'g')
plt.show()

NameError: name 'hist' is not defined

<Figure size 432x288 with 0 Axes>

In [17]:
embedding_vectors = np.zeros((N, n_embedding))
make_pred_batch = batch_generator(epochs=1, with_y=False)

In [18]:
size = 0
while True:
    try:
#         print(size / batch_size)
        x = next(make_pred_batch)
        vecs = encoder.predict(x)
        embedding_vectors[size:x.shape[0]+size, :] = vecs
        size += x.shape[0]
    except StopIteration:
        print("Ended")
        break

Ended


In [19]:
w2v = {}
alpha = 0.9

for i in range(window, len(data)-window):
    word = data[i]
    pred = embedding_vectors[i - window]
    try:
        old_vec = w2v[word]
        new_vec = alpha*old_vec + (1-alpha)*pred # Running Average
        w2v[word] = new_vec
    except KeyError:
        w2v[word] = pred
print(len(w2v.keys()))

11369


In [20]:
def cosine_similarity(v1, v2):
    return np.dot(v1, v2)/np.sqrt((v1**2).sum()*(v2**2).sum())

In [30]:
v1 = w2v['mr']
v2 = w2v['sherlock']
print(cosine_similarity(v1, v2))

0.562701824563958
