# Build Sentence Vectors from Word Vectors

Ideally, it would be preferable to train a network end-to-end to predict document similarity, but we need to start with word vectors to generate sentence vectors, and then generate document vectors from sentence vectors. Because sentence vectors are generated from different inputs (the words in a sentence) than document vectors (the sentences in a document), we will build an autoencoder to generate our sentence vectors below.

In [None]:
from __future__ import division, print_function

import io
import os

import nltk
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.layers import Input
from keras.layers.core import RepeatVector
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding every input file and the saved model.
DATA_DIR = "../data"

# Vocabulary file: one "word<TAB>count" record per line.
VOCAB_FILE = os.path.join(DATA_DIR, "rt-vocab.tsv")
# Sentence file: three tab-separated fields per line, the last being the text.
SENT_FILE = os.path.join(DATA_DIR, "rt-sent.tsv")

# Pre-trained GloVe vectors and their dimensionality.
GLOVE_FILE = os.path.join(DATA_DIR, "glove.840B.300d.txt")
WORD_EMBED_SIZE = 300

# Every sentence is padded/truncated to MAX_SEQLEN word ids.
MAX_SEQLEN = 50
# Number of units in the encoder LSTM, i.e. the sentence vector size.
SENT_EMBED_SIZE = 512

# Training hyper-parameters.
BATCH_SIZE = 64
NUM_EPOCHS = 10

# Where the checkpoint callback writes the autoencoder.
MODEL_FILE = os.path.join(DATA_DIR, "sent-encoder.h5")

## Create Vocabulary for word lookup

Any word that occurs 5 times or fewer across the corpus is replaced by the pseudo-word \_UNK\_. The \_PAD\_ pseudo-word (id 0) is used to pad short sentences to a fixed length.

In [None]:
# Build the id <-> word lookup tables from the vocabulary file, which holds
# one "word<TAB>count" record per line.
# Ids 0 and 1 are reserved for the padding and unknown-word pseudo words.
id2word = {0: "_PAD_", 1: "_UNK_"}
# io.open decodes the file to text on both Python 2 and Python 3; a handle
# opened with "rb" yields bytes on Python 3, which cannot be split on "\t".
with io.open(VOCAB_FILE, "r", encoding="utf-8") as fvoc:
    for line in fvoc:
        word, count = line.strip().split("\t")
        if int(count) <= 5:
            # Rare words get no id of their own; they are looked up as _UNK_.
            continue
        # Number the kept words consecutively so every id is a valid row of
        # an embedding matrix with len(word2id) rows, even when a rare word
        # appears before a frequent one in the file.
        id2word[len(id2word)] = word
word2id = {v: k for k, v in id2word.items()}
vocab_size = len(word2id)
print("vocab size: {:d}".format(vocab_size))

## Extract embeddings from GloVe

In [None]:
# Embedding matrix: row i holds the GloVe vector of the word with id i.
# Rows for words that GloVe does not contain (and the _PAD_ row 0) stay zero.
E = np.zeros((len(word2id), WORD_EMBED_SIZE))
# UNK is given a random value
E[1] = np.random.random((WORD_EMBED_SIZE))
# io.open decodes to text on both Python 2 and Python 3, so the tokens read
# here compare equal to the text keys of word2id.
with io.open(GLOVE_FILE, "r", encoding="utf-8") as fglo:
    for line in fglo:
        # Split from the right so the last WORD_EMBED_SIZE fields are always
        # the vector, even if the token itself contains a space.
        cols = line.rstrip().rsplit(" ", WORD_EMBED_SIZE)
        if len(cols) != WORD_EMBED_SIZE + 1:
            # malformed / short line: no complete vector to read
            continue
        word = cols[0]
        i = word2id.get(word)
        if i is None:
            # word does not exist in vocab; skip before parsing any floats
            continue
        E[i] = [float(x) for x in cols[1:]]
print(E.shape)

## Load sentences as word id sequences

In [None]:
# Turn every sentence into a list of word ids, then pad to a fixed length.
unk_id = word2id["_UNK_"]
xdata = []
# io.open decodes to text on both Python 2 and Python 3 (a handle opened
# with "rb" yields bytes on Python 3, which cannot be split on "\t").
with io.open(SENT_FILE, "r", encoding="utf-8") as fsent:
    for line in fsent:
        # Three tab-separated fields per record; only the last one (the
        # sentence text) is used here.
        _, _, sent = line.strip().split("\t")
        # Out-of-vocabulary tokens fall back to the _UNK_ id.
        xdata.append([word2id.get(word, unk_id)
                      for word in nltk.word_tokenize(sent)])
# pad_sequences accepts the ragged list of id lists as-is. Wrapping it in
# np.array() first builds an array from rows of unequal length, which
# current NumPy versions reject.
X = pad_sequences(xdata, MAX_SEQLEN)
print(X.shape)

In [None]:
# Keep 90% of the padded sentences for training and hold out the rest.
# No random_state is passed, so the split differs from run to run.
Xtrain, Xtest = train_test_split(X, train_size=0.9)
print(Xtrain.shape, Xtest.shape)

## Data Generator

In [None]:
def datagen(X, E, batch_size=BATCH_SIZE):
    """Yield (input, target) batches for autoencoder training, forever.

    Each pass over the data draws a fresh random permutation of the rows of
    X and emits only full batches; leftover rows that do not fill a batch
    are dropped for that pass.

    Args:
        X: 2-D integer array of word ids, one sentence per row.
        E: embedding matrix indexed by word id along its first axis.
        batch_size: number of sentences per batch.

    Yields:
        A tuple (Xbatch, Xbatch) where Xbatch = E[X[rows, :]], i.e. the
        embedded sentences serve as both the input and the target.

    Raises:
        ValueError: on the first next() call if X has fewer than batch_size
            rows. Without this check the loop below would spin forever
            without ever yielding.
    """
    num_recs = X.shape[0]
    num_batches = num_recs // batch_size
    if num_batches == 0:
        raise ValueError(
            "need at least batch_size={:d} records, got {:d}".format(
                batch_size, num_recs))
    while True:
        # loop once per epoch, reshuffling the row order each time
        indices = np.random.permutation(num_recs)
        for bid in range(num_batches):
            sids = indices[bid * batch_size : (bid + 1) * batch_size]
            # fancy indexing replaces every word id by its embedding row
            Xbatch = E[X[sids, :]]
            yield Xbatch, Xbatch
            
# Sanity check: pull a single batch and show the shapes of its input and
# target. next() works on both Python 2 and 3; generator.next() is
# Python 2 only.
train_gen = datagen(Xtrain, E)
Xb, Yb = next(train_gen)
print(Xb.shape, Yb.shape)

## Define Autoencoder

In [None]:
# Input: one sentence as MAX_SEQLEN word vectors of size WORD_EMBED_SIZE.
inputs = Input(shape=(MAX_SEQLEN, WORD_EMBED_SIZE))

# Encoder: a bidirectional LSTM whose two directions are summed into one
# SENT_EMBED_SIZE vector. The layer is named so it can be looked up later.
encoder_layer = Bidirectional(LSTM(SENT_EMBED_SIZE),
                              merge_mode="sum",
                              name="encoder_lstm")
encoded = encoder_layer(inputs)

# Decoder: feed the sentence vector at every timestep and emit one
# WORD_EMBED_SIZE vector per timestep.
decoded = RepeatVector(MAX_SEQLEN)(encoded)
decoder_layer = Bidirectional(LSTM(WORD_EMBED_SIZE, return_sequences=True),
                              merge_mode="sum")
outputs = decoder_layer(decoded)

autoencoder = Model(inputs, outputs)

In [None]:
# Train with Adam on the mean squared error between output and target.
autoencoder.compile(optimizer="adam", loss="mse")

In [None]:
# Print each layer's name with its input and output shapes as a quick check
# of the architecture.
for layer in autoencoder.layers:
    print(layer.name, layer.input_shape, layer.output_shape)

## Train Autoencoder

In [None]:
# Fresh generators over the training and held-out sentences.
train_gen = datagen(Xtrain, E)
test_gen = datagen(Xtest, E)

# One epoch (or validation pass) is the number of whole batches in a split.
num_train_steps = len(Xtrain) // BATCH_SIZE
num_test_steps = len(Xtest) // BATCH_SIZE

# Write the model to MODEL_FILE, keeping only the best one seen so far.
checkpoint = ModelCheckpoint(filepath=MODEL_FILE, save_best_only=True)

history = autoencoder.fit_generator(
    train_gen,
    steps_per_epoch=num_train_steps,
    epochs=NUM_EPOCHS,
    validation_data=test_gen,
    validation_steps=num_test_steps,
    callbacks=[checkpoint],
)

## Extract Encoder Portion

In [None]:
# The sentence encoder is the autoencoder truncated at the layer named
# "encoder_lstm": same input, output taken from that layer.
sentence_vector = autoencoder.get_layer("encoder_lstm").output
encoder = Model(autoencoder.input, sentence_vector)