# <center> Train LSTM with word2vec embeddings </center>

References: https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings and https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [1]:
import gensim
import numpy as np
from IPython.display import display
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from unidecode import unidecode
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from capstone_project import preprocessor as pre

Using TensorFlow backend.


Set some necessary constants:

In [2]:
# Number of token ids per question fed to the LSTM (used as `maxlen` for pad_sequences).
# Per the original note the longest question has 103 tokens, so longer questions do not fit entirely.
MAX_SEQUENCE_LENGTH = 50
# Length of one word vector in the word2vec model that is used (GoogleNews-vectors-negative300).
EMBEDDING_DIM = 300

In [3]:
# Where the pickled, already tokenized data sets live and how their files are prefixed.
file_directory = "../output/data/"
prefix = "tokenized_"

# Training data plus its "is_duplicate" column as a plain array of labels.
train_file_name = prefix + "train_data.pkl"
train_data = pre.load_pickle(file_directory, train_file_name)
train_y = train_data["is_duplicate"].values

# Validation data set used to compare different classification algorithms (currently disabled):
#val_data = pre.load_pickle(file_directory, prefix+"val_data.pkl")
#val_y = val_data["is_duplicate"].values

Prepare the tokenized questions as input for Keras:

In [4]:
def _tokens_to_ascii(tokens):
    """Join a list of tokens with single spaces and transliterate the result with unidecode."""
    return unidecode(" ".join(tokens))


# Per the original note: the keras tokenizer crashes on unicode while spacy produces it,
# so every token list is joined and decoded again before it is handed to keras.
q1_tokens = train_data["q1_tokens"].apply(_tokens_to_ascii).values
q2_tokens = train_data["q2_tokens"].apply(_tokens_to_ascii).values
all_tokens = np.concatenate([q1_tokens, q2_tokens])

# One tokenizer is fitted on both question columns so they share a single word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_tokens)
word_index = tokenizer.word_index
print("Found {} unique tokens".format(len(word_index)))

# Texts -> lists of word ids -> fixed-length arrays of MAX_SEQUENCE_LENGTH ids.
q1_sequences, q2_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (q1_tokens, q2_tokens)
)
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (q1_sequences, q2_sequences)
)

  _warn_if_not_unicode(string)


Found 67433 unique tokens


Split the training data into a training set and a second validation set that is evaluated after every epoch:

In [5]:
# Split the data into a training set and a second validation set, see:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#
# NOTE: q1_data / q2_data are overwritten with their shuffled versions below while the labels
# are always taken from train_y. Re-running this cell on its own therefore desynchronises
# questions and labels -- re-run the tokenization cell first.
split_rng = np.random.RandomState(42)  # fixed seed so a fresh run always yields the same split
indices = split_rng.permutation(q1_data.shape[0])
q1_data = q1_data[indices]
q2_data = q2_data[indices]
labels = train_y[indices]

# The last 10% of the shuffled rows become the validation set.
nb_validation_samples = int(0.1 * q1_data.shape[0])
# Explicit boundary instead of negative slicing: `[:-0]` would give an empty training set.
split_at = q1_data.shape[0] - nb_validation_samples

q1_train = q1_data[:split_at]
q2_train = q2_data[:split_at]  # fixed: was sliced from q1_data, so the model only ever saw (q1, q1) pairs
train_labels = labels[:split_at]

q1_val = q1_data[split_at:]
q2_val = q2_data[split_at:]
val_labels = labels[split_at:]

#TODO (the names used below are not defined in the cells of this notebook)
#data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
#data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
#labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

#data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
#data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
#labels_val = np.concatenate((labels[idx_val], labels[idx_val]))


Load the pretrained word2vec model:

In [6]:
# Pretrained vectors are read from a gzipped file in the binary word2vec format.
word2vec_path = "../data/GoogleNews-vectors-negative300.bin.gz"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)


In [7]:
# See https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings

########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

# One row per word id plus one extra row for id 0, which no word in word_index is copied into here.
#nb_words = min(MAX_NB_WORDS, len(word_index))+1
number_words = len(word_index)+1
embedding_matrix = np.zeros((number_words, EMBEDDING_DIM))

# Copy the pretrained vector of every known word into its row; unknown words stay all-zero.
known_words = word2vec_model.vocab
for token, row in word_index.items():
    if token not in known_words:
        continue
    embedding_matrix[row] = word2vec_model.word_vec(token)

# Rows whose entries sum to zero are counted as having no embedding.
row_sums = np.sum(embedding_matrix, axis=1)
print("Null word embeddings: {}".format(np.sum(row_sums == 0)))



Preparing embedding matrix
Null word embeddings: 30555


In [8]:
# Randomly sampled alternative, kept for reference:
#num_lstm = np.random.randint(175, 275)
#num_dense = np.random.randint(100, 150)
#rate_drop_lstm = 0.15 + np.random.rand() * 0.25
#rate_drop_dense = 0.15 + np.random.rand() * 0.25

num_lstm = 200          # size passed to the LSTM layer
num_dense = 100         # size passed to the hidden Dense layer
rate_drop_lstm = 0.25   # used for both `dropout` and `recurrent_dropout` of the LSTM
rate_drop_dense = 0.25  # rate of the Dropout layers around the Dense layer

# Identifier of this configuration; it becomes part of the checkpoint file name.
# Fixed: the first rate was formatted with `{:2f}` (width 2, six decimals -> "0.250000")
# instead of `{:.2f}` (two decimals -> "0.25").
stamp = "{}_{}_{:.2f}_{:.2f}".format(num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)

In [9]:
# See https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings

# Builds the network graph: both questions go through the SAME embedding and LSTM layer
# objects, the two LSTM outputs are concatenated and classified by a small dense head.

# Embedding initialised from the word2vec matrix and frozen (trainable=False).
embedding_layer = Embedding(number_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
# A single LSTM instance; rate_drop_lstm is used for the input and the recurrent dropout.
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# Branch for question 1: padded word ids -> embeddings -> LSTM output.
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Branch for question 2: reuses embedding_layer and lstm_layer from above.
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# Join both question representations, then dropout + batch normalisation.
merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Hidden dense layer, again followed by dropout + batch normalisation.
merged = Dense(num_dense, activation="relu")(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Single sigmoid unit as the output for the question pair.
preds = Dense(1, activation='sigmoid')(merged)

In [10]:
# See https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings


########################################
## train the model
########################################
# Two-input model (one padded id sequence per question) with the sigmoid prediction as output.
model = Model(inputs=[sequence_1_input, sequence_2_input],
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
model.summary()
print(stamp)

# Stop once val_loss has not improved for 3 epochs; with save_best_only=True the checkpoint
# file only keeps the weights of the best epoch seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
best_model_path = "../output/models/lstm_" + stamp + '.h5'
model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

# NOTE(review): the recorded run of this cell ended in a GPU ResourceExhaustedError while
# allocating a [2048, 50, 300] tensor; lowering batch_size is the usual remedy -- TODO confirm
# a value that fits the target hardware.
hist = model.fit([q1_train, q2_train], train_labels,
        validation_data=([q1_val, q2_val], val_labels),
        epochs=200, batch_size=2048, shuffle=True,
        callbacks=[early_stopping, model_checkpoint])

# Restore the best checkpoint. Fixed: this used the undefined name `bst_model_path`,
# which would raise a NameError right after training finished.
model.load_weights(best_model_path)
bst_val_score = min(hist.history['val_loss'])


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 50, 300)       20230200    input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 200)           400800      embedding_1[0][0]       

ResourceExhaustedError: OOM when allocating tensor with shape[2048,50,300]
	 [[Node: lstm_1_1/Tile_2 = Tile[T=DT_FLOAT, Tmultiples=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"](lstm_1_1/ExpandDims_1, lstm_1_1/stack)]]
	 [[Node: mul_1/_113 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_6416_mul_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op u'lstm_1_1/Tile_2', defined at:
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-399e3e120cc7>", line 16, in <module>
    y1 = lstm_layer(embedded_sequences_2)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/keras/layers/recurrent.py", line 262, in __call__
    return super(Recurrent, self).__call__(inputs, **kwargs)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/keras/engine/topology.py", line 596, in __call__
    output = self.call(inputs, **kwargs)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/keras/layers/recurrent.py", line 333, in call
    preprocessed_input = self.preprocess_input(inputs, training=None)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/keras/layers/recurrent.py", line 1077, in preprocess_input
    timesteps, training=training)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/keras/layers/recurrent.py", line 45, in _time_distributed_dense
    expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 1865, in repeat
    return tf.tile(x, pattern)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3618, in tile
    name=name)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/andre/software/anaconda3/envs/capstone_project/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[2048,50,300]
	 [[Node: lstm_1_1/Tile_2 = Tile[T=DT_FLOAT, Tmultiples=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"](lstm_1_1/ExpandDims_1, lstm_1_1/stack)]]
	 [[Node: mul_1/_113 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_6416_mul_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
