# <center> Train LSTM with word2vec embeddings </center>

References: https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings and https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [23]:
import gensim
#import h5py
import numpy as np
from IPython.display import display
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from unidecode import unidecode
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

from capstone_project import preprocessor as pre
from capstone_project.models import neural_nets

Set some necessary constants:

In [2]:
# Maximum number of tokens per question that is fed to the LSTM
# (original note: the maximum number of tokens in a question is 103).
MAX_SEQUENCE_LENGTH = 50
# Length of the vectors of the word2vec implementation that is used.
EMBEDDING_DIM = 300

In [3]:
# Directory and filename prefix of the pickled, already tokenized data.
file_directory = "../output/data/"
prefix = "tokenized_"

# Only the training set is loaded here. The separate validation set (used to
# compare different classification algorithms) stays disabled:
#val_data = pre.load_pickle(file_directory, prefix+"val_data.pkl")
#val_y = val_data["is_duplicate"].values
train_data = pre.load_pickle(file_directory, prefix + "train_data.pkl")
train_y = train_data["is_duplicate"].values

# NOTE(review): only the first 100 rows are kept, which looks like a
# quick-experiment setting - confirm before training on the full data set.
train_data = train_data[:100]
train_y = train_y[:100]


Prepare the tokenized questions as input for Keras:

In [4]:
def _join_ascii(tokens):
    """Join a list of tokens with spaces and transliterate the result to ASCII."""
    return unidecode(" ".join(tokens))


# Per the original author's note, the Keras tokenizer crashes on unicode input
# while spaCy produces unicode tokens, so every question is joined back into
# one ASCII string first.
q1_tokens = train_data["q1_tokens"].apply(_join_ascii).values
q2_tokens = train_data["q2_tokens"].apply(_join_ascii).values
all_tokens = np.concatenate([q1_tokens, q2_tokens])

# Fit one vocabulary on the questions of both columns.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_tokens)
word_index = tokenizer.word_index

# One more than the vocabulary size; needed for the embedding layer.
number_words = len(word_index) + 1
print("Found {} unique tokens".format(len(word_index)))

# Convert every question to a sequence of word indices and bring all sequences
# to the common length MAX_SEQUENCE_LENGTH.
q1_sequences = tokenizer.texts_to_sequences(q1_tokens)
q1_data = pad_sequences(q1_sequences, maxlen=MAX_SEQUENCE_LENGTH)

q2_sequences = tokenizer.texts_to_sequences(q2_tokens)
q2_data = pad_sequences(q2_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 617 unique tokens


Split the data into a training set and a second validation set that is used for validation at every epoch:

In [5]:
# Split the data into a training set and a second validation set, see:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#
# Both questions and the labels are reordered with the same permutation so that
# row i of q1_data, q2_data and labels still belongs to the same question pair.
# Note that q1_data/q2_data are overwritten with the shuffled order, while
# train_y keeps the original order.
indices = np.arange(q1_data.shape[0])
np.random.shuffle(indices)
q1_data = q1_data[indices]
q2_data = q2_data[indices]
labels = train_y[indices]

# The last 10% of the shuffled rows form the validation set.
nb_validation_samples = int(0.1 * q1_data.shape[0])
# Slice at an explicit position: with fewer than 10 rows the validation size is
# 0 and a negative-offset slice such as data[:-0] would leave the training set
# empty.
split_at = q1_data.shape[0] - nb_validation_samples

q1_train = q1_data[:split_at]
q2_train = q2_data[:split_at]  # question 2 must come from q2_data, not q1_data
train_labels = labels[:split_at]

q1_val = q1_data[split_at:]
q2_val = q2_data[split_at:]
val_labels = labels[split_at:]

# TODO: presumably meant to add every pair a second time with the two questions
# swapped (the names below do not match the variables of this notebook yet):
#data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
#data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
#labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

#data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
#data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
#labels_val = np.concatenate((labels[idx_val], labels[idx_val]))


Load the pretrained word2vec model:

In [6]:
# Read the pretrained GoogleNews vectors from the gzipped binary word2vec file.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    "../data/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

In [12]:
# Hyper-parameters that are passed as keyword arguments to
# neural_nets.create_lstm.
nn_parameters = {"max_sequence_length": MAX_SEQUENCE_LENGTH,
                 "num_lstm": 200,
                 "dropout_lstm": 0.25,
                 "num_dense": 100,
                 "dropout_dense": 0.25}

# Short identifier of this configuration; it becomes part of the file names of
# the saved models. Both dropout rates use two decimals: the former "{:2f}"
# only set a minimum width and rendered 0.25 as "0.250000".
stamp = "{}_{:.2f}_{}_{:.2f}".format(nn_parameters["num_lstm"],
                                     nn_parameters["dropout_lstm"],
                                     nn_parameters["num_dense"],
                                     nn_parameters["dropout_dense"])

# Build the embedding matrix from the pretrained word2vec model and the
# tokenizer vocabulary, then create the LSTM model on top of it.
embedding_matrix = neural_nets.create_embedding_matrix(word2vec_model, EMBEDDING_DIM, word_index, number_words)
model = neural_nets.create_lstm(embedding_matrix, EMBEDDING_DIM, number_words, **nn_parameters)

Null word embeddings: 50


Fit the model on the training data with early stopping:

In [17]:
# See https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model.summary()
# Function-call form of print: same output under Python 2 for a single
# argument, and valid syntax under Python 3 as well.
print(stamp)

# Stop once the validation loss has not improved for 10 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
# Write only the weights of the best model seen so far to disk.
best_model_path = "../output/models/lstm_val_" + stamp + '.h5'
model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

# The held-out 10% split is evaluated after every epoch.
# Original note: batch_size = 2048 (presumably for the full data set).
hist = model.fit([q1_train, q2_train], train_labels,
                 validation_data=([q1_val, q2_val], val_labels),
                 epochs=200, batch_size=8, shuffle=True,
                 callbacks=[early_stopping, model_checkpoint])

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 50, 300)       185400      input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 200)           400800      embedding_1[0][0]       

In [20]:
# Continue training on all samples (training and validation split together)
# for a fixed number of epochs and save the complete model.
model_path = "../output/models/lstm_" + stamp + '.h5'
# q1_data and q2_data were reordered in place when the validation split was
# made, so they have to be paired with the equally reordered `labels`; the
# unshuffled `train_y` would assign labels to the wrong question pairs.
# Original note: batch_size = 2048 (presumably for the full data set).
model.fit([q1_data, q2_data], labels, epochs=3, batch_size=8, shuffle=True)
model.save(model_path)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
# Read the model back from the file it was just saved to.
model = load_model(filepath=model_path)



<keras.engine.training.Model object at 0x7f22567d0dd0>
