# <center> Train LSTM with word2vec embeddings </center>

First I would like to mention these two excellent posts: 

https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

My solution is in part based on these two guidelines.

Okay now let's import all of the necessary modules:

In [1]:
#import os
import gensim
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, auc, log_loss, roc_curve
from unidecode import unidecode

from capstone_project import utility
from capstone_project.models import neural_nets

%matplotlib inline

Using TensorFlow backend.


Set important constants and load data:

In [2]:
# Number of token positions per question fed to the LSTM; used as `maxlen`
# for pad_sequences below. (Original note: the maximum number of tokens
# observed in a question is 103.)
MAX_SEQUENCE_LENGTH = 30
# Dimensionality of the word vectors used for the embedding matrix
# (presumably 300 to match the GoogleNews word2vec vectors loaded later).
EMBEDDING_DIM = 300

In [3]:
# Directory and filename prefix of the pickled data sets.
file_directory = "../output/data/"
prefix = "tokenized_"

# Load both frames with the same helper. The validation set is the one used
# to compare different classification algorithms.
train_data, val_data = (
    utility.load_pickle(file_directory, prefix + filename)
    for filename in ("train_data.pkl", "val_data.pkl")
)

# Duplicate labels as plain numpy arrays.
train_y, val_y = train_data["is_duplicate"].values, val_data["is_duplicate"].values

In [4]:
# Optional: uncomment the next two lines to run the notebook on only the
# first 100 rows of each data set.
#train_data = train_data[:100]
#val_data = val_data[:100]
# Show the first row of the training frame.
display(train_data.head(1))

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens
355802,355802,696825,696826,Which are the best songs of Enrique Iglesias?,Which is the best song of Enrique iglesias?,1,"[good, song, enrique, iglesias]","[good, song, enrique, iglesias]"


Prepare the tokenized question as input for keras:

In [5]:
# The raw question strings are used as tokenizer input.
# Earlier variant (kept for reference): join the spaCy tokens and unidecode
# them, because the keras tokenizer crashed on unicode input.
#q1 = train_data["q1_tokens"].apply(lambda x: unidecode(u" ".join(x))).values
#q2 = train_data["q2_tokens"].apply(lambda x: unidecode(u" ".join(x))).values
q1, q2 = train_data["question1"].values, train_data["question2"].values

# Fit a single vocabulary on the questions of both columns.
all_questions = np.concatenate([q1, q2])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_questions)

word_index = tokenizer.word_index
print("Found {} unique tokens".format(len(word_index)))
number_words = 1 + len(word_index)  # Size needed for the embedding layer

# Map every question to a list of word indices ...
q1_sequences, q2_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (q1, q2)
)

# ... and bring all of them to the fixed length MAX_SEQUENCE_LENGTH.
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (q1_sequences, q2_sequences)
)

Found 82267 unique tokens


Split the training data into a training subset and a second validation subset that is evaluated after every epoch:

Double the dataset size by also feeding every question pair in swapped order. This is done in order to avoid symmetry issues.

In [6]:
# Split the data into another training set and a second validation set 
# See: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html and
#indices = np.arange(q1_data.shape[0])
#np.random.shuffle(indices)
#q1_data = q1_data[indices]
#q2_data = q2_data[indices]
#labels = train_y[indices]

#nb_validation_samples = int(0.1 * q1_data.shape[0])

# Create subset for training with early stopping
#q1_train = q1_data[:-nb_validation_samples]
#q2_train = q1_data[:-nb_validation_samples]
#train_labels = labels[:-nb_validation_samples]

#create validation subset that is used to validate each epoch during training
#q1_val_epochs = q1_data[-nb_validation_samples:]
#q2_val_epochs = q2_data[-nb_validation_samples:]
#val_epochs_labels = labels[-nb_validation_samples:]

In [7]:
# kaggle ...
#q12_train = np.concatenate((q1_train, q2_train), axis=0)
#q21_train = np.concatenate((q2_train, q1_train), axis=0)
#double_train_labels = np.concatenate((train_labels, train_labels), axis=0)

#q12_val_epochs = np.concatenate((q1_val_epochs, q2_val_epochs), axis=0)
#q21_val_epochs = np.concatenate((q2_val_epochs, q1_val_epochs), axis=0)
#double_val_epochs_labels = np.concatenate((val_epochs_labels, val_epochs_labels), axis=0)

In [8]:
# Credit: https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
VALIDATION_SPLIT = 0.1  # Fraction of the training rows held out for per-epoch validation
SPLIT_SEED = 42  # Fixed seed so the split is identical after every kernel restart
labels = train_y

# Shuffle the row indices with a dedicated seeded generator. With an unseeded
# shuffle the held-out rows change between runs, so weights restored from an
# earlier checkpoint could be validated on rows they were trained on.
n_samples = len(q1_data)
n_train = int(n_samples*(1-VALIDATION_SPLIT))
perm = np.random.RandomState(SPLIT_SEED).permutation(n_samples)
idx_train = perm[:n_train]
idx_val = perm[n_train:]

# Every pair is used in both orders, (q1, q2) and (q2, q1), with the same label,
# which doubles the number of samples in each subset.
q12_train = np.concatenate((q1_data[idx_train], q2_data[idx_train]), axis=0)
q21_train = np.concatenate((q2_data[idx_train], q1_data[idx_train]), axis=0)
double_train_labels = np.concatenate((labels[idx_train], labels[idx_train]), axis=0)

q12_val_epochs = np.concatenate((q1_data[idx_val], q2_data[idx_val]), axis=0)
q21_val_epochs = np.concatenate((q2_data[idx_val], q1_data[idx_val]), axis=0)
double_val_epochs_labels = np.concatenate((labels[idx_val], labels[idx_val]), axis=0)

Prepare validation set:

In [9]:
# Build the padded index sequences for the held-out validation data.
# Earlier variant (kept for reference): join and unidecode the spaCy tokens.
#q1_validation = val_data["q1_tokens"].apply(lambda x: unidecode(u" ".join(x))).values
#q2_validation = val_data["q2_tokens"].apply(lambda x: unidecode(u" ".join(x))).values
q1_validation, q2_validation = val_data["question1"].values, val_data["question2"].values

# Reuse the tokenizer that was fitted on the training questions.
q1_val_sequences, q2_val_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (q1_validation, q2_validation)
)

# Same fixed length as the training sequences.
q1_val_data, q2_val_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (q1_val_sequences, q2_val_sequences)
)
val_labels = val_y

Load the pretrained word2vec model:

In [10]:
#Credit: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#import os
#
#embeddings_index = {}
#f = open('../data/glove.42B.300d.txt')
#for line in f:
#    values = line.split()
#    word = values[0]
#    coefs = np.asarray(values[1:], dtype='float32')
#    embeddings_index[word] = coefs
#f.close()

#print('Found %s word vectors.' % len(embeddings_index))

In [11]:
# Credit: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
#for word, i in word_index.items():
#    embedding_vector = embeddings_index.get(word)
#    if embedding_vector is not None:
#        # words not found in embedding index will be all-zeros.
#        embedding_matrix[i] = embedding_vector

In [12]:
# Load the pretrained GoogleNews word2vec vectors (gzipped binary format).
word2vec_path = "../data/GoogleNews-vectors-negative300.bin.gz"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Set the parameters of the LSTM, create the embedding matrix and create a Keras model:

In [13]:
# Batch size shared by training and prediction.
batch_size = 2048
# Hyper-parameters forwarded as keyword arguments to neural_nets.create_lstm.
nn_parameters = {"max_sequence_length": MAX_SEQUENCE_LENGTH,
                 "num_lstm": 230,
                 "dropout_lstm": 0.3,
                 "num_dense": 128,
                 "dropout_dense": 0.3}

# Identifier built from the hyper-parameters; it becomes part of the
# checkpoint filename so that runs with different settings do not overwrite
# each other.
stamp = "{}_{:.2f}_{}_{:.2f}".format(nn_parameters["num_lstm"], 
                                    nn_parameters["dropout_lstm"],
                                    nn_parameters["num_dense"],
                                    nn_parameters["dropout_dense"])

# One row of word2vec weights per word index of the tokenizer vocabulary
# (presumably zero rows for words missing from word2vec - see the
# "Null word embeddings" line in the output; confirm in neural_nets).
embedding_matrix = neural_nets.create_embedding_matrix(vec_model=word2vec_model, 
                                                       embedding_dim=EMBEDDING_DIM, 
                                                       word_index=word_index, 
                                                       number_words=number_words)

model = neural_nets.create_lstm(embedding_matrix=embedding_matrix, 
                                embedding_dim=EMBEDDING_DIM, 
                                number_words=number_words, 
                                **nn_parameters)

model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model.summary()
# Call form of print: valid in Python 2 and 3 and consistent with the
# tokenizer cell.
print(stamp)

Null word embeddings: 38543
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 30, 300)       24680400    input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 230)           488520  

Train the model with early stopping, checking the performance on the second validation set after every epoch:

In [14]:
# See https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings

# File that receives the weights of the best epoch of this run.
best_model_path = "../output/models/lstm_val_epochs_" + stamp + '.h5'

# Stop when val_loss has stopped improving (patience of 3 epochs) and keep
# only the best weights on disk (weights only, not the full model).
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

# Both the training and the per-epoch validation inputs contain every pair in
# both question orders.
training_inputs = [q12_train, q21_train]
epoch_validation_data = ([q12_val_epochs, q21_val_epochs], double_val_epochs_labels)

hist = model.fit(training_inputs, double_train_labels,
                 batch_size=batch_size, epochs=200, shuffle=True,
                 validation_data=epoch_validation_data,
                 callbacks=[early_stopping, model_checkpoint])

Train on 524036 samples, validate on 58228 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200

KeyboardInterrupt: 

Fit the complete train set using the number of epochs found above:

In [None]:
#TODO do stuff with hist

#524036/524036 [==============================] - 194s - loss: 0.3470 - acc: 0.8404 - val_loss: 0.3801 - val_acc: 0.8247

Load the trained model and calculate log loss and accuracy on the validation set:

In [None]:
# Restore the best weights written by the ModelCheckpoint callback during
# training. The checkpoint was saved with save_weights_only=True, so it holds
# weights only and must be loaded into the already built `model` with
# load_weights; load_model would need a full saved model.
model.load_weights(best_model_path)

# Predict with both question orders and average the two results, mirroring
# the order-swapped samples used for training.
predictions = model.predict([q1_val_data, q2_val_data], batch_size=batch_size, verbose=1)
predictions += model.predict([q2_val_data, q1_val_data], batch_size=batch_size, verbose=1)
predictions /= 2

loss = log_loss(val_y, predictions)
# Round the predicted probabilities to hard 0/1 labels for the accuracy.
acc = accuracy_score(val_y, np.rint(predictions))

print("Validation scores of Lstm model\n LogLoss: {:.4f}\n Accuracy: {:.2f} ".format(loss, acc))

Create roc plot and save it:

In [None]:
# ROC curve of the averaged validation predictions.
# roc_curve, auc and plt come from the import cell at the top of the notebook.
# The predictions are flattened because model.predict returns one column per
# output unit while roc_curve expects a 1-d score array.
fpr, tpr, thresholds = roc_curve(val_y, np.ravel(predictions))
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
# Diagonal = performance of random guessing.
ax.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
# Save before show so the written file contains the rendered figure.
fig.savefig("../output/figures/lstm_roc_plot.png")
plt.show()