### This notebook trains a Siamese network built on Bidirectional Long Short-Term Memory (BiLSTM) encoders

In [1]:
from __future__ import print_function

import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists
import pickle
import pandas as pd

import keras
from keras.models import Sequential, Model, model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.utils.data_utils import get_file
from keras.layers import Input, Dense, Conv1D, Dropout, MaxPooling1D, Flatten, Embedding, LSTM, Bidirectional, merge, dot
from keras import backend as K
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# Preprocessing / embedding hyper-parameters shared by the cells below.
MAX_SEQUENCE_LENGTH = 25  # passed as `maxlen` to pad_sequences for every question
EMBEDDING_DIM = 100       # word-vector width; not referenced in the visible cells
MAX_NB_WORDS = 200000     # not referenced in the visible cells; presumably a tokenizer vocabulary cap

In [3]:
# Load the pre-tokenised question pairs from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
TRAIN_PICKLE = './data/training_py2.pickle'
TEST_PICKLE = './data/testing_py2.pickle'

# Training pickle holds a 3-tuple: question-1 sequences, question-2 sequences, labels.
with open(TRAIN_PICKLE, 'rb') as train_file:
    train_payload = pickle.load(train_file)
question1_word_sequences, question2_word_sequences, y_train = train_payload

# Testing pickle holds a 2-tuple: question-1 sequences, question-2 sequences (no labels).
with open(TEST_PICKLE, 'rb') as test_file:
    test_payload = pickle.load(test_file)
question1_test_ws, question2_test_ws = test_payload

In [4]:
# Load the pre-computed word-embedding matrix (GloVe, 100-d judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
EMBEDDING_PICKLE = './data/word_embedding_matrix_glove100_py2.pickle'
with open(EMBEDDING_PICKLE, 'rb') as embedding_file:
    word_embedding_glove100 = pickle.load(embedding_file)

In [5]:
# Bring both sides of every training pair to a fixed length of MAX_SEQUENCE_LENGTH tokens.
q1_data, q2_data = (
    pad_sequences(word_sequences, maxlen = MAX_SEQUENCE_LENGTH)
    for word_sequences in (question1_word_sequences, question2_word_sequences)
)

In [58]:
# Siamese encoder: both questions are pushed through the SAME embedding and
# BiLSTM layer objects, so the two branches share their weights. Instantiating
# a separate Bidirectional(LSTM) per branch would train two unrelated encoders.

# Take the embedding dimensions from the loaded matrix instead of hard-coding
# them, so a rebuilt vocabulary / different GloVe size does not break the layer.
vocab_size, embedding_dim = np.shape(word_embedding_glove100)

shared_embedding = Embedding(vocab_size, embedding_dim,
                             weights = [word_embedding_glove100],
                             input_length = MAX_SEQUENCE_LENGTH,
                             trainable = False,   # keep the pre-trained vectors frozen
                             mask_zero = True)    # index 0 is padding and is masked out
shared_encoder = Bidirectional(LSTM(64))

# left subnet
q1 = Input(shape = (MAX_SEQUENCE_LENGTH, ), dtype = 'int32')
q1_emb = shared_embedding(q1)
q1_bi = shared_encoder(q1_emb)

# right subnet
q2 = Input(shape = (MAX_SEQUENCE_LENGTH, ), dtype = 'int32')
q2_emb = shared_embedding(q2)
q2_bi = shared_encoder(q2_emb)

# merge: cosine similarity of the two sentence encodings, one value per pair
merged = dot([q1_bi, q2_bi], axes = 1, normalize = True)

# Cosine similarity lies in [-1, 1], but binary cross-entropy (and the
# submission file) need a probability in [0, 1]. A single sigmoid unit learns
# the scale/offset that maps similarity to P(is_duplicate).
is_duplicate = Dense(1, activation = 'sigmoid')(merged)

model = Model(inputs = [q1, q2], outputs = is_duplicate)

In [61]:
# RMSprop optimiser; binary cross-entropy is used both as the loss and as the reported metric.
model.compile(loss = 'binary_crossentropy',
              optimizer = 'rmsprop',
              metrics = ['binary_crossentropy'])

In [63]:
# training
# Stop once val_loss has gone `patience` epochs without improving by at least `min_delta`.
early_stopping = EarlyStopping(monitor = 'val_loss', mode = 'auto',
                               min_delta = .001, patience = 10, verbose = 1)

train_inputs = [q1_data, q2_data]
model.fit(train_inputs, y_train,
          batch_size = 512,
          epochs = 1000,            # upper bound only; early stopping normally ends the run
          validation_split = .1,    # 10% of the training pairs are held out for val_loss
          callbacks = [early_stopping],
          verbose = 1)

Train on 363861 samples, validate on 40429 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 00019: early stopping


<keras.callbacks.History at 0x1ee786940>

In [17]:
# Persist the network architecture as JSON; the weights are not part of this file.
ARCHITECTURE_PATH = "model_train_2.json"
model_json = model.to_json()
with open(ARCHITECTURE_PATH, "w") as architecture_file:
    architecture_file.write(model_json)

  str(node.arguments) + '. They will not be included '
  str(node.arguments) + '. They will not be included '


In [18]:
# Persist the trained weights to an HDF5 file.
WEIGHTS_PATH = "model_train_2.h5"
model.save_weights(WEIGHTS_PATH)
print("Saved model to disk")

Saved model to disk


In [6]:
# Rebuild the network from its saved JSON architecture, then restore the weights.
# A context manager is used so the file handle is closed even if read() raises.
with open('model_train_2.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model_train_2.h5")
print("Loaded model from disk")

Loaded model from disk


In [39]:
# load id labels
# Only the `test_id` column is needed here; the question text itself comes from
# the pre-tokenised pickle. newline='' is what the csv module asks for, so that
# line breaks inside quoted fields are handled by the csv parser itself.
with open('./data/test.csv', encoding = 'utf-8', newline = '') as csvfile:
    reader = csv.DictReader(csvfile, delimiter = ',')
    test_ids = [row['test_id'] for row in reader]

In [8]:
# Predict on the test set and write the submission file.
q1_test_data = pad_sequences(question1_test_ws, maxlen = MAX_SEQUENCE_LENGTH)
q2_test_data = pad_sequences(question2_test_ws, maxlen = MAX_SEQUENCE_LENGTH)
preds = loaded_model.predict([q1_test_data, q2_test_data])

# Every prediction must line up with exactly one id, otherwise the submission is invalid.
assert len(test_ids) == len(preds), \
    'got %d test ids but %d predictions' % (len(test_ids), len(preds))

# Build ONE frame holding both columns. Creating the id frame and then
# re-assigning preds_df from `preds` alone would drop the test_id column.
# predict() is expected to return one score per row (shape (n, 1)); ravel() flattens it.
preds_df = pd.DataFrame({'test_id': test_ids, 'is_duplicate': preds.ravel()},
                        columns = ['test_id', 'is_duplicate'])
preds_df.to_csv('./data/submission1.csv', index=False)