# Feature: Out-Of-Fold Predictions from a Siamese LSTM with Attention

## Imports

The notebook imports everything it needs explicitly: standard-library helpers, `numpy`, `pandas`, `gensim` (Word2Vec), `scikit-learn`, `spaCy`, and Keras.

In [1]:
import os
import gc
import string
import pickle
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [2]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Config

In [3]:
# Directory the train/test CSV files are read from (`Datasets`, one level above the working directory).
INPUT_PATH = os.path.join(os.pardir, 'Datasets')
# Output directory; currently the same location as INPUT_PATH.
OUT_PATH = os.path.join(os.pardir, 'Datasets')
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'
# Maximum number of rows read from each CSV (passed as `nrows` to `pd.read_csv`).
SAMPLE_SIZE = 50000

# Dimensionality of the Word2Vec vectors (passed as `size` to `Word2Vec`).
EMBEDDING_DIMENSIONS = 10

# Seed for NumPy's global RNG and for the stratified K-fold shuffle.
RANDOM_SEED = 42

In [4]:
# Seeds NumPy's global RNG only.
# NOTE(review): no backend (Keras/TensorFlow) seed is set anywhere in this notebook, so
# weight initialisation and dropout presumably still vary between runs — confirm.
np.random.seed(RANDOM_SEED)

## Data loading

In [5]:
# `string.punctuation` is one string of ASCII punctuation characters, so the
# `in punctuations` test used by `spacy_tokenizer` is a substring check.
punctuations = string.punctuation
# spaCy's English stop words copied into a list.
stopwords = list(STOP_WORDS)
# spaCy `English` language object; `spacy_tokenizer` calls it on raw sentences.
parser = English()
def spacy_tokenizer(sentence):
    """
    Normalise a sentence into a space-separated string of lower-cased lemmas.

    Tokens whose lemma is "-PRON-" keep their lower-cased surface form instead.
    Tokens found in `stopwords`, or that are a substring of `punctuations`, are
    dropped before the remaining tokens are joined with single spaces.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()
        # `punctuations` is a plain string, so this is a substring test; it also
        # rejects the empty string (e.g. a lemma that strips down to nothing).
        if text in stopwords or text in punctuations:
            continue
        kept.append(text)
    return " ".join(kept)

In [None]:
# Load the training sample, index it by `id`, fill missing values and tokenise both questions.
df = (
    pd.read_csv(os.path.join(INPUT_PATH, TRAIN_FILE), nrows=SAMPLE_SIZE)
    .set_index('id')
    .fillna('Empty question')
)
for question_column in ('question1', 'question2'):
    df[question_column] = df[question_column].apply(spacy_tokenizer)
df.head()

In [None]:
# Load the test sample, index it by `test_id`, fill missing values and tokenise both questions.
test_df = (
    pd.read_csv(os.path.join(INPUT_PATH, TEST_FILE), nrows=SAMPLE_SIZE)
    .set_index('test_id')
    .fillna('Empty question')
)
for question_column in ('question1', 'question2'):
    test_df[question_column] = test_df[question_column].apply(spacy_tokenizer)
test_df.head()

## Create embedding

Word embedding lookup matrix.

In [None]:
# Train Word2Vec on the tokenised training questions (both columns stacked).
# The test questions are not part of this corpus, so words that occur only in the
# test set get no vector.
corpus = pd.concat([df['question1'], df['question2']])
w2v_model = Word2Vec(
    corpus.str.split(' ').tolist(),
    size=EMBEDDING_DIMENSIONS,
    window=5,
    min_count=1,
)
# Use a context manager so the file handle is flushed and closed deterministically
# instead of being left to the garbage collector.
with open('gensim_w2v_model.pkl', 'wb') as model_file:
    pickle.dump(w2v_model, model_file)

In [None]:
# Keras embedding layer initialised from the trained Word2Vec vectors.
# NOTE(review): called with default arguments — presumably the weights are frozen;
# confirm against the installed gensim version.
embedding_layer = w2v_model.wv.get_keras_embedding()

In [None]:
# embedding_matrix = kg.io.load(project.aux_dir + 'fasttext_vocab_embedding_matrix.pickle')

## Word sequences

Padded sequences of word indices for every question.

In [None]:
# Longest tokenised question (in tokens) across both training columns.
# It is derived from the training corpus only, so test questions may exceed it.
max_string_length = corpus.str.split(' ').apply(len).max()
# Context manager guarantees the pickle file is closed after writing.
with open('max_question_length.pkl', 'wb') as length_file:
    pickle.dump(max_string_length, length_file)

In [None]:
def get_padded_index_sequence(array_of_word_lists, word2vec_model, padding_index, pad_length):
    """
    Convert tokenised sentences into a rectangular matrix of vocabulary indices.

    Args:
        array_of_word_lists: sequence of sentences, each a list of word strings.
        word2vec_model: object exposing `wv.vocab`, a mapping from word to an entry
            with an integer `.index` attribute.
        padding_index: index used for out-of-vocabulary words and for right-padding.
        pad_length: number of columns of the result.

    Returns:
        `np.ndarray` with one row per sentence and exactly `pad_length` columns.
        Shorter sentences are right-padded with `padding_index`; longer ones are
        truncated on the right so that every row has the same width.
    """
    # NOTE(review): out-of-vocabulary words and padding share `padding_index`; if the
    # caller passes an index that is also a real vocabulary index, the model cannot
    # tell them apart — confirm this is intended.
    vocab = word2vec_model.wv.vocab  # looked up once instead of once per word
    pad_length = int(pad_length)
    padded_rows = []
    for words in array_of_word_lists:
        # Truncate first: a sentence longer than `pad_length` would otherwise make
        # the rows ragged, which NumPy cannot turn into a 2-D integer array.
        row = [
            vocab[word].index if word in vocab else padding_index
            for word in words[:pad_length]
        ]
        row.extend([padding_index] * (pad_length - len(row)))
        padded_rows.append(row)
    return np.array(padded_rows)

In [None]:
# Index and pad every question column of both frames with identical settings:
# index 0 for padding / unknown words and the training corpus' maximum length.
# Generation order: train q1, train q2, test q1, test q2.
X_train_q1, X_train_q2, X_test_q1, X_test_q2 = (
    get_padded_index_sequence(
        frame[column].str.split(' ').tolist(),
        w2v_model,
        0,
        max_string_length,
    )
    for frame in (df, test_df)
    for column in ('question1', 'question2')
)

In [None]:
#X_train_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_train.pickle')
#X_train_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_train.pickle')

In [None]:
#X_test_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_test.pickle')
#X_test_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_test.pickle')

In [None]:
# Duplicate labels as a NumPy array, taken from the same frame as the training sequences.
y_train = df['is_duplicate'].values #kg.io.load(project.features_dir + 'y_train.pickle')

In [None]:
# Sequences and labels have been extracted, so drop the frames and trigger a collection.
del df, test_df
gc.collect()

Word embedding properties.

In [None]:
# EMBEDDING_DIM = embedding_matrix.shape[-1]
# VOCAB_LENGTH = embedding_matrix.shape[0]
# Length of every padded input sequence; `create_model` uses it as the input shape.
MAX_SEQUENCE_LENGTH = max_string_length  # X_train_q1.shape[-1]

## Define models

In [None]:
def contrastive_loss(y_true, y_pred):
    """
    Contrastive loss after Hadsell, Chopra & LeCun (2006):
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Where the label is 0 the squared prediction is penalised; where it is 1 the
    squared shortfall of the prediction below the margin (fixed at 1) is penalised.
    The two terms are summed and averaged over all elements.
    """
    margin = 1
    shortfall = K.maximum(margin - y_pred, 0)
    zero_label_term = (1 - y_true) * K.square(y_pred)
    one_label_term = y_true * K.square(shortfall)
    return K.mean(zero_label_term + one_label_term)

In [None]:
class AttentionWithContext(Layer):
    """
    Attention pooling over the time axis of a sequence. Supports masking.

    Inspired by Yang et al., "Hierarchical Attention Networks for Document
    Classification" [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf].
    Each time step is projected to a scalar score, a per-step bias is added, the
    result goes through `tanh`, is scaled by a per-step context weight and
    exponentiated. The scores are normalised over the time axis and used to form a
    weighted sum of the inputs.

    Because the bias and context weights have one entry per time step, the number
    of steps must be known when the layer is built.

    # Arguments
        init: initializer for the projection kernel and the context weights.
        kernel_regularizer: regularizer for the kernel and the context weights.
        bias_regularizer: regularizer for the bias.
        kernel_constraint: constraint for the kernel and the context weights.
        bias_constraint: constraint for the bias.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
    """

    def __init__(self, init='glorot_uniform',
                 kernel_regularizer=None, bias_regularizer=None,
                 kernel_constraint=None, bias_constraint=None, **kwargs):
        # Initialise the base layer first: its constructor assigns the default
        # attributes (including `supports_masking`), so anything set before this
        # call may be overwritten.
        super(AttentionWithContext, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get(init)
        # Honour the `init` argument rather than a hard-coded initializer.
        self.kernel_initializer = self.init

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

    def build(self, input_shape):
        """Create the kernel `(features, 1)`, bias `(steps,)` and context weights `(steps,)`."""
        steps, features = input_shape[1], input_shape[-1]
        # `shape` is passed by keyword so the call does not depend on the position
        # of `name`/`shape` in `add_weight`'s signature.
        self.kernel = self.add_weight(
            shape=(features, 1),
            initializer=self.kernel_initializer,
            name='{}_W'.format(self.name),
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint
        )
        self.b = self.add_weight(
            shape=(steps,),
            initializer='zeros',
            name='{}_b'.format(self.name),
            regularizer=self.bias_regularizer,
            constraint=self.bias_constraint
        )
        self.u = self.add_weight(
            shape=(steps,),
            initializer=self.kernel_initializer,
            name='{}_u'.format(self.name),
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint
        )
        self.built = True

    def compute_mask(self, input, mask=None):
        """The time axis is collapsed by this layer, so no mask is passed downstream."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the time axis."""
        scores = K.dot(x, self.kernel)        # (samples, steps, features) . (features, 1) => (samples, steps, 1)
        scores = K.squeeze(scores, -1)        # (samples, steps)
        scores = K.tanh(scores + self.b)      # per-step bias, broadcast over samples

        # Element-wise scaling by the per-step context weights; the shape stays (samples, steps).
        weights = K.exp(scores * self.u)

        # Apply the mask after the exp; the weights are re-normalised next.
        if mask is not None:
            weights = weights * K.cast(mask, K.floatx())

        # The sum can be close to zero (e.g. early in training or with a fully masked
        # row), which would produce NaNs; a small epsilon keeps the division finite.
        weights = weights / K.cast(K.sum(weights, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weights = K.expand_dims(weights)      # (samples, steps, 1)
        return K.sum(x * weights, axis=1)     # (samples, features)

    def compute_output_shape(self, input_shape):
        """Drop the time axis: `(samples, steps, features)` -> `(samples, features)`."""
        return (input_shape[0], input_shape[-1],)

    def get_config(self):
        """Serialise the constructor arguments so that a model using this layer can be saved."""
        config = {
            'init': initializers.serialize(self.kernel_initializer),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint),
            'bias_constraint': constraints.serialize(self.bias_constraint),
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def create_model(params):
    """
    Build and compile the Siamese LSTM-with-attention classifier.

    Both questions go through the same embedding, LSTM and attention layers
    (shared weights). The two encodings are concatenated and passed through
    dropout / batch-norm / dense blocks to a single sigmoid output.

    Args:
        params: dict with the keys `num_lstm`, `lstm_dropout_rate`, `num_dense`
            and `dense_dropout_rate`.

    Returns:
        A compiled Keras `Model` taking `[question1_indices, question2_indices]`.
    """
    # Build a fresh embedding layer for every model. The fold loop calls
    # `K.clear_session()` after each fold, which discards the graph that an
    # already-built layer's weights live in, so a layer created once at module
    # level cannot be reused for the next fold's model. A fresh layer also
    # guarantees that no embedding state is carried from one fold to the next.
    embedding = w2v_model.wv.get_keras_embedding()

    lstm_layer = LSTM(
        params['num_lstm'],
        dropout=params['lstm_dropout_rate'],
        recurrent_dropout=params['lstm_dropout_rate'],
        return_sequences=True,  # the attention layer needs the full sequence
    )
    attention_layer = AttentionWithContext()

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding(sequence_1_input)
    x1 = attention_layer(lstm_layer(embedded_sequences_1))

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding(sequence_2_input)
    y1 = attention_layer(lstm_layer(embedded_sequences_2))

    merged = concatenate([x1, y1])
    merged = Dropout(params['dense_dropout_rate'])(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(params['num_dense'], activation='relu')(merged)
    merged = Dropout(params['dense_dropout_rate'])(merged)
    merged = BatchNormalization()(merged)

    output = Dense(1, activation='sigmoid')(merged)

    model = Model(
        inputs=[sequence_1_input, sequence_2_input],
        outputs=output
    )

    model.compile(
        loss=contrastive_loss,
        optimizer='nadam',
        metrics=['accuracy']
    )

    return model

In [None]:
def predict(model, X_q1, X_q2):
    """
    Score every pair in both orders and return the mean of the two passes.

    `model.predict` is called once with `[X_q1, X_q2]` and once with the inputs
    swapped; each result is flattened to 1-D before the two are averaged.
    """
    forward, mirrored = (
        model.predict(list(pair), batch_size=1024, verbose=1).reshape(-1)
        for pair in ((X_q1, X_q2), (X_q2, X_q1))
    )
    return (forward + mirrored) / 2

## Partition the data

In [None]:
# Number of stratified folds; also the number of per-fold test prediction columns.
NUM_FOLDS = 5

In [None]:
# Shuffled, label-stratified splitter; the fixed seed keeps the fold assignment repeatable.
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)

Create placeholders for out-of-fold predictions.

In [None]:
# One float slot per training row; each fold fills in the rows of its validation split.
y_train_oofp = np.zeros_like(y_train, dtype='float64')

In [None]:
# One column of test-set predictions per fold.
y_test_oofp = np.zeros((len(X_test_q1), NUM_FOLDS))

## Define hyperparameters

In [None]:
# Mini-batch size passed to `model.fit`.
BATCH_SIZE = 2048

In [None]:
# Upper bound on training epochs per fold; early stopping may end training sooner.
MAX_EPOCHS = 200

Best values picked by Bayesian optimization.

In [None]:
# Hyperparameters consumed by `create_model`.
model_params = {
    'dense_dropout_rate': 0.164,  # dropout rate before and after the dense layer
    'lstm_dropout_rate': 0.324,   # used for both input and recurrent dropout of the LSTM
    'num_dense': 132,             # units in the dense layer
    'num_lstm': 254,              # units in the LSTM
}

The path where the best weights of the current model will be saved.

In [None]:
# Identifier of the feature produced by this notebook; also part of the checkpoint file name.
# Neither `project` nor `feature_list_id` is provided by any import or earlier cell,
# so both the id and the path are defined explicitly here.
feature_list_id = 'oofp_nn_siamese_lstm_attention'
# The checkpoint is written to the output directory and overwritten by every fold.
model_checkpoint_path = os.path.join(OUT_PATH, 'fold-checkpoint-' + feature_list_id + '.h5')

## Fit the folds and compute out-of-fold predictions

In [None]:
%%time

# Iterate through folds.
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_q1, y_train)):
    
    # Augment the training set by mirroring the pairs.
    X_fold_train_q1 = np.vstack([X_train_q1[ix_train], X_train_q2[ix_train]])
    X_fold_train_q2 = np.vstack([X_train_q2[ix_train], X_train_q1[ix_train]])

    X_fold_val_q1 = np.vstack([X_train_q1[ix_val], X_train_q2[ix_val]])
    X_fold_val_q2 = np.vstack([X_train_q2[ix_val], X_train_q1[ix_val]])

    # Ground truth should also be "mirrored".
    y_fold_train = np.concatenate([y_train[ix_train], y_train[ix_train]])
    y_fold_val = np.concatenate([y_train[ix_val], y_train[ix_val]])
    
    print()
    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
    print()
    
    # Compile a new model.
    model = create_model(model_params)

    # Train.
    model.fit(
        [X_fold_train_q1, X_fold_train_q2], y_fold_train,
        validation_data=([X_fold_val_q1, X_fold_val_q2], y_fold_val),

        batch_size=BATCH_SIZE,
        epochs=MAX_EPOCHS,
        verbose=1,
        
        callbacks=[
            # Stop training when the validation loss stops improving.
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.001,
                patience=3,
                verbose=1,
                mode='auto',
            ),
            # Save the weights of the best epoch.
            ModelCheckpoint(
                model_checkpoint_path,
                monitor='val_loss',
                save_best_only=True,
                verbose=2,
            ),
        ],
    )
        
    # Restore the best epoch.
    model.load_weights(model_checkpoint_path)
    
    # Compute out-of-fold predictions.
    y_train_oofp[ix_val] = predict(model, X_train_q1[ix_val], X_train_q2[ix_val])
    y_test_oofp[:, fold_num] = predict(model, X_test_q1, X_test_q2)
    
    # Clear GPU memory.
    K.clear_session()
    del X_fold_train_q1
    del X_fold_train_q2
    del X_fold_val_q1
    del X_fold_val_q2
    del model
    gc.collect()

In [None]:
# Log loss of the out-of-fold predictions against the training labels.
cv_score = log_loss(y_train, y_train_oofp)
print('CV score:', cv_score)

## Save features

In [None]:
# Out-of-fold predictions as a single-column 2-D array.
features_train = y_train_oofp.reshape((-1, 1))

In [None]:
# Average the per-fold test predictions into a single-column 2-D array.
features_test = np.mean(y_test_oofp, axis=1).reshape((-1, 1))

In [None]:
# Sanity check: both feature arrays should have exactly one column.
print('X train:', features_train.shape)
print('X test: ', features_test.shape)

In [None]:
# Single feature column, named after the feature list id.
# NOTE(review): requires `feature_list_id` to be defined by an earlier cell — verify before running.
feature_names = [feature_list_id]

In [None]:
# `project` is not defined in this notebook, so persist the features explicitly
# with pandas: one CSV per split in the output directory, named after the feature.
for split_name, split_features in (('train', features_train), ('test', features_test)):
    pd.DataFrame(split_features, columns=feature_names).to_csv(
        os.path.join(OUT_PATH, 'features_{}_{}.csv'.format(split_name, feature_names[0])),
        index=False,
    )

## Explore

In [None]:
# Distribution of the fold-averaged test predictions; labelled so the figure stands
# on its own, and terminated with `;` to suppress the Axes repr.
ax = pd.DataFrame(features_test, columns=['prediction']).plot.hist(legend=False)
ax.set(
    title='Fold-averaged test predictions',
    xlabel='Predicted duplicate probability',
    ylabel='Number of question pairs',
);