In [21]:
from tensorflow.python.framework.ops import disable_eager_execution, enable_eager_execution
enable_eager_execution()

import os
import numpy as np
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import seaborn as sns

# tensorflow imports
from tensorflow import int32
from tensorflow.keras import regularizers
from tensorflow.keras.backend import placeholder
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Flatten, Softmax
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Activation, Add
import tensorflow.keras.backend as K
import tensorflow as tf
import re
import nltk
from gensim.models import word2vec

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

# Pick a distribution strategy: use a TPU when one can be resolved, otherwise
# fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # No TPU could be resolved (signalled here by ValueError).
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. This used to be
# indented under `except`, so it was silent whenever a TPU was found.
print('Number of replicas:', strategy.num_replicas_in_sync)


In [22]:
# Read the competition's train and test splits, then show the training row count.
train_path = "../input/contradictory-my-dear-watson/train.csv"
test_path = "../input/contradictory-my-dear-watson/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)
len(train_df)

In [23]:
# Keep only the English rows. `.copy()` makes each result an independent frame,
# so later column assignments on train_df / test_df do not write into a slice of
# the original frame (which triggers pandas' SettingWithCopyWarning).
train_df = train_df[train_df['language'] == 'English'].copy()
test_df = test_df[test_df['language'] == 'English'].copy()
len(train_df)

In [24]:
# Working copy of the two text columns; cleaning is applied to this frame so
# that train_df keeps the raw sentences.
temp = train_df[['premise', 'hypothesis']].copy()

In [25]:
# Download the NLTK 'stopwords' resource; the next cell reads it through
# nltk.corpus.stopwords to build STOP_WORDS.
import nltk
nltk.download('stopwords')

In [26]:
# Stop words are stored in a set so the per-token `word in STOP_WORDS` check in
# clean_sentence is a hash lookup rather than a linear scan of a list.
# NOTE(review): words() is called without a language argument, which presumably
# returns the stop words of every installed language - confirm, and consider
# words('english') since the data is filtered to English.
STOP_WORDS = set(nltk.corpus.stopwords.words())

# Matches runs of characters that are neither whitespace nor word characters,
# plus underscores. Compiled once instead of on every call, and written as a raw
# string so `\s` / `\w` are not treated as (invalid) string escapes.
_NON_WORD_PATTERN = re.compile(r'([^\s\w]|_)+')


def clean_sentence(val, stop_words=None):
    """Normalise a sentence for Word2Vec training.

    Removes punctuation and underscores, lower-cases the text, splits it on
    single spaces and drops every token that is a stop word.

    Args:
        val: The raw sentence (a string).
        stop_words: Optional container of tokens to drop. Defaults to the
            module-level ``STOP_WORDS``.

    Returns:
        The remaining tokens joined by single spaces. Because the split is on
        single spaces, runs of spaces in the input yield empty tokens, which
        are kept unless the empty string is itself a stop word.
    """
    if stop_words is None:
        stop_words = STOP_WORDS

    words = _NON_WORD_PATTERN.sub('', val).lower().split(" ")
    # One filtering pass replaces the repeated list.remove() calls, which
    # rescanned the list for every stop word found.
    return " ".join(word for word in words if word not in stop_words)

# Clean both text columns of the working frame in place.
for text_column in ('premise', 'hypothesis'):
    temp[text_column] = temp[text_column].apply(clean_sentence)


In [27]:
def build_corpus(data):
    """Collect the tokenised sentences of both text columns.

    Args:
        data: DataFrame with string columns 'premise' and 'hypothesis'.

    Returns:
        A list of token lists: every premise (split on single spaces) followed
        by every hypothesis, each in row order.
    """
    # Iterate the column values directly; Series.iteritems() no longer exists
    # in pandas 2.x, and the index half of each pair was never used.
    return [
        sentence.split(" ")
        for col in ('premise', 'hypothesis')
        for sentence in data[col]
    ]

# Token lists for every cleaned premise and hypothesis in `temp`; this is the
# input the Word2Vec encoder is trained on below.
corpus = build_corpus(temp)        


In [28]:
def tsne_plot(model, word_limit=100):
    """Create a t-SNE projection of the model's word vectors and plot it.

    Takes the first `word_limit` vocabulary entries of `model.wv`, reduces their
    vectors to two dimensions, and draws one annotated scatter point per word.
    """
    vocabulary = list(model.wv.key_to_index)[:word_limit]
    vectors = [model.wv[word] for word in vocabulary]

    reducer = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    embedded = reducer.fit_transform(vectors)

    plt.figure(figsize=(16, 16))
    # One scatter call per word, each followed by its label.
    for label, point in zip(vocabulary, embedded):
        px, py = point[0], point[1]
        plt.scatter(px, py)
        plt.annotate(label,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()


In [29]:
# Train the Word2Vec embedding on the cleaned corpus. vector_size=100 is the
# per-word embedding width that the model's Input layers expect further down.
# NOTE(review): no seed is passed and workers=8, so the vectors presumably differ
# between runs - confirm whether reproducibility matters here.
encoder = word2vec.Word2Vec(corpus, vector_size=100, window=20, min_count=2, workers=8)

In [30]:
# Visualise the first 1000 vocabulary entries of the trained encoder in 2-D.
tsne_plot(encoder, word_limit=1000)

In [31]:
# Vocabulary coverage: mark each corpus token with 1 if the encoder has a vector
# for it and 0 otherwise, then print "<known>/<total>".
result = [1 if word in encoder.wv else 0 for word in np.concatenate(corpus)]

known_tokens, total_tokens = sum(result), len(result)
print("%d/%d" % (known_tokens, total_tokens))

def encode_f(sentence):
    """Turn a raw sentence into the list of its known word vectors.

    The sentence goes through `clean_sentence` first, i.e. the same
    normalisation (punctuation removal, lower-casing, stop-word removal) used
    on the corpus the encoder was trained on. Without it, tokens with attached
    punctuation such as "board." could never match the vocabulary and were
    silently dropped.

    Args:
        sentence: The raw sentence (a string).

    Returns:
        A list with one vector from `encoder.wv` per token the encoder knows,
        in sentence order. Unknown tokens are skipped, so the list may be empty.
    """
    words = clean_sentence(sentence).split()
    return [encoder.wv[word] for word in words if word in encoder.wv]

# Replace the raw text in both splits with lists of word vectors.
# Note: this overwrites the text columns, so the cell cannot be re-run without
# reloading the data first.
for frame in (train_df, test_df):
    for text_column in ('premise', 'hypothesis'):
        frame[text_column] = frame[text_column].apply(encode_f)

In [32]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad/truncate every vector sequence to 70 steps with float entries.
pad_options = dict(dtype='float', maxlen=70)

train_x1 = pad_sequences(train_df.premise.to_numpy(), **pad_options)
train_x2 = pad_sequences(train_df.hypothesis.to_numpy(), **pad_options)
train_y = train_df.label.to_numpy()

test_x1 = pad_sequences(test_df.premise.to_numpy(), **pad_options)
test_x2 = pad_sequences(test_df.hypothesis.to_numpy(), **pad_options)

In [33]:
# Sanity check: show the first premise before and after padding.
first_premise_vectors = train_df.premise.to_numpy()[0]
first_premise_padded = train_x1[0]
print(first_premise_vectors)
print(first_premise_padded)

For the actual LSTM implementation, we will use the Siamese BiLSTM network model described in [this](https://github.com/GKarmakar/siamese-lstm/blob/master/docs/W16-1617.pdf) paper by Paul Neculoiu, Maarten Versteegh and Mihai Rotaru. A base implementation by GKarmakar can also be found [here](https://github.com/GKarmakar/deep-siamese-text-similarity/blob/master/siamese_network_semantic.py). While we base our approach on these two previous works, we propose a number of adjustments to the model and also define a more abstract setup to allow fine-tuning and exploration.

First, here is the `SiameseBiLSTM` implementation. This is the core of the model: it consists of the two BiLSTM RNNs that parse the hypothesis and the premise respectively, together with the energy function that produces the final output from the RNN outputs.

In [34]:
class SiameseBiLSTM:
    """Recurrent building blocks and input placeholders for the siamese network.

    `args` is a dict from which this class reads:
      * 'num_units': list of layer widths. Every entry except the last sizes one
        stacked bidirectional LSTM layer; the last entry sizes `top_layer`.
      * 'max_sentence_size': second dimension of the sentence placeholders.

    Attributes set on construction:
      * ff_layer: list of Bidirectional(LSTM) layers, in stacking order.
      * top_layer: a plain LSTM layer.
      * input_p1, input_p2, input_y: int32 placeholders.
        NOTE(review): nothing in the visible code reads these placeholders -
        confirm whether they are still needed.
    """
    args = None
    ff_layer = None

    def __init__(self, args):
        self.args = args
        self.define_bilstm()

        self.input_p1 = placeholder(dtype=int32, shape=[None, args['max_sentence_size']], name='input_p1')
        self.input_p2 = placeholder(dtype=int32, shape=[None, args['max_sentence_size']], name='input_p2')
        self.input_y = placeholder(dtype=int32, shape=[None], name='input_y')

    def define_bilstm(self):
        """
        Defines the BiLSTM stack that will be used to handle one of the input sentences. The final
        model applies it to both the premise and the hypothesis.
        """

        # One bidirectional LSTM per entry of args["num_units"] except the last,
        # each returning full sequences so the layers can be stacked. Dropout and
        # kernel regularisation are not enabled on these layers.
        self.ff_layer = [
            Bidirectional(
                LSTM(num_units,
                     activation='tanh',
                     recurrent_activation='sigmoid',
                     return_sequences=True))
            for num_units in self.args["num_units"][:-1]
        ]

        # Final unidirectional LSTM, sized by the last entry of args["num_units"].
        self.top_layer = LSTM(self.args["num_units"][-1],
                              activation='hard_sigmoid',
                              kernel_regularizer=regularizers.l2(0.03),
                              return_sequences=True)

    @staticmethod
    def energy_function(a, b):
        """Cosine similarity of `a` and `b`.

        Returns 0.0 when either input has zero norm, instead of dividing by
        zero and producing NaN.
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return np.dot(a, b) / denominator


# Configuration for the recurrent stack: three stacked BiLSTM widths followed by
# the top-layer width, plus the padded sentence length.
siamese_config = {
    'num_units': [512, 256, 128, 128],
    'ff_layers': 3,
    'max_sentence_size': 70
}
siamese_bilstm = SiameseBiLSTM(siamese_config)



**TODO:** complete dropout, the train/test shuffler and L2 optimization.

Finally, here is the model that puts all of these components together so that it can be trained and tested.

In [None]:
class Diff(Add):
    """Merge layer that sums the absolute difference of its first two inputs over the last axis."""

    def _merge_function(self, inputs):
        first, second = inputs[0], inputs[1]
        return K.sum(K.abs(second - first), axis=-1, keepdims=True)

    def compute_output_shape(self, input_shape):
        """Return (batch, 1); batch is None unless exactly one concrete batch size is seen."""
        known_batch_sizes = {shape[0] for shape in input_shape if shape is not None}
        known_batch_sizes.discard(None)

        batch = next(iter(known_batch_sizes)) if len(known_batch_sizes) == 1 else None
        return (batch, 1)


class CosineDist(Diff):
    """Merge layer computing the cosine distance (1 - cosine similarity) over the last axis."""

    def _merge_function(self, inputs):
        left, right = inputs[0], inputs[1]

        dot_product = K.sum(left * right, keepdims=True, axis=-1)
        left_norm = K.sqrt(K.sum(K.square(left), keepdims=True, axis=-1))
        right_norm = K.sqrt(K.sum(K.square(right), keepdims=True, axis=-1))
        # Keep the denominator away from zero so all-zero inputs do not give NaN.
        norm_product = K.clip(left_norm * right_norm, min_value=1e-4, max_value=float('inf'))

        similarity = dot_product / norm_product
        return K.ones_like(similarity) - similarity

    def _compute_elemwise_op_output_shape(self, shape1, shape2):
        return (None, 1)

    def compute_output_shape(self, input_shape):
        return (None, 1)
    
def mean_rectified_infinity_loss(y_true, y_pred):
    """Loss that pushes predictions to 0 for zero targets and away from 0 otherwise.

    Element-wise, the error is y_pred**2 where y_true == 0, and
    k / max(y_pred, epsilon)**2 elsewhere, with k = 5.0. The errors are then
    averaged over the last axis.

    Only the TensorFlow path is kept: `K` is tensorflow.keras.backend in this
    notebook, so the former theano branch could never run.
    """
    k = 5.0

    is_zero_target = K.equal(y_true, K.zeros_like(y_true))
    # Bound the prediction from below so the reciprocal term stays finite.
    bounded_pred = K.clip(y_pred, min_value=K.epsilon(), max_value=float('inf'))
    err = tf.where(is_zero_target, K.square(y_pred), k / K.square(bounded_pred))

    return K.mean(err, axis=-1)

def contrastive_loss(y, preds, margin=1):
    """Contrastive loss: mean of y * preds**2 + (1 - y) * max(margin - preds, 0)**2.

    `y` is cast to the dtype of `preds` before use.
    """
    labels = tf.cast(y, preds.dtype)

    positive_term = labels * K.square(preds)
    negative_term = (1 - labels) * K.square(K.maximum(margin - preds, 0))
    return K.mean(positive_term + negative_term)

class ContradictoryModel:
    """Siamese sentence-pair classifier built from a shared recurrent encoder.

    Both inputs (premise and hypothesis, each a 70 x 100 array of word vectors)
    pass through the same `Sequential` stack: the layers in `rnn.ff_layer`
    followed by a 128-unit tanh Dense layer. The two encodings are merged with
    `CosineDist`, then passed through ReLU, Flatten, Dense(3) and Softmax.

    Args:
        _encoder: Stored as `self.encoder`; not used by the visible methods.
        data_handler: Stored as `self.data_handler`; not used by the visible methods.
        rnn: Object exposing `ff_layer`, an iterable of Keras layers.
        args: Dict; `fit` reads 'batch_size', 'number_of_epochs' and
            'validation_split'.
    """

    def __init__(self, _encoder, data_handler, rnn, args):
        self.args = args
        self.encoder = _encoder
        self.data_handler = data_handler
        self.rnn = rnn
        self.optimizer = None
        # The network ends in a 3-way softmax, so it is trained as a multi-class
        # classifier. The contrastive loss used before is defined for binary
        # labels and a single distance output.
        # Assumes the targets are integer class ids in [0, 3) - TODO confirm.
        self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy()
        self.sequence = Sequential()
        additional_metrics = ['accuracy']

        # Shared sentence encoder: the recurrent stack followed by a dense projection.
        for layer in self.rnn.ff_layer:
            self.sequence.add(layer)
        self.sequence.add(Dense(128, activation='tanh', kernel_regularizer=regularizers.l2(0.03)))

        # The same `self.sequence` is applied to both inputs, so the weights are shared.
        p1_in = Input((70, 100), name="p1input")
        p1_input = self.sequence(p1_in)
        p2_in = Input((70, 100), name="p2input")
        p2_input = self.sequence(p2_in)

        merged = CosineDist(name='merge')([p1_input, p2_input])
        out = Activation('relu', name='out')(merged)
        out = Flatten()(out)
        out = Dense(3)(out)
        out = Softmax()(out)

        self.model = Model(inputs=(p1_in, p2_in), outputs=out)

        self.model.compile(loss=self.loss_function, metrics=additional_metrics)
        print("Model compiled successfully.")
        self.sequence.summary()

    def fit(self, x1, x2, y):
        """Train on paired inputs `x1`, `x2` with targets `y`.

        Returns:
            Whatever `self.model.fit` returns (the training history), so callers
            can inspect the loss/metric curves. It used to be discarded.
        """
        history = self.model.fit([x1, x2], y,
                                 batch_size=self.args['batch_size'],
                                 epochs=self.args['number_of_epochs'],
                                 verbose=1, validation_split=self.args['validation_split'])
        return history

    def test(self, x1, x2, y):
        """Evaluate on paired inputs, print loss and accuracy, and return the raw results."""
        test_results = self.model.evaluate([x1, x2], y, verbose=False)
        print(f'Test results - Loss: {test_results[0]} - Accuracy: {100 * test_results[1]}%')
        return test_results

    def predict(self, p1, p2):
        """Encode two raw sentences, print the model's prediction and return it.

        Args:
            p1: Premise sentence (string).
            p2: Hypothesis sentence (string).
        """
        # Encode and pad both sentences together, then add a batch dimension of 1.
        x1, x2 = pad_sequences([encode_f(p1), encode_f(p2)], dtype='float', maxlen=70)
        x1 = x1.reshape([1, 70, 100])
        x2 = x2.reshape([1, 70, 100])

        prediction = self.model.predict([x1, x2])
        print(prediction)
        return prediction



# Training configuration; only batch_size, validation_split and number_of_epochs
# are read by ContradictoryModel.fit.
model_args = {
    'num_distinct_words': 1000,
    'embedding_output_dims': 64,
    'max_sequence_length': 100,
    'batch_size': 128,
    'validation_split': 0.2,
    'number_of_epochs': 1
}
model = ContradictoryModel(encoder, None, siamese_bilstm, model_args)

model.fit(train_x1, train_x2, train_y)

# Smoke-test the trained model on a single premise/hypothesis pair.
sample_premise = "Steps are initiated to allow program board membership to reflect the clienteligible community and include representatives from the funding community, corporations and other partners."
sample_hypothesis = "There's enough room for 35-40 positions on the board."
model.predict(sample_premise, sample_hypothesis)
# model.test(train_x1[5501:], train_x2[5501:], train_y[5501:])
