In [None]:
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip snli_1.0.zip
!unzip glove.840B.300d.zip

In [None]:
from os.path import join as pjoin, isfile
import json
import numpy as np

# Directory that load_data() reads the snli_1.0_<tier>.jsonl files from
# (presumably the folder produced by unzipping snli_1.0.zip).
TEXT_DATA_DIR = 'snli_1.0'

def load_data(tier):
    """Load one SNLI split and keep only the examples with a usable label.

    Args:
        tier: Split name used to build the file name; the file read is
            ``TEXT_DATA_DIR/snli_1.0_<tier>.jsonl`` (one JSON object per line).

    Returns:
        A ``(premise, hypothesis, label)`` tuple of three equally long lists
        holding the ``sentence1``, ``sentence2`` and ``gold_label`` fields of
        every kept example, in file order.

    Side effects:
        Prints the number of kept examples and the per-class counts.
    """
    premise = []
    hypothesis = []
    label = []

    path = pjoin(TEXT_DATA_DIR, 'snli_1.0_' + tier + '.jsonl')
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # encoding; iterating the handle streams the file line by line instead of
    # loading the whole split into memory first.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            d = json.loads(line)
            # '-' is not one of the three class names used downstream, so
            # such entries are dropped.
            if d['gold_label'] == '-':
                continue
            premise.append(d['sentence1'])
            hypothesis.append(d['sentence2'])
            label.append(d['gold_label'])

    print('# of', tier, 'samples :', len(label), end=' | ')
    print('Entailment :', label.count('entailment'), '| Neutral :', label.count('neutral'), '| Contradiction :', label.count('contradiction'))
    return (premise, hypothesis, label)

# Read the three SNLI splits, always in the order train, dev, test; each name
# is bound to the (premise, hypothesis, label) tuple returned by load_data().
train, dev, test = (load_data(split) for split in ('train', 'dev', 'test'))

In [None]:
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.layers import LSTM, Input, Dot, Softmax, Multiply, Concatenate, Subtract, Dense, Lambda, Embedding
from keras.layers.wrappers import Bidirectional
from keras.models import Model, load_model

SentenceLen = 100  # number of token ids every sentence is padded/truncated to
WordVecLen = 300   # width of one word vector (the GloVe file used is 300-d)

if not isfile('tokenizer.pickle'):
    # num_words limits the *vocabulary* (only the num_words - 1 most frequent
    # words survive texts_to_sequences); it is unrelated to sentence length.
    # Passing SentenceLen here would keep only 99 distinct words, so no cap is set.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train[0] + train[1] + dev[0] + dev[1] + test[0] + test[1])
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE(security): unpickling runs arbitrary code; only load a tokenizer.pickle
# that this notebook wrote itself.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# A cache written by an earlier version of this cell may still carry the
# vocabulary cap; clear it so every fitted word maps to its index.
tokenizer.num_words = None

def PadSeq(text):
    """Convert raw sentences into fixed-width rows of token ids.

    ``text`` is handed to the module-level ``tokenizer``'s
    ``texts_to_sequences``; the resulting id lists are passed through
    ``pad_sequences`` with ``maxlen=SentenceLen`` and that result is returned.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text), maxlen=SentenceLen)

In [None]:
if not isfile('embeddings.npy'):

    word_index = tokenizer.word_index

    # Row i receives the GloVe vector of the word whose tokenizer index is i;
    # rows that are never assigned (words absent from GloVe) stay all-zero.
    embedding_matrix = np.zeros((len(word_index) + 1, WordVecLen))

    # Stream the GloVe file and copy only the vectors of words the tokenizer
    # knows straight into the matrix. This avoids holding the complete GloVe
    # table in a dict that would stay alive for the rest of the session, and
    # the `with` block closes the file even if parsing fails.
    with open('glove.840B.300d.txt', encoding='utf8') as f:
        for line in f:
            values = line.split()
            # The last WordVecLen fields are the vector; whatever precedes
            # them is the word (which may itself have been split on spaces).
            word = ' '.join(values[:-WordVecLen])
            row = word_index.get(word)
            if row is None:
                continue
            embedding_matrix[row] = np.asarray(values[-WordVecLen:], dtype='float32')

    np.save('embeddings.npy', embedding_matrix)

def load_embeddings():
    """Create the non-trainable Embedding layer from the cached matrix.

    Reads 'embeddings.npy', uses it as the layer's initial weights and sizes
    the layer as ``len(tokenizer.word_index) + 1`` rows by ``WordVecLen``
    columns, for inputs of length ``SentenceLen``.

    Returns:
        The configured ``Embedding`` layer (``trainable=False``).
    """
    pretrained = np.load('embeddings.npy')
    vocab_rows = len(tokenizer.word_index) + 1
    return Embedding(
        vocab_rows,
        WordVecLen,
        weights=[pretrained],
        input_length=SentenceLen,
        trainable=False,
    )


# One Embedding layer instance; the model applies this same layer to both inputs.
embedding_layer = load_embeddings()

In [None]:
def labelToVec(labels):
    """One-hot encode a sequence of SNLI class names.

    The column order is entailment, contradiction, neutral.

    Args:
        labels: Iterable of label strings.

    Returns:
        ``np.array`` built from one 3-element float row per label.

    Raises:
        ValueError: If a label is not one of the three known class names.
    """
    classes = ('entailment', 'contradiction', 'neutral')
    rows = []
    for name in labels:
        if name not in classes:
            raise ValueError('Unknown label %s' % (name))
        hot = classes.index(name)
        rows.append([1.0 if col == hot else 0.0 for col in range(len(classes))])
    return np.array(rows)

# Encode every split: x is [premise ids, hypothesis ids] (one padded matrix
# per model input), y is the one-hot label matrix.
train_x, train_y = [PadSeq(train[0]), PadSeq(train[1])], labelToVec(train[2])
dev_x, dev_y = [PadSeq(dev[0]), PadSeq(dev[1])], labelToVec(dev[2])
test_x, test_y = [PadSeq(test[0]), PadSeq(test[1])], labelToVec(test[2])

# The raw text and the tokenizer are not referenced by the cells below;
# dropping the names presumably serves to release their memory. Note that
# PadSeq() and load_embeddings() cannot be called again after this point.
del train, dev, test, tokenizer

In [None]:
def expand_rep(x, r, a):
    """Insert a new axis at position ``a`` and repeat the tensor ``r`` times along it.

    Args:
        x: Backend tensor.
        r: Number of repetitions along the new axis.
        a: Index at which the new axis is inserted.

    Returns:
        The result of ``K.repeat_elements`` applied to the expanded tensor.
    """
    return K.repeat_elements(K.expand_dims(x, axis=a), r, axis=a)

# Two encoders, each applied to BOTH sentences, so premise and hypothesis
# share weights at every stage.
bilstm1 = Bidirectional(LSTM(300, return_sequences=True))
bilstm2 = Bidirectional(LSTM(300, return_sequences=True))

# Inputs are padded token-id sequences, hence an integer dtype.
i1 = Input(shape=(SentenceLen,), dtype='int32')  # premise ids
i2 = Input(shape=(SentenceLen,), dtype='int32')  # hypothesis ids

x1 = embedding_layer(i1)
x2 = embedding_layer(i2)

x1 = bilstm1(x1)
x2 = bilstm1(x2)

# e[b, i, j] = <x1[b, i], x2[b, j]>: similarity of every premise position i
# with every hypothesis position j.
e = Dot(axes=2)([x1, x2])
e1 = Softmax(axis=2)(e)  # per premise position: weights over hypothesis positions
e2 = Softmax(axis=1)(e)  # per hypothesis position: weights over premise positions

# Attention-weighted summaries as batched matrix products:
#   _x1[b, i] = sum_j e1[b, i, j] * x2[b, j]
#   _x2[b, j] = sum_i e2[b, i, j] * x1[b, i]
# This is the same result as tiling both operands to
# (batch, SentenceLen, SentenceLen, features), multiplying and summing, but
# without materialising those 4-D tensors, and without hard-coding the
# feature width of the encoder output.
_x1 = Dot(axes=(2, 1))([e1, x2])
_x2 = Dot(axes=(1, 1))([e2, x1])

# Combine each encoding with its aligned summary: both vectors, their
# difference and their element-wise product.
m1 = Concatenate()([x1, _x1, Subtract()([x1, _x1]), Multiply()([x1, _x1])])
m2 = Concatenate()([x2, _x2, Subtract()([x2, _x2]), Multiply()([x2, _x2])])

y1 = bilstm2(m1)
y2 = bilstm2(m2)

# Collapse the sequence axis with max and mean pooling.
mx1 = Lambda(K.max, arguments={'axis' : 1})(y1)
av1 = Lambda(K.mean, arguments={'axis' : 1})(y1)
mx2 = Lambda(K.max, arguments={'axis' : 1})(y2)
av2 = Lambda(K.mean, arguments={'axis' : 1})(y2)

# Classifier head ending in a 3-way softmax (one unit per label class).
y = Concatenate()([av1, mx1, av2, mx2])
y = Dense(1024, activation='tanh')(y)
y = Dense(1024, activation='tanh')(y)
y = Dense(3, activation='softmax')(y)

model = Model(inputs=[i1, i2], outputs=y)
model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, evaluating on the dev split after each one.
model.fit(
    x=train_x,
    y=train_y,
    validation_data=(dev_x, dev_y),
    epochs=10,
)

In [None]:
# evaluate() returns the loss followed by the compiled metrics; the first two
# entries are reported as loss and accuracy.
score = model.evaluate(test_x, test_y, verbose=1)
for _caption, _value in zip(('Test loss : ', 'Test accuracy : '), score):
    print(_caption, _value)

In [None]:
# Persist only the weights (not the architecture) to 'NLI.h5'; the model graph
# has to be rebuilt in code before these weights can be loaded again.
model.save_weights('NLI.h5')