In [None]:
from nltk.corpus import stopwords
import re
import pandas as pd
import json
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Preprocessing configuration shared by the functions defined below.
MAX_SEQUENCE_LENGTH = 15   # maxlen handed to pad_sequences for every sentence
MAX_VOCAB_SIZE = 50000     # num_words handed to the Keras Tokenizer
EMBEDDING_DIM = 50         # number of columns in the embedding matrix

# Parse the GloVe text file into {token: float32 vector}. Every line holds a
# token followed by the whitespace-separated components of its vector.
word2vec = {}
with open('glove.6B.50d.txt') as glove_file:
    for glove_line in glove_file:
        fields = glove_line.split()
        word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')

# NLTK's English stop-word list, consulted by remove_stop_words.
stop = stopwords.words('english')

def normalize_text(text):
    """Lower-case ``text``, turn ``<br />`` markers into spaces and isolate punctuation.

    Each of the characters ``. " , ( ) ! ? ; :`` is surrounded by a space on
    both sides so that a later whitespace split sees it as its own token.

    Returns the transformed string.
    """
    lowered = text.lower().replace('<br />', ' ')
    return re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", lowered)

def remove_stop_words(text):
    """Remove English stop words from ``text`` and lower-case what remains.

    ``text`` is split on whitespace; every token whose lower-cased form is in
    the module-level ``stop`` list is dropped and the surviving tokens are
    joined back with single spaces.

    The membership test uses the lower-cased token. The previous version
    compared the token as written but emitted it lower-cased, so a capitalised
    stop word such as "The" slipped through as "the".
    """
    # A set makes each membership test constant-time instead of a list scan.
    stop_words = set(stop)
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return " ".join(kept)

def remove_non_ascii(text):
    """Return ``text`` keeping only characters whose code point lies in 32..126.

    Control characters (below 32) and everything above ``~`` (126) are removed.
    """
    return ''.join(ch for ch in text if 32 <= ord(ch) <= 126)

def _clean_sentences(sentences):
    """Run one sentence column (a pandas Series of strings) through the clean-up steps."""
    cleaned = (sentences
               .apply(remove_non_ascii)
               .apply(normalize_text)
               .apply(remove_stop_words))
    # regex=True is spelled out because pandas 2.x treats the pattern as a
    # literal by default, which would silently leave all punctuation in place.
    return cleaned.str.replace(r'[^\w\s]', '', regex=True)


def _prepare_frame(df):
    """Drop rows labelled "-", encode labels as integers and clean both sentence columns.

    Label ids follow the sorted order of the label strings found in ``df``, so
    two splits that contain the same set of labels are encoded identically no
    matter which label happens to appear first in each file.
    """
    # .copy() so the column assignments below act on an independent frame and
    # not on a slice of the caller's DataFrame.
    prepared = df[df.gold_label != "-"].copy()
    label_names = sorted(prepared['gold_label'].unique())
    label_to_index = {name: idx for idx, name in enumerate(label_names)}
    prepared['gold_label'] = prepared['gold_label'].map(label_to_index)
    for column in ('sentence1', 'sentence2'):
        prepared[column] = _clean_sentences(prepared[column])
    return prepared


def _embed_and_flatten(sequences, embedding_matrix):
    """Look up the embedding row of every token id and flatten each sentence to one row.

    ``sequences`` is the 2-D integer array produced by ``pad_sequences``; the
    result has one row per sentence and ``sequence_length * embedding_width``
    columns.
    """
    sequences = np.asarray(sequences)
    embedded = embedding_matrix[sequences]  # fancy indexing: (n, length, width)
    return embedded.reshape(sequences.shape[0],
                            sequences.shape[1] * embedding_matrix.shape[1])


def _one_hot_labels(labels, num_classes=3):
    """Turn integer class ids into a (len(labels), num_classes) one-hot float matrix."""
    labels = np.asarray(labels, dtype=np.int64)
    encoded = np.zeros((labels.shape[0], num_classes))
    encoded[np.arange(labels.shape[0]), labels] = 1
    return encoded


def _save_array(path, array):
    """Write ``array`` to ``path`` in NumPy's .npy format (the path is used verbatim)."""
    # np.save emits bytes, so the handle has to be opened in binary mode.
    with open(path, 'wb') as outfile:
        np.save(outfile, array)


def _write_split(df, filename, tokenizer, embedding_matrix):
    """Tokenize, pad, embed and save both sentence columns plus the one-hot labels."""
    sentence1 = pad_sequences(tokenizer.texts_to_sequences(df['sentence1']),
                              maxlen=MAX_SEQUENCE_LENGTH)
    sentence2 = pad_sequences(tokenizer.texts_to_sequences(df['sentence2']),
                              maxlen=MAX_SEQUENCE_LENGTH)

    _save_array('sentence1_embedd_%s' % filename,
                _embed_and_flatten(sentence1, embedding_matrix))
    _save_array('sentence2_embedd_%s' % filename,
                _embed_and_flatten(sentence2, embedding_matrix))
    _save_array('label_%s' % filename, _one_hot_labels(df['gold_label']))


def process_train_data(df, filename):
    """Fit the tokenizer on the training split and write its embedded arrays to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``gold_label``, ``sentence1`` and ``sentence2``.
        Rows whose label is "-" are discarded. The caller's frame is not modified.
    filename : str
        Suffix of the three output files ``sentence1_embedd_<filename>``,
        ``sentence2_embedd_<filename>`` and ``label_<filename>``.

    Returns
    -------
    (tokenizer, embedding_matrix)
        The fitted Keras ``Tokenizer`` and the matrix whose row ``i`` holds the
        GloVe vector of the word with tokenizer index ``i`` (zeros when the
        word is missing from ``word2vec``). Pass both to ``process_test_data``.
    """
    df = _prepare_frame(df)

    # Fit on every sentence individually. Adding the two ``.values`` arrays
    # would concatenate the strings element-wise and glue the last word of
    # sentence1 onto the first word of sentence2.
    texts = list(df['sentence1']) + list(df['sentence2'])
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(texts)
    word2idx = tokenizer.word_index

    num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word2idx.items():
        if i < MAX_VOCAB_SIZE:
            embedding_vector = word2vec.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    _write_split(df, filename, tokenizer, embedding_matrix)
    return tokenizer, embedding_matrix


def process_test_data(df, filename, tokenizer, embedding_matrix):
    """Write the embedded arrays of a held-out split using the training vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``gold_label``, ``sentence1`` and ``sentence2``.
        Rows whose label is "-" are discarded. The caller's frame is not modified.
    filename : str
        Suffix of the three output files (same naming as ``process_train_data``).
    tokenizer, embedding_matrix
        The pair returned by ``process_train_data``; neither is refitted here.
    """
    df = _prepare_frame(df)
    _write_split(df, filename, tokenizer, embedding_matrix)

In [None]:
filename = 'train'
# The file holds one JSON object per line; parse them all before it is closed.
with open(filename) as source:
    records = [json.loads(row) for row in source]
data = pd.DataFrame(records)
df = data[['gold_label', 'sentence1', 'sentence2']]
# The returned tokenizer and embedding matrix are reused for the other splits.
tokenizer, embedding_matrix = process_train_data(df, filename)

In [None]:
filename = 'test'
# The file holds one JSON object per line; parse them all before it is closed.
with open(filename) as source:
    records = [json.loads(row) for row in source]
data = pd.DataFrame(records)
df = data[['gold_label', 'sentence1', 'sentence2']]
# Uses the tokenizer and embedding matrix produced from the training split.
process_test_data(df, filename, tokenizer, embedding_matrix)

In [None]:
filename = 'dev'
# The file holds one JSON object per line; parse them all before it is closed.
with open(filename) as source:
    records = [json.loads(row) for row in source]
data = pd.DataFrame(records)
df = data[['gold_label', 'sentence1', 'sentence2']]
# Uses the tokenizer and embedding matrix produced from the training split.
process_test_data(df, filename, tokenizer, embedding_matrix)

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

In [None]:
# Arrays written by process_train_data: one-hot labels and flattened embeddings.
train_labels, sentence1_train, sentence2_train = (
    np.load(path)
    for path in ('./label_train', './sentence1_embedd_train', './sentence2_embedd_train')
)

In [None]:
# Arrays written by process_test_data for the 'test' split.
test_labels, sentence1_test, sentence2_test = (
    np.load(path)
    for path in ('./label_test', './sentence1_embedd_test', './sentence2_embedd_test')
)

In [None]:
# Arrays written by process_test_data for the 'dev' split.
dev_labels, sentence1_dev, sentence2_dev = (
    np.load(path)
    for path in ('./label_dev', './sentence1_embedd_dev', './sentence2_embedd_dev')
)

In [None]:
def getbatch(batch_size, sentence1_data=None, sentence2_data=None, labels=None):
    """Yield ``(sentence1, sentence2, labels)`` mini-batches forever, cycling over the data.

    Parameters
    ----------
    batch_size : int
        Number of examples per batch; must be positive.
    sentence1_data, sentence2_data, labels : sequence, optional
        Parallel, sliceable containers. Each one defaults to the notebook-level
        ``sentence1_train``, ``sentence2_train`` and ``train_labels``.

    Yields
    ------
    tuple
        Slices of the three containers covering the same index range. The last
        batch of every pass may be shorter than ``batch_size``.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if ``batch_size`` is not positive or
        ``labels`` is empty.

    The previous version iterated over ``len // batch_size`` full batches only.
    It never served the tail of the data and, when there were fewer examples
    than ``batch_size``, looped forever without yielding anything.
    """
    if sentence1_data is None:
        sentence1_data = sentence1_train
    if sentence2_data is None:
        sentence2_data = sentence2_train
    if labels is None:
        labels = train_labels

    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    total = len(labels)
    if total == 0:
        raise ValueError("cannot build batches from an empty dataset")

    while True:
        for start in range(0, total, batch_size):
            end = start + batch_size
            yield (sentence1_data[start:end], sentence2_data[start:end], labels[start:end])

In [None]:
# Sentence encoder (nn1) layer widths: input, hidden, output.
# Alternative noted in the original cell: nn1_output = 300
nn1_input, nn1_hidden1, nn1_output = 15 * 50, 500, 100

# Pair classifier (nn2) layer widths; its input is two encoder outputs wide.
# Alternative noted in the original cell: nn2_hidden1 = 50
nn2_input, nn2_hidden1, nn2_output = nn1_output * 2, 100, 3

# Optimisation settings.
learning_rate = 0.0001
batch_size, num_steps = 128, 4000

In [None]:
# Graph inputs: one row of nn1_input floats per sentence, batch dimension left open.
sentence1 = tf.placeholder(dtype=tf.float32, shape=[None, nn1_input])
sentence2 = tf.placeholder(dtype=tf.float32, shape=[None, nn1_input])

# Target rows, nn2_output columns wide (fed with the saved label arrays).
output = tf.placeholder(dtype=tf.float32, shape=[None, nn2_output])

In [None]:
def _random_normal_variable(shape):
    """Create a tf.Variable initialised from tf.random_normal with the given shape."""
    return tf.Variable(tf.random_normal(shape))


# Weight matrices of both networks, keyed "<network>_w<layer>".
nn1_weights = {
    "nn1_w1": _random_normal_variable([nn1_input, nn1_hidden1]),
    "nn1_w2": _random_normal_variable([nn1_hidden1, nn1_output]),

    "nn2_w1": _random_normal_variable([nn2_input, nn2_hidden1]),
    "nn2_w2": _random_normal_variable([nn2_hidden1, nn2_output]),
}

# Bias vectors of both networks, keyed "<network>_b<layer>".
nn1_biases = {
    "nn1_b1": _random_normal_variable([nn1_hidden1]),
    "nn1_b2": _random_normal_variable([nn1_output]),

    "nn2_b1": _random_normal_variable([nn2_hidden1]),
    "nn2_b2": _random_normal_variable([nn2_output]),
}

In [None]:
def nn1_encode(x):
    """Pass ``x`` through the two sigmoid layers of the sentence encoder.

    Both calls below use the same "nn1_*" variables, so the two sentences are
    encoded with shared parameters.
    """
    w1, b1 = nn1_weights["nn1_w1"], nn1_biases["nn1_b1"]
    w2, b2 = nn1_weights["nn1_w2"], nn1_biases["nn1_b2"]
    hidden = tf.nn.sigmoid(tf.matmul(x, w1) + b1)
    return tf.nn.sigmoid(tf.matmul(hidden, w2) + b2)


# Encodings of the first and second sentence of every pair.
e1, e2 = nn1_encode(sentence1), nn1_encode(sentence2)

In [None]:
# Join the two sentence encodings along the feature axis.
# Note: this rebinds nn2_input, which the hyper-parameter cell set to an integer
# width, so the weight cell must not be re-run after this one without resetting it.
nn2_input = tf.concat(values=[e1, e2], axis=1)

In [None]:
def nn2_encode(x):
    """Map the concatenated sentence encodings ``x`` to one raw score per class.

    The hidden layer uses a sigmoid; the output layer is left linear. The
    result is passed as ``logits`` to ``softmax_cross_entropy_with_logits_v2``,
    which applies softmax itself and expects unscaled scores. The previous
    sigmoid on this layer confined every score to (0, 1), which caps the
    softmax probability of any class at roughly 0.58 for three classes.

    Returns a tensor with ``nn2_output`` columns.
    """
    hidden = tf.nn.sigmoid(tf.matmul(x, nn1_weights["nn2_w1"]) + nn1_biases["nn2_b1"])
    # No activation here: the loss op performs the softmax.
    return tf.matmul(hidden, nn1_weights["nn2_w2"]) + nn1_biases["nn2_b2"]

In [None]:
# Classifier output for every sentence pair; the loss and accuracy cells consume this tensor.
predictions = nn2_encode(nn2_input)

In [None]:
# Cross-entropy between the classifier output and the target rows, averaged over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=output, logits=predictions)
loss_op = tf.reduce_mean(per_example_loss)

# A single Adam step on the mean loss.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

In [None]:
# A prediction counts as correct when its arg-max column matches the target's.
predicted_class = tf.argmax(predictions, 1)
expected_class = tf.argmax(output, 1)
correct_pred = tf.equal(predicted_class, expected_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Op that initialises every variable created so far.
init = tf.global_variables_initializer()

In [None]:

# Report loss and accuracy on the current batch every `display_step` steps (and on step 1).
display_step = 1000
with tf.Session() as sess:
    sess.run(init)
    z = getbatch(batch_size)
    for step in range(1, num_steps+1):
        # next(z) rather than z.next(): the method form exists only on
        # Python 2 generators, the built-in works on both major versions.
        batch_sentence1, batch_sentence2, batch_y = next(z)
        train_feed = {sentence1: batch_sentence1, sentence2: batch_sentence2, output: batch_y}
        sess.run(train_op, feed_dict=train_feed)
        if step % display_step == 0 or step == 1:
            loss, acc = sess.run([loss_op, accuracy], feed_dict=train_feed)
            print("Step " + str(step) + ", Step Loss= " +
                  "{:.4f}".format(loss) + ", Training Accuracy= " +
                  "{:.3f}".format(acc))

    print("Optimization Finished!")

    # Evaluate inside the session so the trained variable values are still alive.
    # Each message is built as one string so it prints the same text on
    # Python 2, where print("label", value) would show a tuple.
    test_accuracy = sess.run(
        accuracy,
        feed_dict={sentence1: sentence1_test, sentence2: sentence2_test, output: test_labels})
    print("Testing Accuracy: " + str(test_accuracy))

    dev_accuracy = sess.run(
        accuracy,
        feed_dict={sentence1: sentence1_dev, sentence2: sentence2_dev, output: dev_labels})
    print("Dev Accuracy: " + str(dev_accuracy))