In [None]:
import numpy as np
# Seed NumPy's global RNG so the np.random draws made later in this notebook are repeatable.
np.random.seed(3)

# Notation

X: input sequence = left question + one delimiter token + right question
total length N = len_l + 1 + len_r (len_l / len_r are the lengths of the left / right question)


In [None]:
import tarfile
import zipfile
import urllib
import pickle
from os.path import isfile, isdir
from tqdm import tqdm_notebook as tqdm
import csv
import os

import math
import random
import re
from itertools import chain
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize

from nltk.corpus import wordnet

import gensim

In [None]:
from keras.regularizers import l2
from keras.callbacks import *
# from visualizer import *
from keras.models import *
from keras.optimizers import *
from keras.utils.np_utils import to_categorical, accuracy
from keras.layers.core import *
from keras.layers import Input, Embedding, LSTM, Dense, merge, TimeDistributed
import logging
from datetime import datetime

In [None]:
from reader import *
from myutils import *

# Model related functions

## Helper functions

In [None]:
def get_X_i(i):  # get element i
    """Build a slicer that picks time step ``i`` out of a 3-D tensor.

    The returned function maps ``X`` to ``X[:, i, :]``, so it can be handed
    straight to a ``Lambda`` layer (one slicer per time step).

    The previous body sliced a name ``X`` that was never passed in, so it
    could not work as a per-step factory.

    Parameters
    ----------
    i : int
        Index along the second (time) axis.

    Returns
    -------
    callable
        ``f(X) -> X[:, i, :]``.
    """
    def _select_step(X):
        return X[:, i, :]

    return _select_step

def get_Y(X, xmaxlen=None):
    """Return the first ``xmaxlen`` steps of ``X`` along the time axis.

    Parameters
    ----------
    X : 3-D tensor/array indexed as ``X[:, t, :]``.
    xmaxlen : int, optional
        Number of leading time steps to keep.  Pass it explicitly, e.g.
        ``Lambda(get_Y, arguments={"xmaxlen": L})``.  When omitted, the
        function falls back to a notebook-level ``xmaxlen`` variable, which
        is what the original one-argument version read implicitly.

    Raises
    ------
    NameError
        If ``xmaxlen`` is omitted and no notebook-level ``xmaxlen`` exists.
    """
    if xmaxlen is None:
        try:
            xmaxlen = globals()["xmaxlen"]
        except KeyError:
            raise NameError("get_Y: pass `xmaxlen` or define a global `xmaxlen`")
    return X[:, :xmaxlen, :]  # get first xmaxlen elem from time dim

def get_H(X, xmaxlen=None):
    """Return the steps of ``X`` from index ``xmaxlen`` onward (time axis).

    Parameters
    ----------
    X : 3-D tensor/array indexed as ``X[:, t, :]``.
    xmaxlen : int, optional
        Index of the first time step to keep.  When omitted, the value is
        read from ``K.params['xmaxlen']`` exactly as before.
        NOTE(review): ``K.params`` has to be set up elsewhere; nothing in
        this notebook assigns it.  Prefer passing ``xmaxlen`` explicitly,
        e.g. ``Lambda(get_H, arguments={"xmaxlen": L})``.
    """
    if xmaxlen is None:
        xmaxlen = K.params['xmaxlen']
    return X[:, xmaxlen:, :]  # get elements L+1 to N

def get_H_n(X):
    """Return the final time step of ``X``, i.e. ``X[:, -1, :]``."""
    return X[:, -1, :]

def get_R(X):
    """Batched dot product of the first two entries of ``X``.

    ``X`` is indexed as ``X[0]`` and ``X[1]`` and both are handed to
    ``K.T.batched_dot``; the result of that call is returned unchanged.
    NOTE(review): ``K.T`` is presumably the Theano tensor module exposed by
    the Keras backend -- confirm the backend in use.
    """
    return K.T.batched_dot(X[0], X[1])


In [None]:
def build_model_attention(sentence_length=20, verbose=False, lr=None):
    """
    Build and compile the sentence-level attention model.

    https://arxiv.org/abs/1509.06664
    Reasoning about Entailment with Neural Attention
    Tim Rocktäschel, Edward Grefenstette, Karl Moritz Hermann, Tomáš Kočiský, Phil Blunsom
    Submitted on 22 Sep 2015 (v1), last revised 1 Mar 2016

    The input is a sequence of ``2 * sentence_length + 1`` vectors of size 300.
    It passes through two 150-unit LSTMs (one with ``go_backwards=True``) whose
    outputs are concatenated.  Attention is computed over the first
    ``sentence_length`` steps, conditioned on the last step.

    Parameters
    ----------
    sentence_length : int
        Number of time steps attended over (``L`` in the formulas below).
    verbose : bool
        When true, print ``model.summary()``.
    lr : float, optional
        Learning rate for Adam.  When omitted, ``options.lr`` is read from the
        notebook namespace, as the original code did.

    Returns
    -------
    The compiled Keras ``Model`` (3-way softmax, categorical cross-entropy).
    """
    lstm_units = 150
    k = 2 * lstm_units   # width of the concatenated LSTM outputs (was an undefined name)
    L = sentence_length  # number of attended steps (was an undefined name)

    inputs = Input(shape=(sentence_length*2+1, 300), name='inputs')
    drop_out = Dropout(0.1, name='dropout')(inputs)

    lstm_fwd = LSTM(lstm_units, return_sequences=True, name='lstm_fwd')(drop_out)
    # NOTE(review): with go_backwards=True the returned sequence is in reversed
    # time order and is concatenated as-is -- confirm this is intended.
    lstm_bwd = LSTM(lstm_units, return_sequences=True, go_backwards=True, name='lstm_bwd')(drop_out)

    # `merge` is what this notebook imports (and already uses for `r_` below);
    # the name `layers` is not imported anywhere visible.
    bilstm = merge([lstm_fwd, lstm_bwd], name='bilstm', mode='concat')
    drop_out = Dropout(0.1)(bilstm)

    ###################################################
    # Attention
    # M = tanh(Wy*Y + (Wh*hN)*eL)
    # Sliced with a local closure so the length comes from `sentence_length`
    # rather than from a notebook-level global.
    Y = Lambda(lambda seq: seq[:, :L, :], name="Y", output_shape=(L, k))(drop_out)
    W_Y = TimeDistributed(Dense(k, W_regularizer=l2(0.01)), name="weight_Y")(Y)

    hN = Lambda(get_H_n, output_shape=(k,), name="hN")(drop_out)
    W_hN = Dense(k, W_regularizer=l2(0.01), name="W_hN")(hN)
    W_hN_eL = RepeatVector(L, name="W_hN_eL")(W_hN)

    merged = merge([W_Y, W_hN_eL], name="merged", mode='sum')
    M = Activation('tanh', name="M")(merged)

    # alpha = softmax(wT * Mt)
    alpha_ = TimeDistributed(Dense(1, activation='linear'), name="alpha_")(M)
    flat_alpha = Flatten(name="flat_alpha")(alpha_)
    alpha = Dense(L, activation='softmax', name="alpha")(flat_alpha)

    # r = Y*alphaT
    Y_trans = Permute((2, 1), name="Y_trans")(Y)  # of shape (None, k, L)
    r_ = merge([Y_trans, alpha], output_shape=(k, 1), name="r_", mode=get_R)
    r = Reshape((k,), name="r")(r_)

    # h* = tanh (Wp*r + Wx * hN)
    W_r = Dense(k, W_regularizer=l2(0.01))(r)
    W_hN = Dense(k, W_regularizer=l2(0.01))(hN)
    merged = merge([W_r, W_hN], mode='sum')
    h_star = Activation('tanh')(merged)

    ####################################
    # Output
    output = Dense(3, activation='softmax')(h_star)
    model = Model(input=[inputs], output=output)

    if verbose:
        model.summary()

    # Fall back to the notebook-level `options` only when no rate was given.
    learning_rate = options.lr if lr is None else lr
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate))
    return model

In [None]:
def build_model_attention_wordbyword(opts, verbose=False):
    """
    Build and compile the word-by-word attention model.

    https://arxiv.org/abs/1509.06664
    Reasoning about Entailment with Neural Attention
    Tim Rocktäschel, Edward Grefenstette, Karl Moritz Hermann, Tomáš Kočiský, Phil Blunsom
    Submitted on 22 Sep 2015 (v1), last revised 1 Mar 2016

    With Y the first L steps of the encoded sequence and h_t the remaining
    N - L steps (t = 1 .. N-L), the graph unrolls

        M_t     = tanh(Wy*Y + (Wh*h_t + Wr*r_{t-1}) x e_L)
        alpha_t = softmax(wT * M_t)
        r_t     = Y*alpha_t^T + tanh(Wt*r_{t-1})

    where the r_{t-1} terms are absent at t = 1, and finishes with
    h* = tanh(W*r_N + W*h_N) followed by a 3-way softmax.

    Parameters
    ----------
    opts : object
        Must expose ``lstm_units``, ``xmaxlen``, ``ymaxlen`` and ``lr``.
    verbose : bool
        When true, print ``model.summary()``.

    Returns
    -------
    The compiled Keras ``Model`` (categorical cross-entropy, Adam(opts.lr)).

    Notes
    -----
    As in the original draft, every step gets its own ``alpha<t>``,
    ``Wr<t>`` and ``Tan_Wr<t>`` layers.  They start from the same initial
    values but are not tied.  NOTE(review): the formulas use shared
    matrices -- confirm whether tying is wanted.
    """
    k = 2 * opts.lstm_units  # 300
    L = opts.xmaxlen  # 20
    N = opts.xmaxlen + opts.ymaxlen + 1  # for delim
    steps = N - L  # number of h_t positions attended from
    print("x len %s total len %s" % (L, N))
    print("k %s L %s" % (k, L))

    # Local slicers keep this builder independent of notebook-level globals.
    def _time_slice(start, stop=None):
        """Return f(seq) -> seq[:, start:stop, :]."""
        return lambda seq: seq[:, start:stop, :]

    def _time_step(t):
        """Return f(seq) -> seq[:, t, :]."""
        return lambda seq: seq[:, t, :]

    inputs = Input(shape=(N, 300), name='inputs')
    drop_out = Dropout(0.1, name='dropout')(inputs)

    lstm_fwd = LSTM(opts.lstm_units, return_sequences=True, name='lstm_fwd')(drop_out)
    # NOTE(review): with go_backwards=True the returned sequence is in reversed
    # time order and is concatenated as-is -- confirm this is intended.
    lstm_bwd = LSTM(opts.lstm_units, return_sequences=True, go_backwards=True, name='lstm_bwd')(drop_out)

    bilstm = merge([lstm_fwd, lstm_bwd], name='bilstm', mode='concat')
    drop_out = Dropout(0.1)(bilstm)

    ###################################################
    # Word-by-word attention

    ## Wy*Y
    Y = Lambda(_time_slice(0, L), name="Y", output_shape=(L, k))(drop_out)
    W_Y = TimeDistributed(Dense(k, W_regularizer=l2(0.01)), name="W_Y")(Y)
    Y_trans = Permute((2, 1), name="Y_trans")(Y)  # (None, k, L), the layout get_R expects

    ## Wh*h_t for every t at once; individual steps are sliced out in the loop
    H = Lambda(_time_slice(L), output_shape=(steps, k), name="H")(drop_out)
    W_H = TimeDistributed(Dense(k, W_regularizer=l2(0.01), name="W_ht"))(H)

    ## last encoder state, used for h*
    hN = Lambda(get_H_n, output_shape=(k,), name="hN")(drop_out)

    # Initial values, drawn in the same order as the original draft so the
    # NumPy RNG stream is consumed identically.
    Distributed_Dense_init_weight = ((2.0/np.sqrt(k)) * np.random.rand(k,1)) - (1.0 / np.sqrt(k))
    Distributed_Dense_init_bias = ((2.0) * np.random.rand(1,)) - (1.0)

    Tan_Wr_init_weight = 2*(1/np.sqrt(k))*np.random.rand(k,k) - (1/np.sqrt(k))
    Tan_Wr_init_bias = 2*(1/np.sqrt(k))*np.random.rand(k,) - (1/np.sqrt(k))

    Wr_init_weight = 2*(1/np.sqrt(k))*np.random.rand(k,k) - (1/np.sqrt(k))
    Wr_init_bias = 2*(1/np.sqrt(k))*np.random.rand(k,) - (1/np.sqrt(k))

    r_prev = None  # r_{t-1}; None at t = 1 where those terms are absent
    for t in range(1, steps + 1):
        # (Wh*h_t) x e_L
        W_h_t = Lambda(_time_step(t - 1), output_shape=(k,))(W_H)
        terms = [RepeatVector(L)(W_h_t), W_Y]

        if r_prev is not None:
            # (Wr*r_{t-1}) x e_L
            Wr_prev = Dense(k, W_regularizer=l2(0.01), name='Wr' + str(t - 1),
                            weights=[Wr_init_weight, Wr_init_bias])(r_prev)
            terms.append(RepeatVector(L)(Wr_prev))

        # M_t = tanh(...)
        M_t = Activation('tanh')(merge(terms, mode='sum'))

        # alpha_t = softmax(wT * M_t), kept flat as (None, L)
        scores = TimeDistributed(
            Dense(1, weights=[Distributed_Dense_init_weight, Distributed_Dense_init_bias]),
            name='alpha' + str(t))(M_t)
        alpha_t = Activation("softmax")(Flatten()(scores))

        # Y*alpha_t^T via the same callable-mode merge the sentence-level model uses
        r_t = merge([Y_trans, alpha_t], mode=get_R, output_shape=(k,), name="r" + str(t))

        if r_prev is not None:
            # + tanh(Wt*r_{t-1})
            Tan_Wr_prev = Dense(k, W_regularizer=l2(0.01), activation='tanh',
                                name='Tan_Wr' + str(t - 1),
                                weights=[Tan_Wr_init_weight, Tan_Wr_init_bias])(r_prev)
            r_t = merge([r_t, Tan_Wr_prev], mode='sum')

        r_prev = r_t

    # h* = tanh (W*rN + W*hN)
    W_rN = Dense(k, W_regularizer=l2(0.01))(r_prev)
    W_hN = Dense(k, W_regularizer=l2(0.01))(hN)
    merged = merge([W_rN, W_hN], mode='sum')
    h_star = Activation('tanh')(merged)

    ####################################
    # Output
    out = Dense(3, activation='softmax')(h_star)
    model = Model(input=[inputs], output=out)

    if verbose:
        model.summary()

    # Use the `opts` argument; the draft read an unrelated global `options`.
    model.compile(loss='categorical_crossentropy', optimizer=Adam(opts.lr))
    return model

# Data related functions

## helper functions

In [None]:
def clean_question(question):
    """Lower-case a question and split it into a list of word tokens.

    Every character other than ASCII letters, digits, spaces and hyphens is
    replaced by a space before the text is lower-cased and split on
    whitespace.

    Missing questions -- ``None`` or any float (e.g. the NaN pandas uses for
    empty cells) -- give an empty list, so the return value is always a list.
    """
    if question is None or isinstance(question, float):
        return []
    question = re.sub('[^a-zA-Z0-9 -]', " ", question)
    return question.lower().split()

def get_synonyms(word):
    """Return the words the notebook-level ``w2v`` model ranks as most similar.

    Calls ``w2v.wv.most_similar(word)`` and keeps the first element of every
    returned entry, in the order given.

    A ``KeyError`` from the lookup (raised for a word that is not in the
    model's vocabulary) is turned into an empty list, which is the
    "no synonyms" value that ``augment_question`` checks for.
    """
    try:
        neighbours = w2v.wv.most_similar(word)
    except KeyError:
        return []
    return [neighbour[0] for neighbour in neighbours]

def augment_question(question, k=0.2):
    """Randomly swap some words of a tokenised question for synonyms.

    ``int(k * len(question))`` words are drawn from the question with
    ``np.random.choice``.  Every occurrence of a drawn word is replaced by a
    randomly chosen entry of ``get_synonyms(word)``; words without synonyms
    are left as they are.  An empty question is returned unchanged.
    """
    if not question:
        return question

    n_draws = int(k * len(question))
    chosen = np.random.choice(question, n_draws)

    def _maybe_swap(token):
        # Words that were not drawn pass through untouched.
        if not (token in chosen):
            return token
        candidates = get_synonyms(token)
        if candidates != []:
            return np.random.choice(candidates)
        return token

    return [_maybe_swap(token) for token in question]

def question_to_vectors_glove(question, length = 20):
    """Embed a tokenised question as a ``(length, 300)`` float32 matrix.

    Words found in the notebook-level ``glove_words`` mapping are looked up
    in ``glove_vectors`` and written to consecutive rows; other words are
    skipped.  Filling stops once ``length`` rows are used, and unused rows
    stay zero.
    """
    embedded = np.zeros((length, 300), dtype="float32")
    filled = 0

    for token in question:
        if token in glove_words:
            embedded[filled] = glove_vectors[glove_words[token]]
            filled += 1

        # Checked on every iteration, exactly like the row counter it guards.
        if filled == length:
            break

    return embedded

In [None]:
def load_embedding_models():
    """Load the GloVe tables and the word2vec model from ``data/``.

    Returns a 3-tuple:
    element 0 of the pickled ``data/glove`` object, element 1 of that object
    passed through ``normalize``, and the model read from the binary
    ``GoogleNews-vectors-negative300.bin`` file.
    """
    # NOTE(review): unpickling runs arbitrary code from the file -- only use a trusted data/glove.
    with open("data/glove", 'rb') as glove_file:
        glove_payload = pickle.load(glove_file)

    word2vec_path = "data/GoogleNews-vectors-negative300.bin"
    return (
        glove_payload[0],
        normalize(glove_payload[1]),
        gensim.models.Word2Vec.load_word2vec_format(word2vec_path, binary=True),
    )

In [None]:
def load_original_data():
    """Read the train/test CSVs and split the training pairs 80/20.

    ``data/train.csv`` supplies ``question1``, ``question2`` and
    ``is_duplicate``; ``data/test.csv`` supplies ``question1`` and
    ``question2``.  The split and both shuffles use ``random_state=3``.

    Returns
    -------
    train_q1, train_q2, train_y, cv_q1, cv_q2, cv_y, test_q1, test_q2
    """
    seed = 3
    train_frame = pd.read_csv('data/train.csv')
    test_frame = pd.read_csv('data/test.csv')

    labels = train_frame['is_duplicate']
    left = train_frame['question1']
    right = train_frame['question2']
    test_q1 = test_frame['question1']
    test_q2 = test_frame['question2']

    # train_test_split returns (train, held-out) pairs in argument order.
    (train_q1, cv_q1,
     train_q2, cv_q2,
     train_y, cv_y) = train_test_split(left, right, labels, test_size = 0.2, random_state=seed)

    train_q1, train_q2, train_y = shuffle(train_q1, train_q2, train_y, random_state=seed)
    cv_q1, cv_q2, cv_y = shuffle(cv_q1, cv_q2, cv_y, random_state=seed)

    return train_q1, train_q2, train_y, cv_q1, cv_q2, cv_y, test_q1, test_q2

In [None]:
def _vectorize_questions(questions, aug_prob):
    """Clean, optionally augment, and embed each question; returns a list of matrices."""
    vectors = []
    for question in questions:
        # One draw per question decides whether this question gets augmented.
        prob = np.random.rand()
        tokens = clean_question(question)
        if prob < aug_prob:
            tokens = augment_question(tokens)
        vectors.append(question_to_vectors_glove(tokens))
    return vectors


def data_generator_glove(questions1, questions2, y=None, aug_prob=0, batch_size=1024):
    """Endlessly yield batches of embedded question pairs.

    Parameters
    ----------
    questions1, questions2 : sliceable sequences of raw questions
        Indexed with ``slice`` objects, one slice per batch.
    y : sliceable sequence of labels, optional
        When given, each batch is ``([q1_batch, q2_batch], labels)``.
        When ``None`` (test mode), each batch is ``[q1_batch, q2_batch]``.
    aug_prob : float
        A question is augmented when a fresh ``np.random.rand()`` draw is
        below this value.
    batch_size : int
        Maximum number of pairs per batch; the last batch may be shorter.

    Yields
    ------
    Batches in input order, restarting from the beginning after the last one.
    If ``questions1`` is empty the generator ends immediately instead of
    looping forever without yielding.

    Fixes relative to the earlier version: labels are only sliced when ``y``
    is provided (test mode used to fail on ``None[batch]``), and augmenting
    the second question no longer references an undefined name.
    """
    sample_size = len(questions1)
    batch_slices = [slice(i, i + batch_size) for i in range(0, sample_size, batch_size)]
    if not batch_slices:
        return

    while True:
        for batch in batch_slices:
            batch_vectors1 = _vectorize_questions(questions1[batch], aug_prob)
            batch_vectors2 = _vectorize_questions(questions2[batch], aug_prob)

            if y is None:
                # test batch without labels
                yield [np.array(batch_vectors1), np.array(batch_vectors2)]
            else:
                yield [np.array(batch_vectors1), np.array(batch_vectors2)], np.array(y[batch])

# train model

## helper functions

In [None]:
def compute_acc(X, Y, vocab, model, opts):
    """Score ``model`` on ``X`` against one-hot labels ``Y``.

    Parameters
    ----------
    X : model input, forwarded to ``model.predict``.
    Y : 2-D array of labels; the true class is ``argmax`` along axis 1.
    vocab : unused; kept for call compatibility.
    model : object with ``predict(X, batch_size=...)`` returning a 2-D score array.
    opts : object exposing ``batch_size`` (the earlier version read a global
        ``options`` here and ignored this argument).

    Returns
    -------
    (acc, acc) : the value of ``accuracy(true_labels, predicted_labels)``, twice.
    """
    scores = model.predict(X, batch_size=opts.batch_size)
    # argmax over the scores directly; building a one-hot matrix first and
    # taking its argmax yields the same labels.
    plabels = np.argmax(scores, axis=1)
    tlabels = np.argmax(Y, axis=1)
    acc = accuracy(tlabels, plabels)
    return acc, acc


def getConfig(opts):
    """Join the run settings in ``opts`` into one underscore-separated string.

    Order: xmaxlen, ymaxlen, batch_size, emb, lr, samples, lstm_units,
    epochs, followed by ``"no-pad"`` when ``opts.no_padding`` is truthy.
    """
    parts = [
        str(value)
        for value in (
            opts.xmaxlen,
            opts.ymaxlen,
            opts.batch_size,
            opts.emb,
            opts.lr,
            opts.samples,
            opts.lstm_units,
            opts.epochs,
        )
    ]
    if opts.no_padding:
        parts.append("no-pad")
    return "_".join(parts)


def save_model(model, wtpath, archpath, mode='yaml'):
    """Write a model's architecture and weights to disk.

    Parameters
    ----------
    model : object with ``to_yaml()``, ``to_json()`` and ``save_weights(path)``.
    wtpath : path passed to ``model.save_weights``.
    archpath : path of the text file that receives the architecture.
    mode : ``'yaml'`` writes ``model.to_yaml()``; any other value writes
        ``model.to_json()``.

    The architecture file is written inside a ``with`` block in both modes,
    so the handle is always closed (the yaml branch used to leave it open).
    """
    arch_string = model.to_yaml() if mode == 'yaml' else model.to_json()
    with open(archpath, 'w') as f:
        f.write(arch_string)
    model.save_weights(wtpath)


def load_model(wtpath, archpath, mode='yaml'):
    """Rebuild a model from an architecture file and load its weights.

    Parameters
    ----------
    wtpath : path passed to ``model.load_weights``.
    archpath : text file holding the serialized architecture.
    mode : ``'yaml'`` parses with ``model_from_yaml``; any other value
        parses with ``model_from_json``.

    Returns
    -------
    The reconstructed model with weights loaded.

    The architecture file is read inside a ``with`` block in both modes, so
    the handle is always closed (the yaml branch used to leave it open).
    NOTE(review): only load architecture files you trust.
    """
    with open(archpath) as f:
        arch_string = f.read()
    if mode == 'yaml':
        model = model_from_yaml(arch_string)  # ,custom_objects={"MyEmbedding": MyEmbedding})
    else:
        model = model_from_json(arch_string)  # , custom_objects={"MyEmbedding": MyEmbedding})
    model.load_weights(wtpath)
    return model


def concat_in_out(X, Y, vocab):
    """Join ``X`` and ``Y`` column-wise with a delimiter column in between.

    The middle column has one row per row of ``X`` and holds
    ``vocab["delimiter"]`` in every cell.
    """
    n_rows = X.shape[0]
    delimiter_col = vocab["delimiter"] * np.ones((n_rows, 1))
    return np.concatenate((X, delimiter_col, Y), axis=1)


def setup_logger(config_str):
    """Configure root logging to a fresh, timestamped log file.

    The file name is built from the current time as
    ``mylogfile_<HH>_<MM>_<dd>_<mm>_<YYYY>.log`` and opened in write mode at
    DEBUG level.  ``config_str`` is accepted but not used.
    """
    log_name = datetime.now().strftime('mylogfile_%H_%M_%d_%m_%Y.log')
    logging.basicConfig(
        filename=log_name,
        filemode='w',
        level=logging.DEBUG,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
    )

In [None]:
# Read the CSVs once: training and cross-validation splits (questions + labels) plus the test questions.
train_q1, train_q2, train_y, cv_q1, cv_q2, cv_y, test_q1, test_q2 = load_original_data()

In [None]:
# These three names are read as globals by question_to_vectors_glove (glove_*) and get_synonyms (w2v).
glove_words, glove_vectors, w2v = load_embedding_models()

In [None]:
def train_model():
    train = 
    cv = 
    test = 