In [1]:
import numpy as np
np.random.seed(3)

# Notation

X: input, left question + one unknown thing + right question
length N = len_l  + 1 + len_r


In [2]:
import tarfile
import zipfile
import urllib
import pickle
from os.path import isfile, isdir
from tqdm import tqdm_notebook as tqdm
import csv
import os

import math
import random
import re
from itertools import chain
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize

from nltk.corpus import wordnet

import gensim

In [6]:
from keras.regularizers import l2
from keras.callbacks import *
# from visualizer import *
from keras.models import *
from keras.optimizers import *
#from keras.utils.np_utils import to_categorical, accuracy
from keras.layers.core import *
from keras.layers import Input, Embedding, LSTM, Dense, merge, TimeDistributed
import logging
from datetime import datetime
from keras import layers
import theano

# Model related functions

## Helper functions

In [7]:
def get_X_i(i):  # get element i
    return X[:,i,:]

def get_Y(X):
    return X[:, :20, :]  # get first xmaxlen elem from time dim

def get_H(X):
    return X[:, 20:, :] # get elements L+1 to N

def get_H_n(X):
    ans = X[:, -1, :]  # get last element from time dim
    return ans

def get_R(X):
    Y, alpha = X[0], X[1]
    ans = K.batch_dot(Y, alpha)
    return ans


In [8]:
def build_model_attention(sentence_length=20, verbose=False):
    """
    https://arxiv.org/abs/1509.06664
    Reasoning about Entailment with Neural Attention
    Tim Rocktäschel, Edward Grefenstette, Karl Moritz Hermann, Tomáš Kočiský, Phil Blunsom
    Submitted on 22 Sep 2015 (v1), last revised 1 Mar 2016
    
    """

    inputs = Input(shape=(sentence_length*2+1, 300), name='inputs')
    #x = Embedding(output_dim=opts.emb, input_dim=opts.max_features, input_length=N, name='x')(main_input)
    drop_out = Dropout(0.1, name='dropout')(inputs)
    
    lstm_fwd = LSTM(150, return_sequences=True, name='lstm_fwd')(drop_out)
    lstm_bwd = LSTM(150, return_sequences=True, go_backwards=True, name='lstm_bwd')(drop_out)
    
    bilstm = layers.concatenate([lstm_fwd, lstm_bwd], name='bilstm')
    drop_out = Dropout(0.1)(bilstm)
    
    ###################################################
    # Attention 
    # M = tanh(Wy*Y + (Wh*hN)*eL)
    Y = Lambda(get_Y, name="Y", output_shape=(sentence_length, 300))(drop_out)
    W_Y = TimeDistributed(Dense(300, W_regularizer=l2(0.01)), name="weight_Y")(Y)
    
    hN = Lambda(get_H_n, output_shape=(300,), name="hN")(drop_out)   
    W_hN = Dense(300, W_regularizer=l2(0.01), name="W_hN")(hN)
    W_hN_eL = RepeatVector(sentence_length, name="W_hN_eL")(W_hN)   
    
    merged = layers.add([W_Y, W_hN_eL], name="merged")
    M = Activation('tanh', name="M")(merged)
    
    # alpha = softmax(wT * Mt)
    alpha_ = TimeDistributed(Dense(1, activation='linear'), name="alpha_")(M)
    flat_alpha = Flatten(name="flat_alpha")(alpha_)
    alpha = Dense(sentence_length, activation='softmax', name="alpha")(flat_alpha)
    
    # r = Y*alphaT
    Y_trans = Permute((2, 1), name="Y_trans")(Y)  # of shape (None,300,20)
    r_ = merge([Y_trans, alpha], output_shape=(300, 1), name="r_", mode=get_R)
    r = Reshape((300,), name="r")(r_)
    
    # h* = tanh (Wp*r + Wx * hN)
    W_r = Dense(300, W_regularizer=l2(0.01))(r)
    W_hN = Dense(300, W_regularizer=l2(0.01))(hN)
    merged = layers.add([W_r, W_hN])    
    h_star = Activation('tanh')(merged)    
    
    ####################################
    #Output
    output  = Dense(3, activation='softmax')(h_star)
    model = Model(input=[inputs], output=output)
    
    if verbose:
        model.summary()
    # plot(model, 'model.png')
    # # model.compile(loss={'output':'binary_crossentropy'}, optimizer=Adam())
    # model.compile(loss={'output':'categorical_crossentropy'}, optimizer=Adam(options.lr))
    model.compile(loss='categorical_crossentropy',optimizer=Adam(options.lr))
    return model

# Data related functions

## helper functions

In [9]:
def clean_question(question):
    """remove non-letters, return a list of words"""
    if type(question) != float:
        question =  re.sub('[^a-zA-Z0-9 -]', " ", question)
        return question.lower().split()
    else:
        return ""

def get_synonyms(word):
    #synonyms = wordnet.synsets(word)
    #return list(set(chain.from_iterable([word.lemma_names() for word in synonyms])))
    synonyms = []
    for word in w2v.wv.most_similar(word):
        synonyms.append(word[0])    
    return synonyms

def augment_question(question, k=0.2):
    if not question:
        return question
    new_question = []
    words_to_change = np.random.choice(question, int(k*len(question)))
    for idx, word in enumerate(question):
        if word in words_to_change:
            synonyms = get_synonyms(word)
            if synonyms != []:
                synonym = np.random.choice(synonyms)
                new_question.append(synonym)
            else:
                new_question.append(word)
        else:
            new_question.append(word)
    return new_question

def question_to_vectors_glove(question, length = 20):
    vectors = np.zeros((length,300),dtype = "float32")    
    number_of_words = 0
    
    for idx, word in enumerate(question):        
        if word in glove_words:            
            word_idx = glove_words[word]
            vector = glove_vectors[word_idx]
            vectors[number_of_words] = vector
            number_of_words += 1
            
        if number_of_words == length:
            break          
        
    return vectors

In [10]:
def load_embedding_models():
    with open("data/glove", 'rb') as f:
        glove = pickle.load(f)
    glove_words = glove[0]
    glove_vectors = normalize(glove[1])
    
    _fname = "data/GoogleNews-vectors-negative300.bin"
    w2v = gensim.models.Word2Vec.load_word2vec_format(_fname, binary=True)
    return glove_words, glove_vectors, w2v

In [11]:
def load_original_data():
    train_full = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    train_full_y = train_full['is_duplicate']
    train_full_q1 = train_full['question1']
    train_full_q2 = train_full['question2']
    test_q1 = test['question1']
    test_q2 = test['question2']
    train_q1, cv_q1, train_q2, cv_q2, train_y, cv_y = train_test_split(train_full_q1, train_full_q2, train_full_y, test_size = 0.2, random_state=3)
    train_q1, train_q2, train_y = shuffle(train_q1, train_q2, train_y, random_state=3)
    cv_q1, cv_q2, cv_y = shuffle(cv_q1, cv_q2, cv_y, random_state=3)
    return train_q1, train_q2, train_y, cv_q1, cv_q2, cv_y, test_q1, test_q2

In [12]:
def data_generator_glove(questions1, questions2, y=None, aug_prob=0, batch_size=1024):
    
    sample_size = len(questions1)
    batch_slices = [slice(i, i + batch_size) for i in range(0, sample_size, batch_size)]
    
    while True:
        for batch in batch_slices:
            batch_vectors1 = []
            batch_vectors2 = []
            batch_y = y[batch]

            for question1 in questions1[batch]:
                prob = np.random.rand()                
                question1_new = clean_question(question1)
                
                if prob < aug_prob: 
                    question1_new = augment_question(question1_new)                   

                vector = question_to_vectors_glove(question1_new)
                batch_vectors1.append(vector)
            
            for question2 in questions2[batch]:
                prob = np.random.rand()                
                question2_new = clean_question(question2)
                
                if prob < aug_prob: 
                    question2_new = augment_question(question2_cleaned)                   

                vector = question_to_vectors_glove(question2_new)
                batch_vectors2.append(vector)         

            if y is None:
                # test batch without labels
                yield [np.array(batch_vectors1), np.array(batch_vectors2)]
            else:
                yield [np.array(batch_vectors1), np.array(batch_vectors2)], np.array(batch_y)

# train model

## helper functions

In [13]:
train_q1, train_q2, train_y, cv_q1, cv_q2, cv_y, test_q1, test_q2 = load_original_data()

In [14]:
glove_words, glove_vectors, w2v = load_embedding_models()

In [15]:
def train_model():
    model = build_model_attention()
    
    earlyStopping = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=2, mode='auto')
    tensorBoard = keras.callbacks.TensorBoard(log_dir='/home/jupyter/jupyter/log')
    
    model.fit_generator(data_generator_glove(train_q1, train_q2, train_y, aug_prob=0.0, batch_size=batch_size),
                        steps_per_epoch=len(train_y) // batch_size, 
                        epochs=50, 
                        validation_data=data_generator_glove(cv_q1, cv_q2, cv_y, batch_size=batch_size),
                        validation_steps = len(cv_y) // batch_size, 
                        #verbose=2,
                        callbacks=[tensorBoard, earlyStopping])
    return model

In [16]:
model_attention = train_model()

  name=name)


ValueError: axes to sum over must not contain the batch axis (axes[1]=[0])

In [None]:
model_attention.save("model/model_attention.h5")

In [None]:
def predict(model):
    batch_size = 10240
    return model.predict_generator(data_generator_glove(test_q1, test_q2, batch_size=batch_size),
                                   steps=len(test_q1) // batch_size + 1,
                                   verbose=1)

In [None]:
y_pred = predict(model_attention)

In [None]:
from quora import util
sub_fname = util.get_submission_filename()
submission = util.make_submission(test['test_id'], y_pred)
util.save_submission(sub_fname, submission)