In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [2]:
from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [30]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# NLTK's English stop-word list, stored as a set; sentence2words tests every
# token for membership in it.
ENG_STOP_WORDS = set(stopwords.words("english"))
# Directory holding the GloVe text files; load_glove joins the file name
# "glove.6B.{dim}d.txt" onto it.
GLOVE_DIR = "../input/glove-global-vectors-for-word-representation/"

def build_corpus(df, size):
    """
    Collect the question texts referenced by the question pairs in `df`.

    Every row stores `question1` at slot `qid1 - 1` and `question2` at slot
    `qid2 - 1` (the ids are treated as 1-based). Missing values are replaced
    by "" before the texts are stored.

    Returns an object array of length `size`.
    """
    print("building unique question list")
    corpus = np.empty(size, dtype='O')

    filled = df.fillna("")
    pairs = zip(filled.qid1, filled.question1, filled.qid2, filled.question2)
    # An id may occur in several pairs; a later row simply overwrites the slot.
    for first_id, first_text, second_id, second_text in pairs:
        corpus[first_id - 1] = first_text
        corpus[second_id - 1] = second_text

    return corpus

def sentence2words(sentences):
    """Lower-case and tokenize each sentence, dropping English stop words.

    Returns one list of tokens per input sentence, in input order.
    """
    tokenized = []
    for text in sentences:
        kept = [token
                for token in word_tokenize(text.lower())
                if token not in ENG_STOP_WORDS]
        tokenized.append(kept)
    return tokenized

def word_indexer(index_to_word):
    """Invert an index => word sequence into a word => index dict.

    If a word occurs more than once, its last position wins.
    """
    positions = {}
    for position, token in enumerate(index_to_word):
        positions[token] = position
    return positions

def build_vocab(tokenized_sentences, size=None):
    """
    Build the word <=> index mappings from tokenized sentences.

    Tokens are lower-cased and counted; the `size` most frequent ones are
    kept (all of them when `size` is None). Slot 0 of the index => word list
    is the "<UNK>" marker, so counted words start at index 1.

    Returns (word_to_index, index_to_word).
    """
    frequencies = Counter()
    for tokens in tokenized_sentences:
        frequencies.update(str.lower(token) for token in tokens)

    if size is None:
        size = len(frequencies)

    print("vocab size: {}".format(size))

    # index => word: unknown marker first, then words by descending frequency
    index_to_word = ["<UNK>"]
    index_to_word.extend(word for word, _count in frequencies.most_common(size))
    # word => index
    return word_indexer(index_to_word), index_to_word

def indexer(tokenized_sentences, word_to_index):
    """Map every token of every sentence to its index; unknown tokens become 0."""
    lookup = word_to_index.get
    encoded = []
    for tokens in tokenized_sentences:
        encoded.append([lookup(token, 0) for token in tokens])
    return encoded

def build_xy(questions, df, indices):
    """Assemble the model inputs and labels for the rows selected by `indices`.

    Rows are picked with `df.loc`, so `indices` are index labels of `df`.
    Returns ([first, second], labels): `first` / `second` are `questions`
    indexed by `qid1 - 1` / `qid2 - 1` of the selected rows, and `labels` is
    their `is_duplicate` column.
    """
    selected = df.loc[indices]
    first_ids = selected.qid1 - 1
    second_ids = selected.qid2 - 1
    # X => question pair
    pair = [questions[first_ids], questions[second_ids]]
    # Y => is_dup?
    labels = selected.is_duplicate
    return (pair, labels)

def load_train_dataset(filepath, vocab_size=None, test_size=40000, random_state=1421):
    """
    Load the training data and convert the questions to padded index sequences.

    input:
        filepath: train csv; read with "id" as index and expected to provide
            the columns qid1, qid2, question1, question2 and is_duplicate
        vocab_size: number of most frequent words to keep (None keeps all)
        test_size: size of the validation split, forwarded to train_test_split
        random_state: seed of the split, forwarded to train_test_split
    output:
        (X_train, y_train), (X_val, y_val), index_to_word, word_to_index
        Each X is [question1 sequences, question2 sequences] and each y is
        the is_duplicate column of the matching rows.
    """
    df = pd.read_csv(filepath, index_col="id")
    n_questions = df[["qid1", "qid2"]].max().values.max()
    print("Total Questions: {}".format(n_questions))

    questions = build_corpus(df, n_questions)
    # question: "Where is India?"
    # tokenized: ["where", "india", "?"]
    # indexed: [450, 220, 1]
    tokenized_questions = sentence2words(questions)
    del questions

    word_to_index, index_to_word = build_vocab(tokenized_questions, vocab_size)

    indexed_questions = indexer(tokenized_questions, word_to_index)
    del tokenized_questions

    max_len = max(len(seq) for seq in indexed_questions)
    print("Max. sequence length = {}".format(max_len))

    # Every question is padded to the length of the longest one.
    padded_questions = pad_sequences(indexed_questions, max_len)
    del indexed_questions

    # build_xy selects rows with df.loc, so split the index *labels* rather
    # than the positions 0..n-1; the two only coincide when "id" is 0..n-1.
    train_indices, test_indices = train_test_split(df.index.values,
                                                   test_size=test_size,
                                                   random_state=random_state)

    X_train, y_train = build_xy(padded_questions, df, train_indices)
    X_val, y_val = build_xy(padded_questions, df, test_indices)
    return (X_train, y_train),  (X_val, y_val), index_to_word, word_to_index


In [31]:
# Build the train/validation splits and the vocabulary. vocab_size is not
# passed, so every word is kept. `words` is the index => word list.
(X_train, y_train), (X_val, y_val), words, word_to_index = load_train_dataset("../input/quora-question-pairs/train.csv")

In [32]:
# Shape of the second element of X_train (the question2 side built by build_xy).
X_train[1].shape

In [33]:
def load_glove(embedding_dim):
    """Read the GloVe 6B vectors of the given dimension into a dict.

    embedding_dim: vector size; selects the file
        "glove.6B.{embedding_dim}d.txt" inside GLOVE_DIR.

    Returns a dict mapping each word (first field of a line) to a float32
    numpy array built from the remaining fields.
    """
    path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(embedding_dim))
    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    # The encoding is explicit so decoding does not depend on the platform
    # default (assumes the file is UTF-8 text).
    with open(path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

def emb2matrix(words, embeddings, embedding_dim):
    """create a matrix from the word embedding

    words: index => word list whose first entry is the UNK token
    embeddings: dict mapping a word to its vector of length embedding_dim
    embedding_dim: number of columns of the matrix

    Returns a (len(words), embedding_dim) array. Row i holds the vector of
    words[i]; row 0 and the rows of words without a vector stay all-zero.
    Prints the matrix shape and the number of words without a vector.
    """
    matrix = np.zeros((len(words), embedding_dim))
    missing = 0 # word is in vocab but we don't have embedding.

    # first word is UNK token. leave the embedding to 0 for UNKNOWN words
    for i, word in enumerate(words[1:], 1):
        vec = embeddings.get(word)
        if vec is None:
            # Count only the words that really lack a vector.
            # FIXME if the word in vocab is not in word embedding, we are setting it to 0 (same as UNK), try random init.
            missing += 1
            continue
        matrix[i, :] = vec
    print("Embedding Matrix: {}".format(matrix.shape))
    print("Words missings in embedding: {}".format(missing))
    return matrix


In [34]:
# Vector size; load_glove reads glove.6B.{EMBEDDING_DIM}d.txt from GLOVE_DIR.
EMBEDDING_DIM = 100

# word => vector dict, then one matrix row per entry of `words`.
word_embeddings = load_glove(EMBEDDING_DIM)
embedding_matrix = emb2matrix(words, word_embeddings, EMBEDDING_DIM)

In [35]:
# Sum of the training labels over their count, i.e. the share of duplicates
# (assumes is_duplicate is coded 0/1 — TODO confirm).
print("Duplicate : {:.2%}".format(y_train.sum()/y_train.shape[0]))

In [36]:
# Padded sequence length, read off the second axis of the first input matrix.
MAX_SEQ_LEN = X_train[0].shape[1]
# Number of entries in the index => word list (its first entry is <UNK>).
VOCAB_SIZE = len(words)

In [37]:
from keras.layers import Input, Dense, Embedding, Dropout, Activation
from keras.layers import Concatenate, Subtract, Multiply, Dot
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential, Model

def encoding_layer():
    """Build the question encoder that both towers share.

    Stack: frozen embedding initialised from embedding_matrix -> 1D
    convolution -> global max pooling -> 128-unit dense layer -> dropout ->
    ReLU. Returns the Sequential model named "encoding".
    """
    n_filters, window = 250, 3

    embedding = Embedding(VOCAB_SIZE,  # already includes UNK
                          EMBEDDING_DIM,
                          weights=[embedding_matrix],
                          input_length=MAX_SEQ_LEN,
                          trainable=False)
    stack = [
        embedding,
        Conv1D(n_filters, window, padding="valid", activation="relu", strides=1),
        GlobalMaxPooling1D(),
        Dense(128),
        Dropout(0.2),
        Activation("relu"),
    ]
    return Sequential(stack, name="encoding")

def merge(towers):
    """Combine the tower outputs into one feature vector.

    Concatenates the towers themselves, their difference and their
    element-wise product, in that order.
    """
    difference = Subtract()(towers)
    product = Multiply()(towers)
    features = list(towers) + [difference, product]
    # Alternative that was tried: Dot(1, normalize=True)(towers)
    return Concatenate()(features)

def build_model():
    """Assemble the two-tower duplicate-question classifier.

    Both int32 inputs of length MAX_SEQ_LEN go through the same encoder;
    the merged towers pass through two dropout -> ReLU -> dense stages
    (64 units, then 1 unit) and a final sigmoid. Prints the summaries of the
    encoder and of the full model, and returns the (uncompiled) model.
    """
    encoder = encoding_layer()
    encoder.summary()

    # One input per question; the encoder instance (and its weights) is shared.
    inputs = [Input(shape=(MAX_SEQ_LEN,), dtype='int32') for _ in range(2)]
    towers = [encoder(question) for question in inputs]

    hidden = merge(towers)
    # (dropout rate, dense units) of each stage, applied in this order.
    for rate, units in ((0.5, 64), (0.2, 1)):
        hidden = Dropout(rate)(hidden)
        hidden = Activation("relu")(hidden)
        hidden = Dense(units)(hidden)
    out = Activation("sigmoid")(hidden)

    model = Model(inputs, out)
    model.summary()
    return model

In [38]:
# Builds the network and prints the encoder and full-model summaries.
model = build_model()

In [50]:
# Binary cross-entropy on the sigmoid output, optimised with Adam; accuracy is tracked as a metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# X_train / X_val are two-element lists, matching the model's two inputs.
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=1024, epochs=5)

In [64]:
# Write the trained model to a file in the current working directory.
model.save("01-cnn-model.h5")

In [None]:
from keras.utils import Sequence
from io import StringIO
import linecache

# Use a Keras Sequence as a data generator so the test set is read batch by batch.
class DataGenerator(Sequence):
    """Keras Sequence that serves a test CSV as batches of padded index sequences.

    Rows are fetched by line number with linecache, skipping one header line,
    and parsed as the columns test_id, question1, question2.

    NOTE(review): line-based access assumes one record per physical line; a
    quoted field containing a newline would split a record — confirm the
    file has none.
    """

    def __init__(self, filepath, size, word_to_index, max_seq_len, batch_size=32):
        """
        filepath: path of the csv file to read
        size: number of data rows to serve (header excluded)
        word_to_index: vocabulary mapping used to index the tokens
        max_seq_len: length passed to the padding step
        batch_size: number of rows per batch
        """
        self.filepath = filepath
        self.size = size
        self.word_to_index = word_to_index
        self.max_seq_len = max_seq_len

        self.batch_size = batch_size
        self.indexes = np.arange(size)

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        return (self.size + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Generate one batch.

        Returns [X1, X2], the preprocessed question1 and question2 columns of
        the rows belonging to batch `index`. Raises IndexError when the batch
        lies outside the `size` rows.
        """
        start = index * self.batch_size
        # Use the instance's batch size (not a module-level global) and never
        # read past the last data row.
        stop = min(start + self.batch_size, self.size)
        if start < 0 or start >= stop:
            raise IndexError("batch index {} out of range".format(index))

        buffer = StringIO()
        for test_id in range(start, stop):
            # +1 because linecache line numbers are 1-based, +1 since first line is header.
            buffer.write(linecache.getline(self.filepath, test_id + 1 + 1))

        buffer.seek(0)
        df = pd.read_csv(buffer, names=["test_id", "question1", "question2"])
        # `preprocess` is a module-level helper; it has to be defined before
        # the first batch is requested.
        X1 = preprocess(df.question1.fillna(""), self.word_to_index, self.max_seq_len)
        X2 = preprocess(df.question2.fillna(""), self.word_to_index, self.max_seq_len)
        return [X1, X2]


In [None]:
def preprocess(questions, word_to_index, max_len):
    """Tokenize raw question strings, map tokens to indices and pad to `max_len`."""
    tokens = sentence2words(questions)
    encoded = indexer(tokens, word_to_index)
    return pad_sequences(encoded, max_len)

# def load_test_dataset(filepath, word_to_index, max_seq_len):
#     df = pd.read_csv(filepath, index_col="test_id", nrows=10000)
#     X1 = np.vstack( df.question1.fillna("").apply(lambda x: preprocess([x,], word_to_index, max_seq_len)) )
#     X2 = np.vstack( df.question2.fillna("").apply(lambda x: preprocess([x,], word_to_index, max_seq_len)) )
#     return [X1, X2]

In [None]:
batch_size = 32
# word_to_index = word_indexer(words)
# 2345805 is passed as the generator's `size` (number of data rows it may serve).
test_data_generator = DataGenerator("../input/quora-question-pairs/test.csv", 
                                    2345805, 
                                    word_to_index, 
                                    MAX_SEQ_LEN, 
                                    batch_size)

# Only the first `steps` batches are predicted: steps = ceil(nrows / batch_size),
# so the last batch can extend past `nrows`.
nrows = 10000
steps = (nrows // batch_size) + int( (nrows % batch_size) > 0 )
#steps = None

# squeeze drops the length-1 axes of the prediction array.
predictions = np.squeeze( model.predict_generator(test_data_generator, steps, workers=6, use_multiprocessing=True, verbose=1) )
#is_duplicate = (predictions > 0.5).astype("int32")
#print(is_duplicate.shape)
#print(is_duplicate.sum() / is_duplicate.shape[0])
# Mean of the predicted values.
print(predictions.sum()/ len(predictions))

In [None]:
import csv
import gzip

# The csv module expects its file object to be opened with newline="" so the
# writer's own line terminators are not translated a second time.
with gzip.open("submission-v3.csv.gz", "wt", newline="") as fp:
    writer = csv.writer(fp)
    writer.writerow(["test_id", "is_duplicate"])
    # The position of each prediction is written as its test_id
    # (assumes test ids are 0-based and in file order — TODO confirm).
    writer.writerows(enumerate(predictions))
