In [1]:
import keras
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, Embedding, LSTM, RepeatVector, Lambda
from keras.layers.merge import Concatenate
from keras.preprocessing.sequence import pad_sequences

Using Theano backend.


In [2]:
import os.path
import pickle
import numpy as np
import pandas as pd
# Sets the pandas display option controlling how wide a column's text may be
# when a DataFrame is rendered.
# NOTE(review): -1 presumably means "no truncation"; newer pandas releases
# reportedly deprecate -1 in favour of None — confirm against the installed version.
pd.set_option('display.max_colwidth', -1)  
from tqdm import tqdm
import re
import zipfile
import gzip



In [3]:
# Number of token positions per question: used as the model's input length
# and as the `maxlen` passed to pad_sequences.
MAX_QUESTION_LENGTH = 250
# Number of units in the encoder LSTM, i.e. the size of the question encoding.
QUESTION_ENCODING_DIM = 100
# Width of one embedding vector: column count of the embedding matrix and
# number of units in the decoder LSTM.
GLOVE_EMBEDDING_DIM = 300
# Zip archive holding the GloVe vectors, and the member file read from it.
GLOVE_ARCHIVE = "glove.42B.300d.zip"
GLOVE_FILE = "glove.42B.300d.txt"
# On-disk caches written by the cells below; each cell recomputes its artefact
# only when the corresponding file does not exist yet.
TOKEN_INDEX_MAP = "token_index_map_from_42B_300d_glove.pkl"    # pickled dict: token -> integer index
TOKENIZED_CORPUS = "corpus_tokenized_from_42B_300d_glove.txt"  # one question per line, space-separated indices
EMBEDDING_MATRIX = "embedding_matrix_from_42B_300d_glove.pkl"  # pickled numpy array of embedding rows
LSTM_WEIGHTS = "lstm_from_42B_300d_glove_100q.h5"              # weights saved by lstm_model.save_weights

In [4]:
def process_question(raw_question):
    """Lower-case a question and split it into alphanumeric tokens.

    Every character outside ``a-z`` / ``0-9`` (after lower-casing) is treated
    as a separator, so punctuation and non-ASCII characters never appear in
    the output.

    Args:
        raw_question: The question text. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as "no question".

    Returns:
        A list of lower-case token strings, empty when there is no text.
    """
    # A missing value must not go through str(): that would yield the literal
    # word "none" / "nan" and inject a spurious token into the vocabulary.
    # NaN is the only float that is not equal to itself.
    if raw_question is None or (isinstance(raw_question, float) and raw_question != raw_question):
        return []
    return re.sub(r"[^a-z0-9]", " ", str(raw_question).lower()).split()

In [5]:
# Build (or reload from cache) `corpus`: a list of the distinct question
# values found in the question1/question2 columns of the test and train data.
if not os.path.isfile("corpus.pkl"):
    unique_questions = set()

    # Both archives expose the same two question columns, so a single loop
    # replaces the two copy-pasted passes. Only those columns are loaded.
    for csv_archive in ("test.csv.zip", "train.csv.zip"):
        questions_df = pd.read_csv(csv_archive, sep=",", compression="zip",
                                   usecols=["question1", "question2"])
        # Feeding whole columns to the set avoids DataFrame.iterrows(), which
        # builds a Series object for every single row.
        unique_questions.update(questions_df["question1"])
        unique_questions.update(questions_df["question2"])
        # Free the frame before the next archive is read.
        del questions_df

    corpus = list(unique_questions)
    with open("corpus.pkl", 'wb') as f:
        pickle.dump(corpus, f, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: unpickling can execute arbitrary code; only load a corpus.pkl
    # that was produced by this notebook.
    with open("corpus.pkl", "rb") as f:
        corpus = pickle.load(f)

In [6]:
# Build (or reload from cache) `token_index_map`: token -> integer index for
# every corpus token that also appears in the GloVe file.
if not os.path.isfile(TOKEN_INDEX_MAP):
    # Every distinct token that occurs anywhere in the corpus.
    all_tokens = set()
    for q in corpus:
        all_tokens.update(process_question(q))

    # Indices start at 1, leaving 0 unassigned (presumably reserved for the
    # padding value — the embedding matrix allocates one extra row for it).
    token_index_map = {}
    current_index = 1

    # Context managers close both the archive and the member stream; the
    # archive handle used to be left open.
    with zipfile.ZipFile(GLOVE_ARCHIVE) as glove_zip:
        with glove_zip.open(GLOVE_FILE) as f:
            for line in f:
                # Only the leading token is needed here, so stop splitting
                # after the first field instead of cutting up the whole vector.
                token = line.decode("utf-8").split(None, 1)[0]
                if token in all_tokens and token not in token_index_map:
                    token_index_map[token] = current_index
                    current_index += 1

    with open(TOKEN_INDEX_MAP, 'wb') as f:
        pickle.dump(token_index_map, f, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(TOKEN_INDEX_MAP, "rb") as f:
        token_index_map = pickle.load(f)

In [7]:
# Write the corpus as one line of space-separated token indices per question.
# Tokens missing from token_index_map are dropped, so a question without any
# known token becomes an empty line.
if not os.path.isfile(TOKENIZED_CORPUS):
    # The cache check above only tests that the file exists. Writing straight
    # to the final path meant an interrupted run left a truncated file that
    # every later run accepted as complete. Write to a temporary name and
    # rename only once every question has been written.
    partial_path = TOKENIZED_CORPUS + ".partial"
    with open(partial_path, 'w') as f:
        for q in tqdm(corpus):
            token_ids = [token_index_map[x] for x in process_question(q) if x in token_index_map]
            f.write(" ".join(str(x) for x in token_ids) + "\n")
    os.replace(partial_path, TOKENIZED_CORPUS)


In [8]:
# Build (or reload from cache) `embedding_matrix`: row i holds the GloVe
# vector of the token whose index in token_index_map is i. Rows that are never
# assigned (including row 0, which no token maps to) stay all-zero.
if not os.path.isfile(EMBEDDING_MATRIX):
    embedding_matrix = np.zeros((len(token_index_map) + 1, GLOVE_EMBEDDING_DIM))

    # Context managers close both the archive and the member stream; the
    # archive handle used to be left open.
    with zipfile.ZipFile(GLOVE_ARCHIVE) as glove_zip:
        with glove_zip.open(GLOVE_FILE) as f:
            for line in f:
                line_content = line.decode("utf-8").split()
                token = line_content[0]
                # Convert the vector to floats only for tokens that are
                # actually kept; every other line of the file is skipped.
                if token in token_index_map:
                    embedding_matrix[token_index_map[token], :] = [float(x) for x in line_content[1:]]

    with open(EMBEDDING_MATRIX, 'wb') as f:
        pickle.dump(embedding_matrix, f, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(EMBEDDING_MATRIX, "rb") as f:
        embedding_matrix = pickle.load(f)

In [9]:
# Frozen embedding lookup initialised from the GloVe-derived matrix. The
# vocabulary size is one more than the number of mapped tokens because token
# indices start at 1.
vocabulary_size = len(token_index_map) + 1
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=GLOVE_EMBEDDING_DIM,
    input_length=MAX_QUESTION_LENGTH,
    weights=[embedding_matrix],
    trainable=False,
)

In [10]:
# Sequence autoencoder over embedded questions.
#
# Left branch: token indices -> frozen embeddings -> encoder LSTM (a single
# QUESTION_ENCODING_DIM vector) -> that vector repeated MAX_QUESTION_LENGTH
# times -> decoder LSTM emitting one GLOVE_EMBEDDING_DIM vector per position.
# Right branch: token indices -> the same frozen embeddings, used as the
# reconstruction target.
#
# The training generator feeds data by the layer names "input_1", "input_2"
# and "lambda_1". Auto-generated Keras names carry a counter that keeps
# increasing when this cell is re-run in the same kernel, which would silently
# break that lookup, so the three names are pinned explicitly.
left_input = Input(shape=(MAX_QUESTION_LENGTH,), name="input_1")
left_embedded_input = embedding_layer(left_input)
encoded_question = LSTM(units=QUESTION_ENCODING_DIM, return_sequences=False)(left_embedded_input)
padded_encoding = RepeatVector(MAX_QUESTION_LENGTH)(encoded_question)
decoded_question = LSTM(units=GLOVE_EMBEDDING_DIM, return_sequences=True)(padded_encoding)

right_input = Input(shape=(MAX_QUESTION_LENGTH,), name="input_2")
right_embedded_input = embedding_layer(right_input)

# Stack prediction and target along the feature axis so that one Lambda layer
# can see both tensors.
merged_embeddings = Concatenate(axis=2)([decoded_question, right_embedded_input])


def calculate_mse_loss(x):
    """Mean squared error between the two halves of the merged tensor.

    Args:
        x: Concatenation along axis 2 of the decoder output (first
            GLOVE_EMBEDDING_DIM channels) and the target embeddings
            (remaining channels).

    Returns:
        The tensor produced by ``keras.metrics.mean_squared_error`` for the
        two halves.
    """
    return keras.metrics.mean_squared_error(x[:, :, :GLOVE_EMBEDDING_DIM], x[:, :, GLOVE_EMBEDDING_DIM:])


# NOTE(review): output_shape=(1,) is the shape declared to Keras. If
# mean_squared_error averages over the last axis only, the real tensor is
# (batch, MAX_QUESTION_LENGTH) rather than (batch, 1) — confirm before relying
# on the reported shape.
mse_loss_layer = Lambda(calculate_mse_loss, output_shape=(1,), name="lambda_1")(merged_embeddings)


def mse_loss_placeholder(y_true, y_pred):
    """Pass-through loss: the model output already is the loss value.

    Args:
        y_true: Ignored; present only to satisfy the Keras loss signature.
        y_pred: Output of the MSE Lambda layer.

    Returns:
        ``y_pred`` unchanged.
    """
    return y_pred


lstm_model = Model(inputs=[left_input, right_input], outputs=mse_loss_layer)
lstm_model.compile(loss=mse_loss_placeholder, optimizer="adam")

In [11]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
lstm_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 250)           0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 250)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 250, 300)      30021600                                     
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           160400                                       
___________________________________________________________________________________________

In [17]:
def lstm_input_generator(batch_size=1):
    """Endlessly yield training batches read from the tokenized corpus file.

    The file is re-read from the start every time its end is reached, so the
    generator never terminates on a non-empty file.

    Args:
        batch_size: Number of questions per yielded batch. The default of 1
            yields one question per step. The last batch of each pass over
            the file may be smaller.

    Yields:
        ``(inputs, targets)`` tuples of dicts keyed by layer name:
        ``input_1`` holds the padded token indices, ``input_2`` the same
        indices reversed along the sequence axis, and ``lambda_1`` an array of
        ones with one entry per question (the loss ignores it).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def as_batch(sequences):
        # Pad/truncate every sequence to the fixed model input length.
        tokens = pad_sequences(sequences, maxlen=MAX_QUESTION_LENGTH)
        return ({'input_1': tokens, 'input_2': tokens[:, ::-1]},
                {'lambda_1': np.ones((len(sequences),))})

    while True:
        # `with` guarantees the handle is closed even when the generator is
        # abandoned or closed in the middle of a pass.
        with open(TOKENIZED_CORPUS) as f:
            pending = []
            for line in f:
                pending.append([int(x) for x in line.split()])
                if len(pending) == batch_size:
                    yield as_batch(pending)
                    pending = []
            # Flush the incomplete batch left over at the end of the file.
            if pending:
                yield as_batch(pending)

In [18]:
# Reuse previously saved weights when they exist; otherwise train for a single
# epoch with one generator step per corpus entry and save the result.
if os.path.isfile(LSTM_WEIGHTS):
    lstm_model.load_weights(LSTM_WEIGHTS)
else:
    lstm_model.fit_generator(
        lstm_input_generator(),
        steps_per_epoch=len(corpus),
        epochs=1,
        verbose=1,
    )
    lstm_model.save_weights(LSTM_WEIGHTS)

Epoch 1/1
     22/4789031 [..............................] - ETA: 5808421s - loss: 0.0113 - ETA: 6146292s - loss: 0.0114 - ETA: 6107152s - loss: 0.0115 - ETA: 6008511s - loss: 0.0106 - ETA: 5957947s - loss: 0.0109 - ETA: 5922658s - loss: 0.0107 - ETA: 5892631s - loss: 0.0101 - ETA: 5876126s - loss: 0.0103 - ETA: 5881372s - loss: 0.0105

KeyboardInterrupt: 

In [21]:
# Convert a duration in seconds into days (60 * 60 * 24 seconds per day).
# The figure is presumably an ETA read off the training progress bar.
5935186 / (60 * 60 * 24)

68.69428240740741

In [None]:
# pretrained_wv = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
# pretrained_wv = KeyedVectors.load_word2vec_format("glove.6B.100d.w2vformat.txt", binary=False)

In [None]:
# print(sum([token_counts[x] for x in unique_tokens]))
# print(sum([token_counts[x] for x in leftout_tokens]))