# Keras Test Submission

In [34]:
import os
import sys
import codecs
import pandas as pd
import numpy as np
import csv
import gc

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import layers
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import backend as K

from keras.layers import Dense, Input, Flatten, merge, LSTM, Lambda, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding


# Seed NumPy's global random generator with a fixed value.
# NOTE(review): this only seeds NumPy; whether it makes Keras training fully
# repeatable depends on the backend - confirm if reproducibility matters.
np.random.seed(42)

In [2]:
# --- Input locations -------------------------------------------------------
BASE_DIR = './data/'            # directory holding train.csv / test.csv
GLOVE_DIR = './glove_w2v/'      # directory holding the GloVe vector file
TRAIN_DATA_FILE = os.path.join(BASE_DIR, 'train.csv')
TEST_DATA_FILE = os.path.join(BASE_DIR, 'test.csv')

# --- Text / model hyper-parameters -----------------------------------------
MAX_SEQUENCE_LENGTH = 30        # every tokenised text is padded/truncated to this length
MAX_NB_WORDS = 200000           # vocabulary cap given to the Tokenizer and the embedding matrix
EMBEDDING_DIM = 300             # width of one embedding vector (the 300d GloVe file is loaded)
VALIDATION_SPLIT = 0.01         # fraction of the training rows passed to fit() as validation_split

In [5]:
# Build a {token: float32 vector} lookup from the GloVe text file.
# Each line is "<token> <v1> <v2> ..." separated by single spaces.
embds_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6b.300d.txt')
with codecs.open(glove_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        parts = row.split(' ')
        # First field is the token, the remainder are its coefficients.
        embds_index[parts[0]] = np.array(parts[1:], dtype='float32')
print('Vecrods: {}'.format(len(embds_index)))

Vecrods: 400000


In [27]:
def load_texts(filename, text1_ind, text2_ind, label_ind):
    """Read two text columns and one integer column from a UTF-8 CSV file.

    The first row of the file is treated as a header and skipped.

    Args:
        filename: Path of the comma-delimited CSV file to read.
        text1_ind: Column index of the first text of each row.
        text2_ind: Column index of the second text of each row.
        label_ind: Column index of the value converted with ``int()``.

    Returns:
        A tuple ``(texts1, texts2, labels)`` of three equally long lists,
        in file order.

    Raises:
        IndexError: If a row has fewer columns than one of the indices.
        ValueError: If the value in ``label_ind`` is not an integer literal.
    """
    first_texts = []
    second_texts = []
    targets = []
    with codecs.open(filename, encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter=',')
        next(rows)  # discard the header row
        for row in rows:
            first_texts.append(row[text1_ind])
            second_texts.append(row[text2_ind])
            targets.append(int(row[label_ind]))
    return first_texts, second_texts, targets

In [28]:
print('Processing text')
# Training CSV layout used here: the two texts of a pair are read from
# columns 2 and 3, and the integer label from column 5.
train_columns = dict(text1_ind=2, text2_ind=3, label_ind=5)
texts1, texts2, labels = load_texts(TRAIN_DATA_FILE, **train_columns)
print('Texts: {}'.format(len(texts1)))

Processing text
Texts: 404290


In [29]:
print('Processing text')
# Test CSV layout used here: texts are read from columns 1 and 2. Column 0 is
# parsed as an int into `test_labels`; it is later written out as `test_id`,
# so despite the name it carries row ids rather than labels.
test_columns = dict(text1_ind=1, text2_ind=2, label_ind=0)
test_texts1, test_texts2, test_labels = load_texts(TEST_DATA_FILE, **test_columns)
print('Test Texts: {}'.format(len(test_texts1)))

Processing text
Test Texts: 2345796


In [37]:
# Fit one shared vocabulary over every text (train and test, both sides of
# each pair), then turn the training texts into integer id sequences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts1 + texts2 + test_texts1 + test_texts2)
word_index = tokenizer.word_index
sequences_1, sequences_2 = (
    tokenizer.texts_to_sequences(side) for side in (texts1, texts2)
)
print('Found {} unique tokens.'.format(len(word_index)))

Found 421364 unique tokens.


In [38]:
# Training side: pad/truncate each id sequence to MAX_SEQUENCE_LENGTH.
data1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor: {}'.format(data1.shape))
print('Shape of label tensor: {}'.format(labels.shape))

# Test side: tokenise and pad in one step so the intermediate Python lists
# are never bound to a name and can be reclaimed immediately.
test_data_1 = pad_sequences(tokenizer.texts_to_sequences(test_texts1),
                            maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(tokenizer.texts_to_sequences(test_texts2),
                            maxlen=MAX_SEQUENCE_LENGTH)
test_labels = np.array(test_labels)

# The unpadded training sequences are no longer needed; drop them and ask the
# collector to release the memory before the model is built.
del sequences_1, sequences_2
gc.collect()

Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


0

In [41]:
print('Preparing embedding matrix.')
# Tokenizer word ids start at 1 (0 is what pad_sequences fills with), so a
# vocabulary of V words needs V + 1 rows. Without the "+ 1" the highest id
# would get no row and would fall outside the Embedding layer's input range
# whenever the vocabulary is smaller than MAX_NB_WORDS.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)

# Row i holds the pre-trained vector of the word with id i; rows stay all-zero
# for the padding id and for words missing from the GloVe lookup.
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        # ids beyond the vocabulary cap have no row in the matrix
        continue
    embedding_vector = embds_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# Counts rows whose components sum to zero (includes the padding row 0).
print('Null word embeddings: {}'.format(np.sum(
            np.sum(embedding_matrix, axis=1) == 0)))

Preparing embedding matrix.
Null word embeddings: 133734


## Building network

In [42]:
# One embedding layer, initialised from the prepared matrix and kept fixed
# during training (trainable=False).
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [47]:
# Two-branch network: each text of a pair goes through the same embedding
# layer and then through its own Conv1D -> MaxPooling1D -> Dense stack
# (the convolution/dense layers are separate instances per branch).

# Branch for the first text of each pair.
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences1 = embedding_layer(sequence_1_input)
x1 = Conv1D(128, 3, activation='relu')(embedded_sequences1)
x1 = MaxPooling1D(10)(x1)
x1 = Flatten()(x1)
x1 = Dense(64, activation='relu')(x1)
x1 = Dropout(0.2)(x1)

# Branch for the second text of each pair.
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences2 = embedding_layer(sequence_2_input)
y1 = Conv1D(128, 3, activation='relu')(embedded_sequences2)
y1 = MaxPooling1D(10)(y1)
y1 = Flatten()(y1)
y1 = Dense(64, activation='relu')(y1)
y1 = Dropout(0.2)(y1)

# Join both branches along the feature axis. `layers.concatenate` is the
# Keras 2 replacement for the Keras 1 call `merge(..., mode='concat')`.
merged = layers.concatenate([x1, y1])
merged = BatchNormalization()(merged)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
# Single sigmoid unit: one value in (0, 1) per input pair.
preds = Dense(1, activation='sigmoid')(merged)
# Keras 2 keyword names are `inputs` / `outputs` (the singular forms are the
# deprecated Keras 1 spelling and trigger an API warning).
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)

  '` call to the Keras 2 API: ' + signature)


In [48]:
# Binary target with a sigmoid output: binary cross-entropy loss, Adam
# optimiser, accuracy reported during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

## Model Training

In [52]:
# Train on both padded inputs; VALIDATION_SPLIT of the rows is held out by
# Keras for validation. The call is the cell's last expression, so the
# returned History object is displayed.
model.fit(
    x=[data1, data2],
    y=labels,
    batch_size=1024,
    epochs=10,
    validation_split=VALIDATION_SPLIT,
    shuffle=True,
)

Train on 400247 samples, validate on 4043 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x8545483e10>

In [53]:
# Score every test pair; the result has one row per pair and a single column.
test_inputs = [test_data_1, test_data_2]
preds = model.predict(test_inputs)
print(preds.shape)

(2345796, 1)


In [54]:
# Build the submission table. `columns=` pins the column order explicitly:
# without it the order of dict-built columns depends on the pandas version
# (older releases sort the keys), which would put `is_duplicate` first.
out_df = pd.DataFrame(
    {"test_id": test_labels, "is_duplicate": preds.ravel()},
    columns=["test_id", "is_duplicate"],
)
# index=False: the row index is not part of the submission file.
out_df.to_csv("test_predictions_10.csv", index=False)