In [1]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output

%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls


from tqdm import tqdm
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import TimeDistributed, Lambda
from keras.layers.convolutional import Conv1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text
from keras.utils import to_categorical
from keras.layers.pooling import MaxPooling1D,GlobalMaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
BASE_DIR = '/Users/ppujari/udacity'
GLOVE_DIR = BASE_DIR + '/word_embeddings/glove.6B/'
TRAIN_DATA_FILE = BASE_DIR + '/kaggle/quora/data/train.csv'
TEST_DATA_FILE = BASE_DIR + '/kaggle/quora/data/test.csv'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

In [3]:
print(check_output(["ls", "./data"]).decode("utf8"))

df = pd.read_csv(TRAIN_DATA_FILE).fillna("0")
labels = df.is_duplicate.values
pd.set_option('max_colwidth', 800)
df.head(10)

GoogleNews-vectors-negative300.bin.gz
glove.6B.100d.txt
glove.6B.200d.txt
glove.6B.300d.txt
glove.6B.50d.txt
glove.6B.zip
glove.840B.300d.txt
glove.840B.300d.zip
quora_duplicate_questions.tsv
test.csv
test.csv.zip
train.csv
train.csv.zip



Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone and video games?,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Motorolla DCX3400?,How do I hack Motorola DCX3400 for free internet?,0


In [4]:
print('Processing test dataset')
#Read test data
test_df = pd.read_csv(TEST_DATA_FILE)
test_df.head(10)

test_tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)

#List of text to train 
test_tokenizer.fit_on_texts(list(test_df.question1.values.astype(str)) + list(test_df.question2.values.astype(str)))

#list of texts to turn to sequences
test_sequences_question_1  = test_tokenizer.texts_to_sequences(test_df.question1.values.astype(str))
test_sequences_question_1 = sequence.pad_sequences(test_sequences_question_1, maxlen=MAX_SEQUENCE_LENGTH)

test_sequences_question_2  = test_tokenizer.texts_to_sequences(test_df.question2.values.astype(str))
test_sequences_question_2 = sequence.pad_sequences(test_sequences_question_2, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor (Question 1):', test_sequences_question_1.shape)
print('Shape of data tensor (Question 2):', test_sequences_question_2.shape)



Processing test dataset
Shape of data tensor (Question 1): (2345796, 1000)
Shape of data tensor (Question 2): (2345796, 1000)


So we have six columns in total one of which is the label.

np_utils.to_categorical(y, num_classes=None)

Arguments
    y: class vector to be converted into a matrix (integers from 0 to num_classes).
    num_classes: total number of classes.
Returns
    A binary matrix representation of the input.

In [6]:
#prepare text samples and their labels. finally, vectorize the text samples into a 2D integer tensor

print('Processing dataset for training')

text_tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)

#List of text to train 
text_tokenizer.fit_on_texts(list(df.question1.values) + list(df.question2.values.astype(str)))

#list of texts to turn to sequences
list_of_sequences_question_1  = text_tokenizer.texts_to_sequences(df.question1.values)
list_of_sequences_question_1 = sequence.pad_sequences(list_of_sequences_question_1, maxlen=MAX_SEQUENCE_LENGTH)

list_of_sequences_question_2 = text_tokenizer.texts_to_sequences(df.question2.values.astype(str))
list_of_sequences_question_2 = sequence.pad_sequences(list_of_sequences_question_2, maxlen=MAX_SEQUENCE_LENGTH)

word_index = text_tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor (Question 1):', list_of_sequences_question_1.shape)
print('Shape of data tensor (Question 2):', list_of_sequences_question_2.shape)
print('Shape of label tensor:', labels.shape)


Processing train text dataset
Found 95596 unique tokens.
Shape of data tensor (Question 1): (404290, 1000)
Shape of data tensor (Question 2): (404290, 1000)
Shape of label tensor: (808580, 2)


In [29]:
# split the data into a training set and a validation set
#indices = np.arange(data.shape[0])
#np.random.shuffle(indices)
#data = data[indices]
#labels = labels[indices]
#num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

#x_train = data[:-num_validation_samples]
#y_train = labels[:-num_validation_samples]
#x_val = data[-num_validation_samples:]
#y_val = labels[-num_validation_samples:]
########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(list_of_sequences_question_1))
idx_train = perm[:int(len(list_of_sequences_question_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(list_of_sequences_question_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((list_of_sequences_question_1[idx_train], list_of_sequences_question_2[idx_train]))
data_2_train = np.vstack((list_of_sequences_question_2[idx_train], list_of_sequences_question_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((list_of_sequences_question_1[idx_val], list_of_sequences_question_2[idx_val]))
data_2_val = np.vstack((list_of_sequences_question_2[idx_val], list_of_sequences_question_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))
weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
#    weight_val[labels_val==0] = 1.309028344
print(labels_train.shape)
print('sample train/validation data')

(646864, 2)
sample train/validation data


Preparing the Embedding layer

Next, we compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings

In [30]:
# first, build index mapping words in the embeddings set to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


At this point we can leverage our embedding_index dictionary and our word_index to compute our embedding matrix:

In [45]:
print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
print (num_words)
print (len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Preparing embedding matrix.
20000
95597


In [68]:
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
act='relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [69]:
#### from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32',  name='main_input')

#Question1
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed

embedding_layer_q1 = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)(sequence_1_input)

#embedded_sequences_1 = embedding_layer_q1(sequence_1_input)

#lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
x1 = LSTM(4, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)(embedding_layer_q1)


#x1 = lstm_layer(embedded_sequences_1)

#TimeDistributed(Dense(300, activation='relu'))



In [70]:
#how q2 is embeded? sample text consists all of the data?

#Question2
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer_q2 = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)(sequence_2_input)

#embedded_sequences_2 = embedding_layer_q2(sequence_2_input)
#y1 = lstm_layer(embedded_sequences_2)

y1 = LSTM(4, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)(embedding_layer_q2)



Training a 1D convnet

In [71]:
from keras.layers import concatenate
from keras.models import Model

merged = concatenate([x1, y1])

merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(2, activation='sigmoid')(merged)
########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc'])

We have a fairly balanced dataset here????

In [None]:
model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

model.fit([data_1_train, data_2_train], [labels_train], batch_size=384, epochs=200,
                 verbose=1, validation_split=0.1, shuffle=True, callbacks=[model_checkpoint])

#hist = model.fit([data_1_train, data_2_train], labels_train, \
#        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
#        epochs=200, batch_size=2048, shuffle=True, \
#        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_sequences_question_1, test_sequences_question_2], batch_size=8192, verbose=1)
preds += model.predict([test_sequences_question_2, test_sequences_question_1], batch_size=8192, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 1000)          0                                            
____________________________________________________________________________________________________
input_17 (InputLayer)            (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_19 (Embedding)         (None, 1000, 300)     6000000                                      
____________________________________________________________________________________________________
embedding_20 (Embedding)         (None, 1000, 300)     6000000                                      
___________________________________________________________________________________________

Feature construction
We will now construct a basic set of features that we will later use to embed our samples with.
The first we will be looking at is rather standard TF-IDF encoding for each of the questions. In order to limit the computational complexity and storage requirements we will only encode the top terms across all documents with TF-IDF and also look at a subsample of the data.