In [1]:
'''
Script inspired from - https://www.kaggle.com/lystdo/lb-0-18-lstm-with-glove-and-magic-features

Example of an LSTM model with GloVe embeddings along with magic features

Tested under Keras 2.0 with Tensorflow 1.0 backend

'''

########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

import sys


Using TensorFlow backend.


In [2]:
########################################
## set directories and parameters
########################################
BASE_DIR = '/home/sidsvash26/kaggle_quora/data/'
EMBEDDING_FILE = BASE_DIR + 'glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [3]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE)
count = 0
for line in f:
    values = line.split(' ') 
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %d word vectors of glove.' % len(embeddings_index))

Indexing word vectors
Found 2196016 word vectors of glove.


In [4]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    s = text
    s = s.lower()
    s=s.replace("?", " ")
    
    #Seperators        
    s = re.sub(r"([a-zA-Z])\.([a-zA-Z)])", r"\1 \2", s) #sep '.' b/w letters
    #s=re.sub(r"([0-9])([a-zA-Z])", r"\1 \2", s) #sep alpha anumeric
    #s=re.sub(r"([a-zA-Z])([0-9])", r"\1 \2", s)
    #s=re.sub(r"([a-z])([A-Z])", r"\1 \2",s) #sep lowercase and uppercase letter
    
    #substitute more than one consecutive dots to a space
    s = re.sub(r"([.][.]+)", r" ", s)
             
        #removes any " adjacent to an alphabet
    s = re.sub(r"\"([a-zA-Z])", r" \1", s) 
    s = re.sub(r"([a-zA-Z])\"", r"\1 ", s) 
        
        #removes ' from highlighted words/phrase
    s = re.sub(r"( )(')([ a-zA-Z 0-9]+)(')", r" \3 ", s) 
    s = re.sub(r"(')([ a-zA-Z 0-9]+)(')( )", r" \2 ", s) 
        
    #removes any , adjacent to an alphabet
    s = re.sub(r"\,([a-zA-Z])", r" \1", s) 
    s = re.sub(r"([a-zA-Z])\,", r"\1 ", s) 
        
    s = re.sub(r"([a-zA-Z])\/([a-zA-Z])", r"\1 \2", s)  #sep '/' b/w letters
    
    #sep '/' b/w letters and number
    s = re.sub(r"([0-9])\/([a-zA-Z])", r"\1 \2", s)
    s = re.sub(r"([a-zA-Z])\/([0-9])", r"\1 \2", s)
              
    s=re.sub(r"([a-zA-Z])(\.) ", r"\1 ", s) #removes dot after any alphabet
    s=re.sub(r"([0-9])\,([0-9])", r"\1\2", s) #removing commas in b/w numbers
    s=re.sub(r"[()]", r" ", s) #removes open and close brackets
    
    #replacements
    s=s.replace("-", " ")
    s=s.replace("*", " ")
    s=s.replace("#", " ")
    s=s.replace(";", " ")
    s=s.replace("$", " ")
    s=s.replace("%", " ")
    s=s.replace("?", " ")
    s=s.replace(":", " ")
    s=s.replace("[math]", " [math] ")
    s=s.replace("[/math]", " [/math] " )

    #last substitution to save space
    s=re.sub(r"[  ]+", r" ", s) #substitutes double space to single
    
    # Clean the text
    s = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", s)
    s = re.sub(r"what's", "what is ", s)
    s = re.sub(r"\'s", " ", s)
    s = re.sub(r"\'ve", " have ", s)
    s = re.sub(r"can't", "cannot ", s)
    s = re.sub(r"n't", " not ", s)
    s = re.sub(r"i'm", "i am ", s)
    s = re.sub(r"\'re", " are ", s)
    s = re.sub(r"\'d", " would ", s)
    s = re.sub(r"\'ll", " will ", s)
    
    s = re.sub(r"(\d+)(k)", r"\g<1>000", s)
    s = re.sub(r" e g ", " eg ", s)
    s = re.sub(r" b g ", " bg ", s)
    s = re.sub(r" u s ", " american ", s)
    s = re.sub(r"\0s", "0", s)
    s = re.sub(r" 9 11 ", "911", s)
    s = re.sub(r"e - mail", "email", s)
    s = re.sub(r"j k", "jk", s)
    
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = s.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

Processing text dataset


In [5]:
texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

Found 404290 texts in train.csv
Found 2345796 texts in test.csv


In [6]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

Found 137042 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [7]:
########################################
## generate leaky features
########################################
import pandas as pd
import numpy as np
import pickle

train_magics = pickle.load(open(BASE_DIR + 'feats9_all_magic.sav', 'rb'))
train_magic_feats = pd.DataFrame(data=train_magics[:,:], columns=['q1q2_intersect', 'q1q2_jaccard','q1_freq', 'q2_freq', 'q1_pagerank','q2_pagerank','q1_pagerank_log','q2_pagerank_log'])

test_magics = pickle.load(open(BASE_DIR + 'feats9_all_magic_for_test.sav', 'rb'))
test_magic_feats = pd.DataFrame(data=test_magics[:,:], columns=['q1q2_intersect', 'q1q2_jaccard','q1_freq', 'q2_freq', 'q1_pagerank','q2_pagerank','q1_pagerank_log','q2_pagerank_log'])


In [23]:
#K core leaky features
train_kcores = pickle.load(open(BASE_DIR + 'feats8_kcore_v1.sav', 'rb'))
train_kcore_feats = pd.DataFrame(data=train_kcores[:,:], columns=['q1_kcore', 'q2_kcore','max_kcore']) 


test_kcores = pickle.load(open(BASE_DIR + 'feats8_kcore_v1_for_test.sav', 'rb'))
test_kcore_feats = pd.DataFrame(data=test_kcores[:,:], columns=['q1_kcore', 'q2_kcore','max_kcore'])

#manual features
train_manual = pickle.load(open(BASE_DIR + 'feats2_match_train.sav', 'rb'))
train_manual_feats = pd.DataFrame(data=train_manual[:,:])

test_manual = pickle.load(open(BASE_DIR + 'feats2_match_test.sav', 'rb'))
test_manual_feats = pd.DataFrame(data=test_manual[:,:])

#manual feats others -- training
#train_x1 = pickle.load(open(BASE_DIR + 'feats1_tfidf_train.sav', 'rb'))
#train_x1_feats = pd.DataFrame(data=train_x1[:,:])
#train_x1_feats.fillna(value=0, inplace=True)

#train_x2 = pickle.load(open(BASE_DIR + 'feats3_glove_train.sav', 'rb'))
#train_x2_feats = pd.DataFrame(data=train_x2[:,:])

#train_x3 = pickle.load(open(BASE_DIR + 'feats4_word2vec.sav', 'rb'))
#train_x3_feats = pd.DataFrame(data=train_x3[:,:])

#train_x4 = pickle.load(open(BASE_DIR + 'feats6_whq_jaccard.sav', 'rb'))
#train_x4_feats = pd.DataFrame(data=train_x4[:,:])

train_x5 = pickle.load(open(BASE_DIR + 'feats10_locations.sav', 'rb'))
train_x5_feats = pd.DataFrame(data=train_x5[:,:])

#manual feats others -- test data
#test_x1 = pickle.load(open(BASE_DIR + 'feats1_tfidf_test.sav', 'rb'))
#test_x1_feats = pd.DataFrame(data=test_x1[:,:])
#train_x1_feats.fillna(value=0, inplace=True)

#test_x2 = pickle.load(open(BASE_DIR + 'feats3_glove_test.sav', 'rb'))
#test_x2_feats = pd.DataFrame(data=test_x2[:,:])

#test_x3 = pickle.load(open(BASE_DIR + 'feats4_word2vec_test.sav', 'rb'))
#test_x3_feats = pd.DataFrame(data=test_x3[:,:])

#test_x4 = pickle.load(open(BASE_DIR + 'feats6_whq_jaccard_for_test.sav', 'rb'))
#test_x4_feats = pd.DataFrame(data=test_x4[:,:])

test_x5 = pickle.load(open(BASE_DIR + 'feats10_locations_for_test.sav', 'rb'))
test_x5_feats = pd.DataFrame(data=test_x5[:,:])


leaks =  pd.concat((train_magic_feats,train_kcore_feats, train_manual_feats,train_x5_feats), axis=1)

test_leaks = pd.concat((test_magic_feats,test_kcore_feats,test_manual_feats, test_x5_feats), axis=1)



In [24]:
ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks = ss.transform(leaks)
test_leaks = ss.transform(test_leaks)

In [25]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


Preparing embedding matrix
Null word embeddings: 50301


In [26]:
########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344

In [28]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

leaks_input = Input(shape=(leaks.shape[1],))
leaks_dense = Dense(int(num_dense/2), activation=act)(leaks_input)

merged = concatenate([x1, y1, leaks_dense])
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

preds = Dense(1, activation='sigmoid')(merged)

########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

In [29]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

lstm_196_117_0.19_0.24
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200


In [30]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2, test_leaks], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1, test_leaks], batch_size=8192, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'_magic_manual.csv', index=False)

Start making the submission before fine-tuning
