Source: https://github.com/tim5go/quora-question-pairs

In [5]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
from collections import defaultdict
import numpy as np
import pandas as pd
from imp import reload

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import sys


Using TensorFlow backend.


In [25]:
########################################
## set directories and parameters
########################################
BASE_DIR = 'C:/Renee/aml_keras/input/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [14]:
!echo %cd%

C:\Renee\aml_keras


In [23]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))


Indexing word vectors
Found 3000000 word vectors of word2vec


In [24]:
result = word2vec.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

  if np.issubdtype(vec.dtype, np.int):


queen: 0.7118


In [26]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

def try_apply_dict(x,dict_to_apply):
    try:
        return dict_to_apply[x]
    except KeyError:
        return 0

Processing text dataset


In [29]:
texts_1 = [] 
texts_2 = []
labels = []

#aplly function to clean data
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

train_orig =  pd.read_csv(BASE_DIR + 'train.csv', header=0)

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

test_orig =  pd.read_csv(BASE_DIR + 'test.csv', header=0)

ques = pd.concat([train_orig[['question1', 'question2']], \
        test_orig[['question1', 'question2']]], axis=0).reset_index(drop='index')
		
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])
		
train_feat = train_orig.apply(q1_q2_intersect, axis=1, raw=True)
test_feat = test_orig.apply(q1_q2_intersect, axis=1, raw=True)

Found 404290 texts in train.csv
Found 81126 texts in test.csv


In [47]:
texts_1[0:10]

['what is the step by step guide to invest in share market in india ',
 'what is the story of kohinoor koh - i - noor diamond ',
 'how can i increase the speed of my internet connection while using a vpn ',
 'why am i mentally very lonely how can i solve it ',
 'which one dissolve in water quikly sugar salt methane and carbon di oxide ',
 'astrology : i am a capricorn sun cap moon and cap rising what does that say about me ',
 'should i buy tiago ',
 'how can i be a good geologist ',
 'when do you use instead of ',
 'motorola company : can i hack my charter motorolla dcx3400 ']

In [117]:
# Kopieer vraag 1 en 2 van test en train naar aparte dataframes
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].copy()
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].copy()
#print(df1.head(),df2.head())

# Voeg alle vragen samen in één kolom in één dataframe
df2.rename(columns = {'question2':'question1'},inplace=True)
df2_test.rename(columns = {'question2':'question1'},inplace=True)
#print(df2.head())

train_questions = df1.append(df2)
train_questions = train_questions.append(df1_test)
train_questions = train_questions.append(df2_test)
#print(train_questions.head(), train_questions.shape)

# Dubbele vragen verwijderen
#train_questions.drop_duplicates(subset = ['qid1'],inplace=True)
train_questions.drop_duplicates(subset = ['question1'],inplace=True)
#print(train_questions.head(), train_questions.shape)

# Plaats alle originele vragen in een dictionary
train_questions.reset_index(inplace=True,drop=True)
questions_dict = pd.Series(train_questions.index.values,index=train_questions.question1.values).to_dict()

# Samenvoegen train en testvragen in df 'comb'
train_cp = train_orig.copy()
test_cp = test_orig.copy()
train_cp.drop(['qid1','qid2'],axis=1,inplace=True)
#print(train_cp[0:15])

test_cp['is_duplicate'] = -1
test_cp.rename(columns={'test_id':'id'},inplace=True)
#print(test_cp[0:15])

comb = pd.concat([train_cp,test_cp])
#print(comb.head(), comb.shape)

# Voeg de nr's van de vragen toe uit de dictionary met unieke vragen
comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)
#print(comb.head(), comb.shape)

# Maak een dictionary met hoe vaak elke vraag voorkomt
q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()
#print(q1_vc[1])

# Voeg toe aan dataframe hoe vaak de betreffende vraag voorkomt
comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
#print(comb.head())

# Train en testdata weer uit elkaar halen en vragen in tekst verwijderen als kolom
train_comb = comb[comb['is_duplicate'] >= 0][['id','q1_hash','q2_hash','q1_freq','q2_freq','is_duplicate']]
test_comb = comb[comb['is_duplicate'] < 0][['id','q1_hash','q2_hash','q1_freq','q2_freq']]
#print(comb[0:15],train_comb[0:15])

# Frequencies voorkomen vragen los opslaan
train_q1_freq = train_comb['q1_freq']
train_q2_freq = train_comb['q2_freq']
test_q1_freq = test_comb['q1_freq']
test_q2_freq = test_comb['q2_freq']
#print(train_q2_freq[0:10])

In [68]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# The Tokenizer stores everything in the word_index during fit_on_texts
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

# When calling the texts_to_sequences method, only the top num_words are considered
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Apply padding
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

Found 85518 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [69]:
print(
texts_1[0:1],
texts_2[0:1],
sequences_1[0:1],
sequences_2[0:1],
data_1[0:1],
data_2[0:1],
labels[0:1]
)

['what is the step by step guide to invest in share market in india '] ['what is the step by step guide to invest in share market '] [[2, 3, 1, 1219, 58, 1219, 2613, 7, 570, 8, 758, 376, 8, 36]] [[2, 3, 1, 1219, 58, 1219, 2613, 7, 570, 8, 758, 376]] [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    2    3    1 1219   58 1219 2613    7  570    8  758  376
     8   36]] [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    2    3    1 1219   58 1219 2613    7  570    8
   758  376]] [0]


In [70]:
########################################
## prepare embeddings
########################################

print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


Preparing embedding matrix
Null word embeddings: 37391


In [77]:
# embedding matrix zijn de 300 dimensies vectoren van alle voorkomende woorden
embedding_matrix.shape
# voorbeeld
embedding_matrix[1:2]

array([[ 0.08007812,  0.10498047,  0.04980469,  0.0534668 , -0.06738281,
        -0.12060547,  0.03515625, -0.11865234,  0.04394531,  0.03015137,
        -0.05688477, -0.07617188,  0.01287842,  0.04980469, -0.08496094,
        -0.06347656,  0.00628662, -0.04321289,  0.02026367,  0.01330566,
        -0.01953125,  0.09277344, -0.171875  , -0.00131989,  0.06542969,
         0.05834961, -0.08251953,  0.0859375 , -0.00318909,  0.05859375,
        -0.03491211, -0.0123291 , -0.0480957 , -0.00302124,  0.05639648,
         0.01495361, -0.07226562, -0.05224609,  0.09667969,  0.04296875,
        -0.03540039, -0.07324219,  0.03271484, -0.06176758,  0.00787354,
         0.0035553 , -0.00878906,  0.0390625 ,  0.03833008,  0.04443359,
         0.06982422,  0.01263428, -0.00445557, -0.03320312, -0.04272461,
         0.09765625, -0.02160645, -0.0378418 ,  0.01190186, -0.01391602,
        -0.11328125,  0.09326172, -0.03930664, -0.11621094,  0.02331543,
        -0.01599121,  0.02636719,  0.10742188, -0.0

In [127]:
########################################
## sample train/validation data
########################################
#np.random.seed(1234)

# Random split nemen van train data naar een train en validatie set
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
#intersect van de vragen
feat_train   = np.concatenate((train_feat[idx_train], train_feat[idx_train]))
#frequencies
q1_train     = np.concatenate((train_q1_freq[idx_train], train_q2_freq[idx_train]))
q2_train     = np.concatenate((train_q2_freq[idx_train], train_q1_freq[idx_train])) 

#print(data_1_train[0:1],data_2_train[0:1])
#print(labels_train[0:10])
#print(feat_train[0:10])
#print(q1_train[0:10])

# Zelfde voor validatieset
data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))
feat_val   = np.concatenate((train_feat[idx_val], train_feat[idx_val]))
q1_val     = np.concatenate((train_q1_freq[idx_val], train_q2_freq[idx_val]))
q2_val     = np.concatenate((train_q2_freq[idx_val], train_q1_freq[idx_val]))

#???
weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.472001959
    weight_val[labels_val==0] = 1.309028344
    

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0    13     4  2712 11042  1267
     14  1581  1267    25    63     5]] [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    2   11 6105   14
  1581 1267]]
[0 0 1 0 0 1 0 0 0 0]
[ 0  0  0  0  0  4  0  0  0  0  0  0  7  2  0  0  0  6  0  0 10  0  0  0
  0  0 26  0  0  0  0  2  0  0  0  0  0  0  0  0 11  0  0  0  0  3  0  0
  0  0  0  0  0  0 10  0  0  0  0  2  0  2  0  0  0  0  0  0  0  5  5  0
  0  1  0  0  3  8  0  0  0  0  0 22  0  1  1  0  0  0  0  0  0  0  0  0
  0  1  0 10]
[ 5  3 35  1  2  7  1  1  2  5]


In [128]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

z1 = Input(shape=(1,), dtype='float32')

a1 = Input(shape=(1,), dtype='float32')
b1 = Input(shape=(1,), dtype='float32')

merged = concatenate([x1, y1, z1, a1, b1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

In [129]:
########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

In [None]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, z1, a1, b1], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, feat_train, q1_train, q2_train], labels_train, \
        validation_data=([data_1_val, data_2_val, feat_val, q1_val, q2_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

In [None]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2, test_feat, test_q1_freq, test_q2_freq], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1, test_feat, test_q1_freq, test_q2_freq], batch_size=8192, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)