In [2]:
print('Importing packages...')

import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

from keras.layers import Merge
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text

import h5py # Used to save models

Importing packages...


In [3]:
print('Reading Data...')

# Output directory
train_loc = 'local' # floyd or local
num_epochs = 1  # Originally used 200

if train_loc == 'floyd':
    output = 'output/'
    data_loc = 'data/'
else:
    output = '../output_lstm/'
    data_loc = '../data/'

# Import data
csv_train = data_loc + 'train.csv'
data = pd.read_csv(csv_train)
y = data.is_duplicate.values

# Tokenize text
num_words = 200000  # The total number of word to incorperates in the search
tk = text.Tokenizer(num_words=num_words)  # Create a tokenizer object

max_len = 40 # The maximum length of a sequence
tk.fit_on_texts(list(data.question1.values) +
                list(data.question2.values.astype(str)))

x1 = tk.texts_to_sequences(data.question1.values)
x1 = sequence.pad_sequences(x1,maxlen=max_len)

x2 = tk.texts_to_sequences(data.question2.values.astype(str))
x2 = sequence.pad_sequences(x2,maxlen=max_len)

word_index = tk.word_index

ytrain_enc = np_utils.to_categorical(y)

Reading Data...


In [4]:
print('Generating embeddings...')

embeddings_index = {}
f = open(data_loc + 'glove.840B.300d.txt')
unfound = []
unfound_vals = []
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        unfound.append(word)
        unfound_vals.append(values[1:])
f.close()

print('Found %s word vectors.' % len(embeddings_index))

print('Creating embedding matrix...')

embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


max_features = 200000
filter_length = 5
nb_filter = 64
pool_length = 4

1837it [00:00, 18369.27it/s]

Generating embeddings...


2196017it [01:58, 18455.24it/s]
 18%|█▊        | 17551/95603 [00:00<00:00, 175508.08it/s]

Found 2196016 word vectors.
Creating embedding matrix...


100%|██████████| 95603/95603 [00:00<00:00, 164365.70it/s]


In [5]:
print('Build model...')

# Question 1 - Embeddings -> LSTM

model5 = Sequential()
model5.add(Embedding(len(word_index) + 1, 300, input_length=40, dropout=0.2))
model5.add(LSTM(300, dropout_W=0.2, dropout_U=0.2))

# Question 2 - Embeddings -> LSTM

model6 = Sequential()
model6.add(Embedding(len(word_index) + 1, 300, input_length=40, dropout=0.2))
model6.add(LSTM(300, dropout_W=0.2, dropout_U=0.2))

# Merge all models - MERGE ALL QUESTIONS

merged_model = Sequential()
merged_model.add(Merge([model5, model6], mode='concat'))
merged_model.add(BatchNormalization())


# Feed Forward Network
# merged_model.add(Dense(300))
# merged_model.add(PReLU())
# merged_model.add(Dropout(0.2))
# merged_model.add(BatchNormalization())
#
# merged_model.add(Dense(300))
# merged_model.add(PReLU())
# merged_model.add(Dropout(0.2))
# merged_model.add(BatchNormalization())
#
# merged_model.add(Dense(300))
# merged_model.add(PReLU())
# merged_model.add(Dropout(0.2))
# merged_model.add(BatchNormalization())

# Final node gives binary output
merged_model.add(Dense(1))
merged_model.add(Activation('sigmoid'))

  
  import sys


Build model...


  if sys.path[0] == '':
  del sys.path[0]


In [6]:
# Loss, Optimizer, Accuracy
merged_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Save checkpoints
checkpoint = ModelCheckpoint(output+'weights.h5', monitor='val_acc', save_best_only=True, verbose=2)

In [7]:
# Train model on x1 and x2
merged_model.fit([x1, x2], y=y, batch_size=384, nb_epoch=num_epochs,
                 verbose=1, validation_split=0.1, shuffle=True, callbacks=[checkpoint])



Train on 363861 samples, validate on 40429 samples
Epoch 1/1


<keras.callbacks.History at 0x12f181fd0>

In [9]:
# Save Model

# serialize model to JSON
model_json = merged_model.to_json()
with open(output + "model.json", "w") as json_file:
    json_file.write(model_json)

# Load Model

In [4]:
from keras.models import model_from_json

# Load Model

# load json and create model
json_file = open(output+'model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights(output+"weights.h5")
print("Loaded model from disk")


# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate([x1,x2], y, verbose=1)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

  return cls(**config)


Loaded model from disk


In [5]:
# Import data
csv_test = data_loc + 'test.csv'
test_data = pd.read_csv(csv_test)

# # Tokenize text
# num_words = 200000  # The total number of word to incorperates in the search
# tk = text.Tokenizer(num_words=num_words)  # Create a tokenizer object

# max_len = 40 # The maximum length of a sequence
# tk.fit_on_texts(list(data.question1.values) +
#                 list(data.question2.values.astype(str)))

t_x1 = tk.texts_to_sequences(test_data.question1.values.astype(str))
t_x1 = sequence.pad_sequences(t_x1,maxlen=max_len)

t_x2 = tk.texts_to_sequences(test_data.question2.values.astype(str))
t_x2 = sequence.pad_sequences(t_x2,maxlen=max_len)

In [7]:
p_test = loaded_model.predict([t_x1,t_x2],batch_size=100,verbose=1)

   1600/2345796 [..............................] - ETA: 6792s

KeyboardInterrupt: 

In [None]:
# Predict
df_test = pd.read_csv(data_loc+'test.csv')

sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv(output + 'shitty_lstm.csv', index=False)