In [None]:
import csv
import os
import pickle
import random
import re

import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt

from collections import defaultdict
from string import punctuation
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, Activation, LSTM, Lambda, Bidirectional, BatchNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import tensorflow.keras.backend as K

In [None]:
# tf.test.gpu_device_name() returns a device-name string (or an empty string),
# so len() of it reports the string length rather than a GPU count.
# Count the physical GPU devices instead.
print("Num of GPUs available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# List the devices TensorFlow can see; the bare last expression is rendered
# as the cell output.
# NOTE(review): `tensorflow.python.client.device_lib` lives under the private
# `tensorflow.python` namespace; `tf.config.list_physical_devices()` looks like
# the public alternative — confirm before relying on this cell.
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

In [None]:
# tf.test.is_gpu_available()
# tf.config.list_physical_devices('GPU')

In [None]:
# !head -n 10 ../input/glove6b100dtxt/glove.6B.100d.txt

In [None]:
# Input locations for the question-pair data set.
# data_dir = ''
data_dir = '../input/quora-question-pairs/'
train_file = data_dir + 'train.csv.zip'
test_file = data_dir + 'test.csv'

# Pre-trained GloVe vectors used to initialise the embedding layer.
# NOTE(review): the directory name mentions "100d" while the file is the 200d
# variant — confirm the file really exists under this directory.
# embedding_dir = ''
embedding_dir = '../input/glovetwitter27b100dtxt/'
embedding_file = embedding_dir + 'glove.twitter.27B.200d.txt'

# Location of previously dumped artefacts (tokenizer / embedding matrix) that
# the commented-out "alternative dumped load" cells read from.
dump_model_dir = '../input/quora-pairs-model/'
tokenizer_file = dump_model_dir + 'tokenizer.pickle'
embedding_matrix_file = dump_model_dir + 'embedding_matrix.pickle'

In [None]:
def preprocess_text(text, lowercasing=True, remove_punctuation=False, remove_stopwords=False, stem_words=False):
    """Normalise one question string before tokenisation.

    Steps, in order: optional lower-casing, replacing every character outside
    a small whitelist with a space, expanding common English contractions,
    spacing out the surviving operators, expanding "<digits>k" to thousands,
    and collapsing runs of whitespace. Leading/trailing single spaces are
    not stripped.

    Args:
        text: The raw text (must be a ``str``).
        lowercasing: Lower-case the text first. The contraction patterns are
            lower-case literals, so they only fire on lower-case input.
        remove_punctuation: Drop every character found in
            ``string.punctuation`` after the substitutions.
        remove_stopwords: Drop NLTK English stop words.
        stem_words: Stem every remaining word with the Snowball stemmer.

    Returns:
        The cleaned text.
    """
    if lowercasing:
        text = text.lower()

    # Whitelist: letters, digits and ^ , ! . / ' + - : ; < = are kept; every
    # other character (including '?', '#', '(', ')' and '*') becomes a space.
    # The hyphen is escaped so that it is a literal rather than a range bound.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-:;<=]", " ", text)

    # Contractions. Order matters: the specific forms ("what's", "can't")
    # must run before the generic "'s" / "n't" rules.
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"e-mail", "email", text)
    # text = re.sub(r"\'d", " would ", text)

    # Separators are blanked; operators are padded so they become own tokens.
    # No rules are needed for '?', '#', '(', ')' or '*': the whitelist above
    # has already replaced them with spaces.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)

    # "5k" -> "5000". The trailing word boundary keeps units such as "5kg"
    # or "10kb" from being rewritten to "5000g" / "10000b".
    text = re.sub(r"(\d+)(k)\b", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    # Only join a stand-alone "j k"; without the boundaries "raj kumar"
    # would be glued into "rajkumar".
    text = re.sub(r"\bj k\b", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    if remove_punctuation:
        text = "".join([c for c in text if c not in punctuation])

    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        text = " ".join([w for w in text.split() if w not in stop_words])

    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join([stemmer.stem(word) for word in text.split()])

    return text

In [None]:
# Read the training pairs; missing cells are replaced by the string 'empty'.
df_train = pd.read_csv(train_file, encoding='utf-8').fillna('empty')

train_q1 = df_train.question1.values
train_q2 = df_train.question2.values
train_labels = df_train.is_duplicate.values

# Clean both question columns with preprocess_text (tqdm shows progress).
train_texts1 = [preprocess_text(question) for question in tqdm(train_q1, leave=True)]
train_texts2 = [preprocess_text(question) for question in tqdm(train_q2, leave=True)]

In [None]:
# test_texts1, test_texts2 = [], []
# test_ids = []

# df_test = pd.read_csv(test_file, encoding='utf-8')
# df_test = df_test.fillna('empty')
# test_q1 = df_test.question1.values
# test_q2 = df_test.question2.values
# test_ids = df_test.test_id.values

# pbar = tqdm(test_q1, leave=True)
# for text in pbar:
#     test_texts1.append(preprocess_text(text))
    
# pbar = tqdm(test_q2, leave=True)
# for text in pbar:
#     test_texts2.append(preprocess_text(text))

In [None]:
# ################################
# # Alternative dumped load  way #
# ################################

# tokenizer = None

# with open(tokenizer_file, 'rb') as handle:
#     tokenizer = pickle.load(handle)

# word_index = tokenizer.word_index
# print(f"{len(word_index)} unique tokens are found")

In [None]:
# Vocabulary cap handed to the Keras Tokenizer via `num_words`.
max_words = 204000

# tokenizer = Tokenizer(num_words=max_words, oov_token=-1)
tokenizer = Tokenizer(num_words=max_words)
# tokenizer.fit_on_texts(train_texts1 + train_texts2 + test_texts1 + test_texts2)
# Fit the vocabulary on both cleaned training question columns.
tokenizer.fit_on_texts(train_texts1 + train_texts2)

# Word -> integer index mapping built by the tokenizer.
word_index = tokenizer.word_index
print(f"{len(word_index)} unique tokens are found")

In [None]:
# Convert each cleaned question into a list of integer word indices using the
# fitted tokenizer.
train_sequences1 = tokenizer.texts_to_sequences(train_texts1)
train_sequences2 = tokenizer.texts_to_sequences(train_texts2)
# test_sequences1 = tokenizer.texts_to_sequences(test_texts1)
# test_sequences2 = tokenizer.texts_to_sequences(test_texts2)

print("finished")

In [None]:
# Persist the fitted tokenizer to 'tokenizer.pickle' in the working directory
# so the same word -> index mapping can be reloaded later.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("dumped")

In [None]:
# Fixed sequence length passed to pad_sequences as `maxlen` and used as the
# model's input length.
max_sequence_length = 64

# Bring every index sequence to exactly max_sequence_length entries.
train_data1 = pad_sequences(train_sequences1, maxlen=max_sequence_length)
train_data2 = pad_sequences(train_sequences2, maxlen=max_sequence_length)

# test_data1 = pad_sequences(test_sequences1, maxlen=max_sequence_length)
# test_data2 = pad_sequences(test_sequences2, maxlen=max_sequence_length)

print('Shape of train data:', train_data1.shape)
print('Shape of train labels:', train_labels.shape)

# print('Shape of test data:', test_data2.shape)
# print('Shape of test ids:', test_ids.shape)

In [None]:
# Symmetric augmentation: every pair is present once as (q1, q2) and once as
# (q2, q1), so the second half of each array mirrors the first half. The
# labels are repeated to match.
data1_train = np.concatenate((train_data1, train_data2), axis=0)
data2_train = np.concatenate((train_data2, train_data1), axis=0)
labels_train = np.concatenate((train_labels, train_labels), axis=0)

print('Shape of data1 train:', data1_train.shape)
print('Shape of data2 train:', data2_train.shape)

In [None]:
embedding_dim = 200
embeddings_index = {}

# First pass: count the lines so the progress bar has a total. The file is
# opened with the same encoding/error policy as the parsing pass and closed
# by the context manager (the bare open() used previously was never closed and
# relied on the platform default encoding).
with open(embedding_file, "r", errors='ignore', encoding='utf-8') as f:
    file_total_lines = sum(1 for _ in f)

# Second pass: each line is "<token> <v1> ... <v_embedding_dim>". The last
# embedding_dim fields are the vector; whatever precedes them is the token.
with open(embedding_file, "r", errors='ignore', encoding='utf-8') as f:
    # The description is set once; updating it per line forces a redraw of
    # the bar on every iteration, and tqdm already shows the running count.
    pbar = tqdm(f, leave=True, total=file_total_lines, desc="processing embedding lines")
    for line in pbar:
        values = line.split()
        if len(values) <= embedding_dim:
            # Blank or truncated line: no token plus full vector to read.
            continue
        word = ''.join(values[:-embedding_dim])
        coefs = np.asarray(values[-embedding_dim:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# ################################
# # Alternative dumped load  way #
# ################################

# embedding_matrix = None

# with open(embedding_matrix_file, 'rb') as handle:
#     embedding_matrix = pickle.load(handle)

# print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_matrix, axis=1) == 0)))

In [None]:
# Row i of the matrix holds the pre-trained vector of the word with tokenizer
# index i; rows for words without a pre-trained vector stay all-zero.
num_words = min(max_words, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i >= num_words:
        # word_index is not truncated to the tokenizer's num_words, so once
        # the vocabulary exceeds max_words some indices fall outside the
        # matrix. Skip them instead of raising IndexError.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_matrix, axis=1) == 0)))

In [None]:
# Persist the assembled embedding matrix to 'embedding_matrix.pickle' in the
# working directory so the GloVe parsing step can be skipped on later runs.
with open('embedding_matrix.pickle', 'wb') as handle:
    pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("dumped")

In [None]:
# Toggle for re-weighting the two classes.
re_weight = True

# Defaults when re-weighting is off: no class weights, unit sample weights.
class_weight = None
samples_weight = np.ones(len(labels_train))

if re_weight:
    # The same weights are exposed twice: as a label -> weight dict and as a
    # per-sample array aligned with labels_train.
    class_weight = {0: 1.309033281, 1: 0.471544715}
    for label, weight in class_weight.items():
        samples_weight[labels_train == label] = weight

print(f"Done with RE { 'enabled' if re_weight else 'disabled' }")

In [None]:
# model = tf.keras.models.load_model('../input/quora-pairs-model/lstm_150_100_0.13_0.18.h5')

In [None]:
# Hyperparameters of the siamese BiLSTM classifier.
num_lstm = 250
num_dense = 150
rate_drop_lstm = 0.2
rate_drop_dense = 0.2

# not needed if already declared above
# max_words = 204000
# max_sequence_length = 64
# embedding_dim = 200
# num_words = min(max_words, len(word_index)) + 1

# Tag derived from the hyperparameters; later cells use it as the file-name
# stem for saved models.
lstm_struct = 'lstm_{:d}_{:d}_{:.2f}_{:.2f}'.format(num_lstm, num_dense, \
    rate_drop_lstm, rate_drop_dense)

print(lstm_struct)

# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False).
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False
)

# A single BiLSTM layer instance; it is applied to both questions below, so
# both branches share its weights.
bilstm_layer = Bidirectional(LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=0))

# One integer-index input per question.
seq1 = Input(shape=(max_sequence_length,), dtype='int32')
seq2 = Input(shape=(max_sequence_length,), dtype='int32')

emb1 = embedding_layer(seq1)
emb2 = embedding_layer(seq2)

bilstm1 = bilstm_layer(emb1)
bilstm2 = bilstm_layer(emb2)

# Classification head: concatenate both encodings, then
# BatchNorm -> Dropout -> Dense(relu) -> BatchNorm -> Dropout -> sigmoid unit.
merged = Concatenate()([bilstm1, bilstm2])
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

merged = Dense(num_dense, activation='relu')(merged)
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

# Single sigmoid output for the binary is_duplicate label.
preds = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[seq1, seq2], outputs=preds)

model.summary()

In [None]:
# Binary cross-entropy objective, Nadam optimizer, accuracy tracked as metric.
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

# Callbacks used by the training cell:
# - ModelCheckpoint writes the full model (not just weights) to bst_model_path,
#   keeping only the best one seen.
# - ReduceLROnPlateau is configured with a patience of 10 epochs.
# - EarlyStopping watches val_loss with a patience of 40 epochs.
bst_model_path = lstm_struct + '.h5' 
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=False)
reduce_lr_plateau = ReduceLROnPlateau(patience=10)
early_stopping = EarlyStopping(monitor='val_loss', patience=40)

In [None]:
# --- Leak-free validation split ---------------------------------------------
# data1_train / data2_train hold every pair twice: row i is (q1, q2) and row
# i + n_pairs is its mirror (q2, q1). `validation_split` takes the trailing
# rows of the arrays, which are all mirrored copies of pairs that stay in the
# training portion, so validation would be scored on already-seen pairs.
# Instead, hold out whole pairs (both orientations) for validation.
n_pairs = len(labels_train) // 2
split_rng = np.random.RandomState(1234)  # fixed seed -> reproducible split
pair_order = split_rng.permutation(n_pairs)
n_val_pairs = int(n_pairs * 0.1)
val_pairs, fit_pairs = pair_order[:n_val_pairs], pair_order[n_val_pairs:]
val_rows = np.concatenate((val_pairs, val_pairs + n_pairs))
fit_rows = np.concatenate((fit_pairs, fit_pairs + n_pairs))

# Only `sample_weight` is passed. samples_weight already encodes the per-class
# weights, so also passing `class_weight` would apply the same re-weighting a
# second time. The validation tuple carries its own weights so val_loss stays
# weighted the same way as the training loss.
hist = model.fit(
    [data1_train[fit_rows], data2_train[fit_rows]], labels_train[fit_rows],
    validation_data=(
        [data1_train[val_rows], data2_train[val_rows]],
        labels_train[val_rows],
        samples_weight[val_rows],
    ),
    sample_weight=samples_weight[fit_rows],
    shuffle=True, epochs=1, batch_size=2048,
    callbacks=[model_checkpoint, reduce_lr_plateau, early_stopping])

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        string: Metric name; ``'val_' + string`` is plotted alongside it.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

In [None]:
# Depending on the Keras version, the accuracy metric is recorded under
# 'acc' or 'accuracy'; use whichever key this run produced instead of
# failing with a KeyError.
acc_key = 'acc' if 'acc' in hist.history else 'accuracy'
plot_graphs(hist, acc_key)
plot_graphs(hist, 'loss')

In [None]:
# Depending on the Keras version, validation accuracy is recorded under
# 'val_acc' or 'val_accuracy'; pick the key that exists in this run.
val_acc_key = 'val_acc' if 'val_acc' in hist.history else 'val_accuracy'

print("best acc: {}".format(max(hist.history[val_acc_key])))
print("worst acc: {}".format(min(hist.history[val_acc_key])))

print("best loss: {}".format(min(hist.history['val_loss'])))
print("worst loss: {}".format(max(hist.history['val_loss'])))

In [None]:
# Save the model as trained at the end of fit() (full model, then weights
# only). File names are derived from lstm_struct.
model.save(lstm_struct + '_2.h5')
model.save_weights(lstm_struct + '_weights.h5')

<a href="lstm_250_150_0.20_0.20_weights.h5"> Download Model Weights File </a>

<a href="lstm_250_150_0.20_0.20_2.h5"> Download Model File </a>

<a href="tokenizer.pickle"> Download Tokenizer Pickle File </a>

In [None]:
# import pickle

# model = tf.keras.models.load_model('lstm_150_100_0.13_0.18-old.h5')

# tokenizer = None

# with open('tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

# print(f"{len(tokenizer.word_index)} unique tokens are found")

In [None]:
max_sequence_length = 64

def predict(text1, text2):
    """Score a pair of raw texts with the trained model.

    Each text is cleaned with preprocess_text, converted to indices with the
    fitted tokenizer and padded to max_sequence_length. The model is evaluated
    on both orderings of the pair and the two outputs are averaged.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        The averaged output of the two ``model.predict`` calls.
    """
    def _encode(raw_text):
        # Single-element batch: clean -> index sequence -> padded array.
        cleaned = [preprocess_text(raw_text)]
        return pad_sequences(tokenizer.texts_to_sequences(cleaned), maxlen=max_sequence_length)

    padded1 = _encode(text1)
    padded2 = _encode(text2)

    forward = model.predict([padded1, padded2], batch_size=8192, verbose=1)
    backward = model.predict([padded2, padded1], batch_size=8192, verbose=1)

    return (forward + backward) / 2


# predict("when the sun rises?", "when the sun sets?")
# predict("when benjamin franklin died?", "when adolf hitler died?")
# predict("when i wake up today?", "when i brush my teeth today?")
# predict("when i wake up today?", "when i sleep today?")
# # print(preds)