## Please upvote this notebook if you like the approach

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the Quora question-pairs training set straight from the zipped CSV.
df_train = pd.read_csv("../input/quora-question-pairs/train.csv.zip")

In [None]:
# Preview the first five training rows.
df_train.head(5)

In [None]:
# Number of training rows before any filtering.
len(df_train)

In [None]:
# Keep only the rows in which both questions are real strings (a missing
# question is not a ``str``). The explicit ``.copy()`` turns the filtered
# result into an independent frame, so adding or overwriting columns on it in
# later cells does not trigger pandas' SettingWithCopyWarning.
def _is_text(value):
    """Return True when ``value`` is a ``str``."""
    return isinstance(value, str)

both_are_text = df_train['question1'].apply(_is_text) & df_train['question2'].apply(_is_text)
df_train = df_train[both_are_text].copy()

In [None]:
# Number of training rows left after dropping non-string questions.
len(df_train)

### Meta-features
* `wms`: share of distinct non-stopword tokens that the two questions have in common
* `wcd`: absolute difference between the two questions' non-stopword token counts

In [None]:
import re, string, six

from nltk.corpus import stopwords
import pandas as pd
import numpy as np

# Matches (and captures) a single punctuation character: the ASCII set from
# ``string.punctuation`` plus a handful of typographic symbols.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` on whitespace, emitting each matched punctuation mark as its own token."""
    # Pad every punctuation mark with spaces so that ``split`` separates it
    # from the neighbouring word.
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

def clean_text(s):
    """Normalise a question string before it is fed to the embedding layer.

    Every character other than ASCII letters, digits, spaces and the
    punctuation marks ``, ? " ' .`` is removed, and the remainder is
    lower-cased.

    Args:
        s: Raw question text. Non-string values (for example the NaN pandas
            uses for a missing question) are accepted.

    Returns:
        The cleaned, lower-cased string, or ``""`` when ``s`` is not a ``str``.
    """
    # Explicit type guard instead of a bare ``except:``, so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently mapped to "".
    if not isinstance(s, str):
        return ""
    # After the substitution only ASCII characters remain, so the former
    # UTF-8 encode/decode round trip could not change anything and is dropped.
    return re.sub(r'[^A-Za-z0-9,?"\'. ]+', '', s).lower()

# English stop-word list from NLTK, stored as a set for fast membership tests.
stops = set(stopwords.words("english"))

def word_match_share(row):
    """Share of distinct non-stopword tokens that the two questions have in common.

    Computed as ``2 * |Q1 & Q2| / (|Q1| + |Q2|)``, where ``Q1`` and ``Q2`` are
    the sets of distinct tokens of each question after stop-word removal.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``'question1'`` and ``'question2'``.

    Returns:
        A float between 0 and 1; ``0.0`` when either question is not a string
        or contains no non-stopword token.
    """
    q1, q2 = row['question1'], row['question2']
    # Non-string questions (e.g. NaN for a missing value) score 0. This is
    # checked explicitly instead of through a bare ``except:``, which also hid
    # every unrelated error.
    if not isinstance(q1, str) or not isinstance(q2, str):
        return 0.0

    # The stop-word test is case-insensitive, matching word_count_diff;
    # previously a capitalised stop-word such as "What" was never filtered.
    q1words = {word for word in tokenize(q1) if word.lower() not in stops}
    q2words = {word for word in tokenize(q2) if word.lower() not in stops}
    if not q1words or not q2words:
        return 0.0

    # Every shared word is counted once for each question, hence the factor 2.
    shared_words = q1words & q2words
    return 2 * len(shared_words) / (len(q1words) + len(q2words))

def word_count_diff(row, missing_value=50):
    """Absolute difference between the two questions' non-stopword token counts.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``'question1'`` and ``'question2'``.
        missing_value: Value returned when either question is not a string.
            Defaults to 50, the sentinel this function has always used.

    Returns:
        ``abs(n1 - n2)`` where ``n1`` and ``n2`` are the numbers of tokens
        whose lower-cased form is not a stop-word, or ``missing_value`` when a
        question is not a string.
    """
    q1, q2 = row['question1'], row['question2']
    # Explicit guard for non-string questions (e.g. NaN) instead of a bare
    # ``except:`` that would also swallow unrelated errors.
    if not isinstance(q1, str) or not isinstance(q2, str):
        return missing_value

    def content_token_count(text):
        """Count the tokens of ``text`` that are not stop-words."""
        return sum(1 for token in tokenize(text) if token.lower() not in stops)

    return abs(content_token_count(q1) - content_token_count(q2))

In [None]:
# Add the two hand-made features as new columns, one row-wise pass per feature.
for feature_name, feature_fn in (('wms', word_match_share), ('wcd', word_count_diff)):
    df_train[feature_name] = df_train.apply(feature_fn, axis=1)

In [None]:
# Check that the new `wms` and `wcd` columns are present.
df_train.head()

In [None]:
# Mean token-count difference for each `is_duplicate` class.
# The aggregation is given by name: passing the ``np.mean`` callable to
# ``agg`` is deprecated in recent pandas releases and emits a FutureWarning.
df_train.groupby(['is_duplicate']).agg({'wcd': 'mean'}).reset_index()

In [None]:
# Replace both question columns with their cleaned, lower-cased text.
for question_col in ('question1', 'question2'):
    df_train[question_col] = df_train[question_col].apply(clean_text)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Pre-trained NNLM sentence-embedding module from TF Hub (128-dimensional
# according to the module name), wrapped as a Keras layer.
hub_url = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
# trainable=False keeps the embedding weights frozen during training.
embed = hub.KerasLayer(hub_url, trainable=False)

In [None]:
def euc_dist(x, y):
    """Return ``sqrt((x - y) . (x - y))``; for 1-D arrays this is the Euclidean distance."""
    delta = x - y
    squared_length = np.dot(delta, delta)
    return np.sqrt(squared_length)

In [None]:
# Pull the model inputs out of the frame as plain Python lists.
X_train_q1 = df_train['question1'].tolist()
X_train_q2 = df_train['question2'].tolist()
X_wms = df_train['wms'].tolist()
X_wcd = df_train['wcd'].tolist()
# The target is the inverted label: 1 means "not a duplicate". The model
# output is flipped back with `1 - prediction` before the submission is written.
y_train = (1-df_train['is_duplicate']).tolist()

In [None]:
# Number of question pairs available for training/validation.
len(X_train_q1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for validation, stratified on the label; all five
# parallel lists are split together in one call.
# NOTE: X_train_q1, X_train_q2 and y_train are rebound here, so running this
# cell twice splits the already-reduced training lists again.
(
    X_train_q1, X_test_q1,
    X_train_q2, X_test_q2,
    X_wms_train, X_wms_test,
    X_wcd_train, X_wcd_test,
    y_train, y_test,
) = train_test_split(
    X_train_q1,
    X_train_q2,
    X_wms,
    X_wcd,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [None]:
# Spot-check a few token-count-difference values from the training split.
X_wcd_train[:4]

In [None]:
import tensorflow as tf
# tf.config.run_functions_eagerly(False)
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

In [None]:
# Two raw-text inputs (one per question) plus the two hand-made scalar features.
input1 = Input(shape=(), dtype=tf.string)
input2 = Input(shape=(), dtype=tf.string)
# float32 instead of float16: half precision only quantised the features
# before they were concatenated with the float32 embedding distance below.
input_wms = Input(shape=(1,), dtype=tf.float32)
input_wcd = Input(shape=(1,), dtype=tf.float32)

# The same frozen TF-Hub layer embeds both questions, so the weights are shared.
embed1 = embed(input1)
embed2 = embed(input2)

# Euclidean distance between the two sentence embeddings; keepdims=True keeps
# it as one column per sample so it can be concatenated with the other features.
dist = Lambda(lambda x: K.sqrt(K.sum(K.square(x[0] - x[1]), axis=-1, keepdims=True)))([embed1,embed2])

# Three-value feature vector per pair: [embedding distance, wms, wcd].
concat = Concatenate(axis=1)([dist, input_wms, input_wcd])

hidden = Dense(9, activation="relu", kernel_regularizer=l2(1e-4))(concat)

# Single sigmoid unit. The training target is `1 - is_duplicate`, so the output
# is the predicted probability that the pair is NOT a duplicate.
out = Dense(1, activation="sigmoid", kernel_regularizer=l2(1e-4))(hidden)
model = Model(inputs=[input1, input2, input_wms, input_wcd], outputs=out)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Binary cross-entropy on the single sigmoid output.
# NOTE: the LearningRateScheduler callback passed to fit() sets the learning
# rate from step_decay() every epoch, so the 1e-3 given here is only the
# optimizer's initial value.
model.compile(optimizer=Adam(1e-3), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# callbacks defined

# learning rate schedule
def step_decay(epoch):
    """Return the learning rate to use for epoch index ``epoch``.

    The rate is ``0.003 * 0.5 ** ((epoch + 1) / 3)``: it starts from 0.003 and
    halves every three epochs. The exponent is not floored, so despite the
    function's name the rate decays smoothly from one epoch to the next.
    """
    base_rate, decay_factor, epochs_per_halving = 0.003, 0.5, 3
    exponent = (1 + epoch) / epochs_per_halving
    return base_rate * (decay_factor ** exponent)

lrate_scheduler = LearningRateScheduler(step_decay)
# Patience has to be smaller than the number of epochs for early stopping to
# be able to fire: the former patience=5 with epochs=5 could never trigger.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
# Keeps only the checkpoint with the lowest validation loss seen so far.
model_chkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

history = model.fit(x=[np.array(X_train_q1), np.array(X_train_q2), np.array(X_wms_train), np.array(X_wcd_train)],
                    y=np.array(y_train),
                    batch_size=128,
                    epochs=5,
                    validation_data=([np.array(X_test_q1), np.array(X_test_q2), np.array(X_wms_test), np.array(X_wcd_test)], np.array(y_test)),
                    callbacks=[lrate_scheduler, early_stop, model_chkpoint])

# Continue with the weights of the best validation epoch rather than whatever
# the final epoch left in memory; previously the checkpoint was written but
# never read back before predicting.
model.load_weights('best_model.h5')

In [None]:
# Load the competition test set that will be scored for the submission.
df_test = pd.read_csv("../input/quora-question-pairs/test.csv")

In [None]:
# Preview the raw test rows.
df_test.head()

In [None]:
# Compute the same two hand-made features for the test set. Unlike the
# training frame, df_test is not filtered for non-string questions first.
for feature_name, feature_fn in (('wms', word_match_share), ('wcd', word_count_diff)):
    df_test[feature_name] = df_test.apply(feature_fn, axis=1)

In [None]:
# Replace both test question columns with their cleaned, lower-cased text.
for question_col in ('question1', 'question2'):
    df_test[question_col] = df_test[question_col].apply(clean_text)

In [None]:
# Check the cleaned questions and the new feature columns.
df_test.head()

In [None]:
# Pull the test inputs out as plain lists.
# NOTE: X_test_q1 / X_test_q2 previously held the validation split produced by
# train_test_split; from here on they refer to the competition test set.
X_test_q1 = df_test['question1'].tolist()
X_test_q2 = df_test['question2'].tolist()
X_test_wms = df_test['wms'].tolist()
X_test_wcd = df_test['wcd'].tolist()

In [None]:
from tqdm import tqdm
preds = []
batch_size = 512
# Walk the test set by start offset so the final (possibly shorter) batch is
# included and an empty batch can never be produced. The former step count of
# ``len // batch_size + 1`` sent an empty trailing batch to ``model.predict``
# whenever the number of rows was an exact multiple of ``batch_size``.
for start in tqdm(range(0, len(X_test_q1), batch_size)):
    stop = start + batch_size
    # Only the current slice is converted to NumPy arrays.
    X_test_q1_batch = np.array(X_test_q1[start:stop])
    X_test_q2_batch = np.array(X_test_q2[start:stop])
    X_test_wms_batch = np.array(X_test_wms[start:stop])
    X_test_wcd_batch = np.array(X_test_wcd[start:stop])
    # verbose=0: tqdm already shows progress, so suppress Keras' own
    # per-call progress output.
    preds.extend(model.predict([X_test_q1_batch, X_test_q2_batch, X_test_wms_batch, X_test_wcd_batch], verbose=0))

In [None]:
# The model was trained on the inverted label (1 - is_duplicate), so flip each
# score back; x[0] takes the first element of each prediction.
# NOTE: this rebinds `preds`, so the cell cannot be re-run without first
# regenerating the predictions.
preds = [1 - x[0] for x in preds]

In [None]:
# Attach the flipped predictions to the test frame, one value per row.
df_test['is_duplicate'] = preds

In [None]:
# Drop the question text and the two feature columns ahead of writing the submission.
submission_excluded = ['question1', 'question2', 'wms', 'wcd']
df_test = df_test.drop(columns=submission_excluded)

In [None]:
# Write the submission file with `test_id` as the index column.
df_test.set_index('test_id').to_csv("submission.csv")