In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tqdm.auto import tqdm

# Register tqdm's pandas integration for this session.
tqdm.pandas()
# Never truncate long cell contents when a frame is displayed.
pd.set_option('display.max_colwidth', None)
# Use seaborn's dark-grid look for every figure drawn below.
sns.set_style(style='darkgrid')

In [None]:
# Load the training pairs straight from the zipped CSV. Later cells read the
# `question1`, `question2` and `is_duplicate` columns from this frame.
dtrain = pd.read_csv('../input/quora-question-pairs/train.csv.zip')
# Report (rows, columns) before previewing the first rows.
print(dtrain.shape)
dtrain.head()

In [None]:
# Load the test pairs. Later cells read the `question1`, `question2` and
# `test_id` columns from this frame.
dtest = pd.read_csv('../input/quora-question-pairs/test.csv')
# Report (rows, columns) before previewing the first rows.
print(dtest.shape)
dtest.head()

# Text Cleaning

In [None]:
%%time

# Lookup table with a `RawText` and a `CleanedText` column (both are read by
# later cells). NOTE(review): presumably produced by a separate cleaning
# notebook from the same train/test questions -- confirm the provenance.
all_ques = pd.read_csv('../input/qqp-cleaned/quora-ques-pair-all-ques.csv')
all_ques.head()

In [None]:
%%time

# Raw question text -> cleaned question text (later duplicates win).
text_map = dict(zip(all_ques['RawText'].values, all_ques['CleanedText'].values))

# Replace every raw question by its cleaned counterpart, train first and then
# test. The lookup is strict: a question missing from the table raises KeyError.
# NOTE: this overwrites the columns in place, so the cell is not re-runnable
# without reloading `dtrain` / `dtest`.
for _frame in (dtrain, dtest):
    for _col in ('question1', 'question2'):
        _frame[_col] = _frame[_col].apply(lambda raw: text_map[raw])

# Drop the mapping (and the loop helpers) so the namespace stays as before.
del text_map, _frame, _col

# Cross Validation

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split that holds out 1% of the rows for validation;
# the fixed seed keeps the split reproducible.
sss = StratifiedShuffleSplit(random_state=19, test_size=0.01, n_splits=1)
# Only a single split is generated, so take it directly from the generator.
train_index, valid_index = next(
    sss.split(
        dtrain[['question1', 'question2']].values,
        dtrain['is_duplicate'],
    )
)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

def evaluate_model(model, x_train, x_valid, y_train, y_valid, threshold=0.5):
    """Print a classification report for the training and validation sets.

    Args:
        model: Any object exposing ``predict(x)``. It may return hard class
            labels or probabilities (e.g. the ``(n, 1)`` output of a sigmoid
            head).
        x_train: Features passed to ``model.predict`` for the training report.
        x_valid: Features passed to ``model.predict`` for the validation report.
        y_train: Ground-truth labels for ``x_train``.
        y_valid: Ground-truth labels for ``x_valid``.
        threshold: Cut-off used to turn fractional predictions into 0/1
            labels. Ignored when ``predict`` already returns labels.

    Returns:
        None. The reports are printed.
    """
    def _predict_labels(features):
        # Normalise whatever `predict` returns into a 1-D label vector.
        preds = np.asarray(model.predict(features))
        # A single-column output, e.g. (n, 1), is flattened to (n,).
        if preds.ndim == 2 and preds.shape[1] == 1:
            preds = preds.ravel()
        # `classification_report` rejects continuous targets, so fractional
        # float outputs are treated as probabilities and thresholded.
        # Integer-valued outputs are assumed to be labels and left untouched.
        is_probability = (
            np.issubdtype(preds.dtype, np.floating)
            and not np.array_equal(preds, np.round(preds))
        )
        if is_probability:
            preds = (preds >= threshold).astype(int)
        return preds

    print('Train Set:')
    print()
    print(classification_report(y_train, _predict_labels(x_train)))

    print()
    print()

    print('Validation Set:')
    print()
    print(classification_report(y_valid, _predict_labels(x_valid)))

# Vectorization

In [None]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF with sublinear term-frequency scaling and no document-frequency
# pruning. The vocabulary is fitted on every cleaned question in the lookup
# table, with missing text replaced by the empty string.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 1),
    min_df=1,
    max_df=1.0,
)
vectorizer.fit(all_ques['CleanedText'].fillna('').values)

# Question-pair frames for each side of the hold-out split, re-indexed from 0.
x_train = dtrain.iloc[train_index][['question1', 'question2']].reset_index(drop=True)
x_valid = dtrain.iloc[valid_index][['question1', 'question2']].reset_index(drop=True)

# Matching label arrays, selected positionally.
y_train = dtrain['is_duplicate'].values[train_index]
y_valid = dtrain['is_duplicate'].values[valid_index]

# The lookup table is only needed to fit the vocabulary.
del all_ques

y_train.shape, y_valid.shape

In [None]:
def sparse_tensor(X):
    """Convert a SciPy sparse matrix into a row-major ordered ``tf.SparseTensor``.

    Args:
        X: A SciPy sparse matrix (anything exposing ``tocoo()``).

    Returns:
        A ``tf.SparseTensor`` with the same shape and values as ``X``, passed
        through ``tf.sparse.reorder``.
    """
    coo = X.tocoo()
    # (nnz, 2) array of [row, col] pairs. A plain ndarray replaces `np.mat`,
    # which was removed in NumPy 2.0. The cast makes the int64 index dtype
    # explicit (SciPy typically stores int32 indices).
    indices = np.stack([coo.row, coo.col], axis=1).astype(np.int64)
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

In [None]:
%%time

from scipy import sparse

# TF-IDF matrices for both questions of every split (missing text -> '').
# Evaluation order: train, validation, test; question1 before question2.
x_train_1, x_train_2 = (
    vectorizer.transform(x_train[col].fillna('')) for col in ('question1', 'question2')
)
x_valid_1, x_valid_2 = (
    vectorizer.transform(x_valid[col].fillna('')) for col in ('question1', 'question2')
)
x_test_1, x_test_2 = (
    vectorizer.transform(dtest[col].fillna('')) for col in ('question1', 'question2')
)

# NOTE: `x_train` / `x_valid` are rebound from DataFrames to [question1, question2]
# lists of sparse tensors here, so this cell cannot be re-run on its own.
x_train = [sparse_tensor(mat) for mat in (x_train_1, x_train_2)]
x_valid = [sparse_tensor(mat) for mat in (x_valid_1, x_valid_2)]
x_test = [sparse_tensor(mat) for mat in (x_test_1, x_test_2)]

# Train and validation rows stacked back together (train rows first), with
# labels concatenated in the same order.
x = [
    sparse_tensor(sparse.vstack(pair))
    for pair in ([x_train_1, x_valid_1], [x_train_2, x_valid_2])
]
y = np.concatenate([y_train, y_valid])

# Release the intermediate SciPy matrices.
del x_train_1, x_train_2, x_valid_1, x_valid_2, x_test_1, x_test_2

x_train[0].shape, x_valid[0].shape, x_test[0].shape

# Modelling

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers, utils, callbacks, optimizers, regularizers

In [None]:
def euclidean_distance(vectors):
    """Row-wise Euclidean distance between two batches of feature vectors.

    Args:
        vectors: A pair ``(featsA, featsB)`` of tensors that can be subtracted
            element-wise.

    Returns:
        A tensor of distances reduced over axis 1, with that axis kept as
        size 1.
    """
    left, right = vectors
    squared_diff = K.square(left - right)
    squared_norm = K.sum(squared_diff, axis=1, keepdims=True)
    # Floor at epsilon so the square root is never taken at exactly zero.
    floored = K.maximum(squared_norm, K.epsilon())
    return K.sqrt(floored)

def cosine_similarity(vectors):
    """Row-wise cosine similarity between two batches of feature vectors.

    Args:
        vectors: A pair ``(featsA, featsB)`` of tensors with matching shapes.

    Returns:
        A tensor of similarities reduced over the last axis, with that axis
        kept as size 1.
    """
    (featsA, featsB) = vectors
    featsA = K.l2_normalize(featsA, axis=-1)
    featsB = K.l2_normalize(featsB, axis=-1)
    # The dot product of unit vectors is the cosine. It must be a sum over the
    # feature axis: a mean would divide the result by the feature dimension.
    return K.sum(featsA * featsB, axis=-1, keepdims=True)

In [None]:
class SiameseNetwork(Model):
    """Two-branch network that scores a pair of inputs with a sigmoid output.

    Both inputs pass through the same (weight-shared) feature extractor. The
    two feature vectors are concatenated and fed to a single sigmoid unit.
    The functional graph is stored on ``self.model``.

    Args:
        inputShape: Shape of one input sample, either a tuple or a bare int
            for flat vectors.
        featExtractorConfig: Iterable of hidden-layer widths; one
            Dense -> Dropout -> BatchNormalization -> ReLU stage is built per
            entry.
    """

    def __init__(self, inputShape, featExtractorConfig):
        super().__init__()

        inputShape = self._as_shape(inputShape)

        inpA = layers.Input(shape=inputShape)
        inpB = layers.Input(shape=inputShape)
        # One extractor instance applied to both inputs, so weights are shared.
        featureExtractor = self.build_feature_extractor(inputShape, featExtractorConfig)
        featsA = featureExtractor(inpA)
        featsB = featureExtractor(inpB)
        # The pair is merged by concatenation; no explicit distance is computed.
        merged = layers.Concatenate()([featsA, featsB])
        outputs = layers.Dense(1, activation="sigmoid")(merged)
        self.model = Model(inputs=[inpA, inpB], outputs=outputs)

    @staticmethod
    def _as_shape(inputShape):
        """Return ``inputShape`` as a tuple, wrapping a bare integer."""
        if isinstance(inputShape, (int, np.integer)):
            return (int(inputShape),)
        return inputShape

    def build_feature_extractor(self, inputShape, featExtractorConfig):
        """Build the shared ``Sequential`` branch named ``'feature_extractor'``.

        Args:
            inputShape: Shape of one input sample (tuple or bare int).
            featExtractorConfig: Iterable of Dense layer widths.

        Returns:
            A ``Sequential`` model with one
            Dense -> Dropout(0.5) -> BatchNormalization -> ReLU stage per width.
        """
        layers_config = [layers.Input(self._as_shape(inputShape))]
        for n_units in featExtractorConfig:
            layers_config.extend([
                layers.Dense(n_units),
                layers.Dropout(0.5),
                layers.BatchNormalization(),
                layers.Activation('relu'),
            ])

        return Sequential(layers_config, name='feature_extractor')

    def call(self, x, training=None):
        """Run the wrapped functional model on ``x`` (a pair of inputs).

        ``training`` is forwarded explicitly so that Dropout and
        BatchNormalization inside the wrapped model see the right mode.
        """
        return self.model(x, training=training)

# Input width is the second dimension of the first training tensor; the
# extractor has a single 100-unit hidden stage.
model = SiameseNetwork(
    featExtractorConfig=[100],
    inputShape=x_train[0].shape[1],
)
# Binary cross-entropy on the sigmoid output, Adam at a 1e-4 learning rate.
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Summarise the shared branch first (looked up by the explicit name it is
# built with), then the full two-input graph, and finally render the diagram.
siamese_graph = model.model
siamese_graph.get_layer('feature_extractor').summary()
siamese_graph.summary()
del siamese_graph
utils.plot_model(model.model, expand_nested=True, show_shapes=True)

In [None]:
# Stop after 5 epochs without a better validation loss and roll back to the
# best weights seen.
es = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Cut the learning rate tenfold after 2 stagnant epochs, down to a floor of 1e-10.
rlp = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.1,
    patience=2,
    min_lr=1e-10,
    verbose=1,
)

# Up to 100 epochs; in practice early stopping decides when training ends.
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_data=(x_valid, y_valid),
    callbacks=[es, rlp],
)

In [None]:
# Two stacked panels: accuracy on top, loss below, one row per epoch.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 8))
df = pd.DataFrame.from_dict(history.history)

df.loc[:, ['accuracy', 'val_accuracy']].plot(ax=ax[0])
ax[0].set_title('Model Accuracy', fontsize=12)

df.loc[:, ['loss', 'val_loss']].plot(ax=ax[1])
ax[1].set_title('Model Loss', fontsize=12)

# The trailing semicolon suppresses the text repr of the title object.
fig.suptitle('Siamese Network: Learning Curve', fontsize=18);

In [None]:
%%time

# Flatten the model's test-set output into one column next to the test ids and
# write it out without the index.
submission = pd.DataFrame(
    {
        'test_id': dtest['test_id'].values,
        'is_duplicate': model.predict(x_test, batch_size=32).ravel(),
    }
)
submission.to_csv('submission.csv', index=False)