# Paraphrase-Detection using Feature Fusion Network
Paraphrase detection is the task of examining two text entities (e.g., sentences) and determining whether they have the same meaning. Achieving high accuracy on this task requires thorough syntactic and semantic analysis of both text entities.

## What is Paraphrase?
In simple terms, a paraphrase is an alternative wording that expresses the same meaning.

![text_similarity.png](text_similarity.png)

## Quora Question Pairs Dataset
The dataset contains over 400,000 lines of potential duplicate question pairs. Each line contains the IDs of the two questions in the pair, the full text of each question, and a binary value that indicates whether the pair is truly a duplicate.

The dataset can be downloaded from [Quora Question Pairs Dataset](https://www.kaggle.com/quora/question-pairs-dataset).

In [None]:
# Importing packages
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import string
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Permute, dot, add, concatenate
from tensorflow.keras.layers import Embedding,Input, Dense, Dropout, Reshape, BatchNormalization, TimeDistributed, Lambda, Concatenate,concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,ReduceLROnPlateau
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,ReduceLROnPlateau
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix
import spacy
# Install the small English spaCy pipeline (pinned to release 3.0.0) from its GitHub archive.
!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
# Load the pipeline once; clean_question() calls `sp` to tokenize and lemmatize each question.
sp = spacy.load('en_core_web_sm')
# import nltk
# nltk.download("punkt")

## Read Dataset

In [None]:
# Base directory: questions.csv is read from here and the model checkpoint is written here.
project_path = 'paraphrase_detection/'

In [None]:
# Load only the first 10,000 question pairs, then preview the first rows.
data = pd.read_csv(
    f"{project_path}questions.csv",
    nrows=10000,
)
data.head()

## Preprocess Data

In [None]:
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)


def clean_question(text):
    """Normalize one question into a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline `sp`; each token's
    lemma is lower-cased and stripped of punctuation, and a token is kept only
    if what remains is longer than one character and purely alphabetic.

    Args:
        text: The raw question. Anything that is not a `str` (for example the
            float NaN pandas uses for an empty CSV cell) is treated as an
            empty question.

    Returns:
        The cleaned question as a single string; '' when nothing survives.
    """
    # Guard first: passing a non-string (NaN/None) to the pipeline would raise.
    if not isinstance(text, str):
        return ''

    doc = sp(text)
    cleaned = []
    for token in doc:
        word = token.lemma_.lower().translate(table)
        # Drop single-character leftovers (hanging 's', 'a', ...) and any
        # token that still contains digits or other non-letters.
        if len(word) > 1 and word.isalpha():
            cleaned.append(word)
    return ' '.join(cleaned)

In [None]:
# Clean both question columns in place with clean_question.
for column in ("question1", "question2"):
    data[column] = data[column].apply(clean_question)

In [None]:
# Preview the first rows of the DataFrame.
data.head()

## Feature Extraction

In [None]:
# Fit one tokenizer on the questions of BOTH columns so they share a vocabulary.
# The two columns are chained into a single list of texts: adding the two
# arrays with `+` would instead join each (question1, question2) pair
# element-wise with no separator, fusing the last word of one question with
# the first word of the other.
tokenizer = Tokenizer()
all_questions = list(data["question1"].values) + list(data["question2"].values)
tokenizer.fit_on_texts(all_questions)
# One extra slot beyond the number of known words (the embedding matrix is
# indexed directly by the tokenizer's word indices).
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

In [None]:
# Convert every question to a sequence of word indices, padded/truncated to max_len.
max_len = 100
q1_texts_seq = pad_sequences(
    tokenizer.texts_to_sequences(data["question1"].values),
    maxlen=max_len,
)
q2_texts_seq = pad_sequences(
    tokenizer.texts_to_sequences(data["question2"].values),
    maxlen=max_len,
)

In [None]:
# Make sure the extraction directory exists (no error if it is already there).
os.makedirs('glove', exist_ok=True)

# Unpack the GloVe archive from the datasets directory into ./glove.
glove_dir = "datasets/"
from zipfile import ZipFile
with ZipFile(glove_dir+'glove.zip', 'r') as z:
    z.extractall("glove")

In [None]:
# Load the GloVe vectors into a dict mapping word -> float32 coefficient array.
embeddings_index = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join("glove/", 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
embedding_dim = 200

# Build a (vocab_size, embedding_dim) lookup table: row i holds the GloVe
# vector of the word whose tokenizer index is i. Rows for words missing from
# the GloVe index keep their initial all-zero value.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Stack the two padded question arrays along a new axis 1, so X[:, 0] is
# question 1 and X[:, 1] is question 2; y is the duplicate label column.
X = np.stack([q1_texts_seq, q2_texts_seq], axis=1)
y = data["is_duplicate"].values

In [None]:
# Hold out 20% of the pairs; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Separate the stacked features back into one array per question,
# for both the training and the test split.
q1_X_train, q2_X_train = X_train[:, 0], X_train[:, 1]
q1_X_test, q2_X_test = X_test[:, 0], X_test[:, 1]

In [None]:
## Define custom metrics
def f1_score(y_true, y_pred):
    """Batch-wise F1 score computed with Keras backend ops.

    Args:
        y_true: Ground-truth labels (assumed to be 0/1 — TODO confirm).
        y_pred: Predicted scores; values are clipped to [0, 1] and rounded
            before counting.

    Returns:
        A backend scalar: 2 * P * R / (P + R), with K.epsilon() added to every
        denominator.
    """
    # Counts of true positives, predicted positives and actual positives.
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))

    # No Python-level `if count == 0` guard: the counts are backend tensors,
    # and the epsilon terms below already make the result 0 when there are no
    # predicted or no actual positives (the numerator is 0 in both cases).

    # How many selected items are relevant?
    prec = true_pos / (pred_pos + K.epsilon())

    # How many relevant items are selected?
    rec = true_pos / (actual_pos + K.epsilon())

    return 2 * (prec * rec) / (prec + rec + K.epsilon())


def precision(y_true, y_pred):
    """Batch-wise precision computed with Keras backend ops.

    Args:
        y_true: Ground-truth labels (assumed to be 0/1 — TODO confirm).
        y_pred: Predicted scores; values are clipped to [0, 1] and rounded
            before counting.

    Returns:
        A backend scalar: true positives / (predicted positives + K.epsilon()).
    """
    # Counts of true positives and predicted positives.
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))

    # No Python-level `if pred_pos == 0` guard: pred_pos is a backend tensor,
    # and the epsilon in the denominator already yields 0 when nothing is
    # predicted positive (the numerator is 0 as well).
    return true_pos / (pred_pos + K.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall computed with Keras backend ops.

    Args:
        y_true: Ground-truth labels (assumed to be 0/1 — TODO confirm).
        y_pred: Predicted scores; the product with y_true is clipped to
            [0, 1] and rounded before counting.

    Returns:
        A backend scalar: true positives / (actual positives + K.epsilon()).
    """
    # Counts of true positives and actual positives.
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))

    # No Python-level `if actual_pos == 0` guard: actual_pos is a backend
    # tensor, and the epsilon in the denominator already yields 0 when the
    # batch has no positive labels (the numerator is 0 as well).
    return true_pos / (actual_pos + K.epsilon())

## Build the model
Paraphrase Detection using **Feature Fusion Network**

In [None]:
def create_model(input_shape, embeddings_dim, embeddings_matrix, vocab_size,
                 max_seq_length, trainable_embeddings, dropout, hidden_units):
    """Build and compile the two-input paraphrase classifier.

    Each question goes through its own branch: an Embedding layer initialized
    from `embeddings_matrix`, a per-token Dense(relu) projection, and a sum
    over the sequence axis. The two branches use separate layer instances
    (weights are not shared between them). The two question vectors are
    concatenated and passed through four Dense(relu) + Dropout stages and a
    single sigmoid output unit.

    Args:
        input_shape: Shape passed to both `Input` layers.
        embeddings_dim: Embedding size; also the width of the per-token Dense.
        embeddings_matrix: Initial weights for both Embedding layers.
        vocab_size: Number of rows of the embedding table.
        max_seq_length: `input_length` given to the Embedding layers.
        trainable_embeddings: Whether the embedding weights are updated.
        dropout: Rate used by every Dropout layer.
        hidden_units: Width of each hidden Dense layer.

    Returns:
        The model, compiled with Adam, binary cross-entropy and the
        accuracy / precision / recall / f1_score metrics.
    """

    def encode_question(question_input):
        # Embedding -> per-token Dense -> sum over axis 1 (the sequence axis).
        embedded = Embedding(
            vocab_size,
            embeddings_dim,
            weights=[embeddings_matrix],
            input_length=max_seq_length,
            trainable=trainable_embeddings,
        )(question_input)
        projected = TimeDistributed(Dense(embeddings_dim, activation='relu'))(embedded)
        return Lambda(lambda x: K.sum(x, axis=1), output_shape=(embeddings_dim, ))(projected)

    X1_input = Input(input_shape, name="input_X1")
    X2_input = Input(input_shape, name="input_X2")

    q1_vector = encode_question(X1_input)
    q2_vector = encode_question(X2_input)

    # Fuse the two question vectors and classify with a stack of dense layers.
    hidden = concatenate([q1_vector, q2_vector])
    for _ in range(4):
        hidden = Dense(hidden_units, activation="relu")(hidden)
        hidden = Dropout(dropout)(hidden)
    prediction = Dense(1, activation="sigmoid", name="output")(hidden)

    model = Model(inputs=[X1_input, X2_input], outputs=prediction, name="GRN_model")
    model.compile(
        optimizer=optimizers.Adam(),
        loss="binary_crossentropy",
        metrics=['accuracy', precision, recall, f1_score],
    )
    return model

In [None]:
# Hyper-parameters of the network.
dropout = 0.2
trainable_embeddings = False
hidden_units = 200
input_shape = (max_len,)

# Build the model (arguments spelled out by name) and print its layer summary.
model = create_model(
    input_shape=input_shape,
    embeddings_dim=embedding_dim,
    embeddings_matrix=embedding_matrix,
    vocab_size=vocab_size,
    max_seq_length=max_len,
    trainable_embeddings=trainable_embeddings,
    dropout=dropout,
    hidden_units=hidden_units,
)
model.summary()

In [None]:
# Render the model graph, including layer shapes, to model.png.
plot_model(model, to_file='model.png', show_shapes=True)

## Train the model

In [None]:
# Checkpoint: save the model to `filepath` whenever the validation loss improves.
filepath = project_path+'model_paraprase_detection_pad_FFN.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Learning-rate schedule: multiply the rate by 0.2 when val_loss has not
# improved for one epoch. min_lr has to be below the optimizer's starting
# rate for this to do anything: create_model uses Adam() with default
# settings (0.001 per the Keras docs), so the previous floor of 0.001 left no
# room for any reduction.
reduce_alpha = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=1e-5)
# Early stopping: end training once val_loss has not improved for 2 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
callbacks = [checkpoint, es, reduce_alpha]

In [None]:
epochs = 100
batch_size = 64

# Training inputs are the two question arrays, in the order of the model inputs.
train_inputs = [q1_X_train, q2_X_train]
# NOTE(review): the held-out test split is also used as the validation set that
# drives checkpointing and early stopping — confirm this is intended.
validation_data = ([q1_X_test, q2_X_test], y_test)

history = model.fit(
    x=train_inputs,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=validation_data,
    callbacks=callbacks,
)

## Training Summary

In [None]:
# Show which series were recorded during training (the keys of history.history).
print("All data in history: ", history.history.keys())

In [None]:
# Plot training vs. validation accuracy per epoch and save the figure.
# Keras records the accuracy series as 'accuracy' or as 'acc' depending on
# the version, so use whichever key is present instead of hard-coding one.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
fig = plt.figure()
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
fig.savefig('model_accuracy.png')

In [None]:
# Plot training vs. validation loss per epoch and save the figure.
fig = plt.figure()
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
fig.savefig('model_loss.png')

In [None]:
# Plot training vs. validation precision per epoch and save the figure.
fig = plt.figure()
for series in ('precision', 'val_precision'):
    plt.plot(history.history[series])
plt.title('model precision')
plt.ylabel('precision')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
fig.savefig('model_precision.png')

In [None]:
# Plot training vs. validation recall per epoch and save the figure.
fig = plt.figure()
for series in ('recall', 'val_recall'):
    plt.plot(history.history[series])
plt.title('model recall')
plt.ylabel('recall')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
fig.savefig('model_recall.png')

In [None]:
# Plot training vs. validation F1 score per epoch and save the figure.
fig = plt.figure()
for series in ('f1_score', 'val_f1_score'):
    plt.plot(history.history[series])
plt.title('model f1_score')
plt.ylabel('f1_score')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
fig.savefig('model_f1_score.png')

## Test the model

In [None]:
# Evaluate on the held-out split and report each compiled metric.
# The results are stored under test_* names: unpacking into `precision`,
# `recall` and `f1_score` would overwrite the metric functions of the same
# names with floats, breaking any later call to create_model().
print('Testing Data Metrics:')
test_loss, test_accuracy, test_precision, test_recall, test_f1 = model.evaluate([q1_X_test, q2_X_test], y_test)
print('')
print('loss      = {0:.4f}'.format(test_loss))
print('accuracy  = {0:.4f}'.format(test_accuracy))
print('precision = {0:.4f}'.format(test_precision))
print('recall    = {0:.4f}'.format(test_recall))
print('F1         = {0:.4f}'.format(test_f1))