In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore")

In [39]:
df = pd.read_csv('train_data.csv')
df.sample(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
227578,227578,336474,336475,Can ground state and noble gas configuration b...,How is ground state and noble gas configuratio...,1
197602,197602,260456,67638,How do I increase my follower on Quora?,How can I increase my followers on Quora?,1
308001,308001,431778,431779,How do reptiles mate?,Reptiles: Why does this lizard have two tails?,0


In [40]:
df.shape

(404290, 6)

In [41]:
new_df = df.sample(50000,random_state=2)

In [42]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
      "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
     "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
     "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
     # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q
    

In [43]:
new_df['question1'] = new_df['question1'].apply(preprocess)
new_df['question2'] = new_df['question2'].apply(preprocess)

In [44]:
new_df['q1_len'] = new_df['question1'].str.len() 
new_df['q2_len'] = new_df['question2'].str.len()
new_df['q1_num_words'] = new_df['question1'].apply(lambda row: len(row.split(" ")))
new_df['q2_num_words'] = new_df['question2'].apply(lambda row: len(row.split(" ")))

In [45]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return len(w1 & w2)
new_df['word_common'] = new_df.apply(common_words, axis=1)

def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return (len(w1) + len(w2))
new_df['word_total'] = new_df.apply(total_words, axis=1)

new_df['word_share'] = round(new_df['word_common']/new_df['word_total'],2)

#### Advanced features

In [46]:
from nltk.corpus import stopwords

def fetch_token_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

token_features = new_df.apply(fetch_token_features, axis=1)

new_df["cwc_min"]       = list(map(lambda x: x[0], token_features))
new_df["cwc_max"]       = list(map(lambda x: x[1], token_features))
new_df["csc_min"]       = list(map(lambda x: x[2], token_features))
new_df["csc_max"]       = list(map(lambda x: x[3], token_features))
new_df["ctc_min"]       = list(map(lambda x: x[4], token_features))
new_df["ctc_max"]       = list(map(lambda x: x[5], token_features))
new_df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
new_df["first_word_eq"] = list(map(lambda x: x[7], token_features))

In [47]:
import nltk
from nltk.metrics import edit_distance

def fetch_length_features(row):
    q1 = row['question1']
    q2 = row['question2']
    
    length_features = [0.0] * 3
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features
    
    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    
    # Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2
    
    # Longest common substring using edit distance
    lcs_length = len(q1) + len(q2) - 2 * edit_distance(q1, q2)
    length_features[2] = lcs_length / (min(len(q1), len(q2)) + 1)
    
    return length_features

length_features = new_df.apply(fetch_length_features, axis=1)

new_df['abs_len_diff'] = list(map(lambda x: x[0], length_features))
new_df['mean_len'] = list(map(lambda x: x[1], length_features))
new_df['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))


In [48]:
# Fuzzy Features
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    fuzzy_features = [0.0]*4
    
    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features

In [49]:
fuzzy_features = new_df.apply(fetch_fuzzy_features, axis=1)

# Creating new feature columns for fuzzy features
new_df['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
new_df['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))
new_df['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))
new_df['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))

In [50]:
print(new_df.shape)
new_df.head(2)

(50000, 28)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
398782,398782,496695,532029,what is the best marketing automation tool for...,what is the best marketing automation tool for...,1,75,76,13,13,...,0.92307,1.0,1.0,0.0,13.0,1.960526,99,99,99,99
115086,115086,187729,187730,i am poor but i want to invest what should i do,i am quite poor and i want to be very rich wh...,0,48,56,13,16,...,0.466664,1.0,1.0,3.0,13.5,1.142857,69,67,65,74


In [51]:
from sklearn.preprocessing import MinMaxScaler

X = MinMaxScaler().fit_transform(new_df[['cwc_min', 'cwc_max', 'csc_min', 'csc_max' , 'ctc_min' , 'ctc_max' , 'last_word_eq', 'first_word_eq' , 'abs_len_diff' , 'mean_len' , 'token_set_ratio' , 'token_sort_ratio' ,  'fuzz_ratio' , 'fuzz_partial_ratio' , 'longest_substr_ratio']])
y = new_df['is_duplicate'].values

In [52]:
ques_df = new_df[['question1','question2']]
ques_df.sample(5)

Unnamed: 0,question1,question2
264303,what is the best social media strategy for bus...,what is the best social media marketing for bu...
256335,how can one stroke adjustments in a crank and ...,what are differences between crank and slotted...
325920,why are not using star delta connections belo...,what is a star delta starter
393882,how can i lose weight loss,how can i lose my weight from 55 kg to 50 kg w...
77816,how do astrologers make money,is mary the astrologer real


In [53]:
final_df = new_df.drop(columns=['id','qid1','qid2','question1','question2'])
print(final_df.shape)

(50000, 23)


In [54]:
from gensim.models import Word2Vec
# merge texts
questions = list(ques_df['question1']) + list(ques_df['question2'])
tokenized_questions = [q.split() for q in questions]
#training skip-gram model 
w2v_model = Word2Vec(tokenized_questions, vector_size=100, window=5, min_count=1,sg=1) 

In [55]:
def question_to_vec(question, model):
    """Convert a question to a Word2Vec vector by averaging the word vectors of the words in the question."""
    words = question.split()
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if len(word_vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vecs, axis=0)

# Convert questions to vectors
q1_vecs = ques_df['question1'].apply(lambda x: question_to_vec(x, w2v_model)).tolist()
q2_vecs = ques_df['question2'].apply(lambda x: question_to_vec(x, w2v_model)).tolist()

# Convert lists to numpy arrays
q1_arr = np.array(q1_vecs)
q2_arr = np.array(q2_vecs)


In [56]:
temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(50000, 200)

In [57]:
new_final_df = new_df.drop(columns=['id','qid1','qid2','question1','question2'])
print(new_final_df.shape)

(50000, 23)


In [58]:
new_final_df = pd.concat([new_final_df, temp_df], axis=1)
print(new_final_df.shape)
new_final_df.head(2)

(50000, 223)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,...,90,91,92,93,94,95,96,97,98,99
398782,1,75,76,13,13,12,26,0.46,0.874989,0.874989,...,0.429559,0.133027,0.084331,0.155802,0.616565,0.262933,-0.091163,-0.436433,-0.001523,0.143887
115086,0,48,56,13,16,8,24,0.33,0.666644,0.499988,...,0.298244,0.177807,-0.151539,-0.007481,0.739199,0.48144,0.144904,-0.289914,0.399156,-0.136223


In [82]:
new_final_df.iloc[:,1:24]

Unnamed: 0,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,csc_min,...,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio,0
398782,75,76,13,13,12,26,0.46,0.874989,0.874989,0.999980,...,1.0,1.0,0.0,13.0,1.960526,99,99,99,99,-0.048990
115086,48,56,13,16,8,24,0.33,0.666644,0.499988,0.714276,...,1.0,1.0,3.0,13.5,1.142857,69,67,65,74,-0.249889
327711,104,119,28,21,4,38,0.11,0.000000,0.000000,0.428565,...,0.0,0.0,6.0,23.0,0.390476,26,29,34,43,-0.129071
367788,58,145,14,32,1,34,0.03,0.000000,0.000000,0.000000,...,0.0,0.0,17.0,21.5,-0.288136,29,41,23,30,0.040026
151235,34,49,5,9,3,13,0.23,0.749981,0.599988,0.000000,...,1.0,0.0,4.0,7.0,1.000000,55,70,48,69,-0.030425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362916,57,25,10,5,2,15,0.13,0.333322,0.249994,0.499975,...,0.0,1.0,5.0,7.5,0.076923,46,56,44,61,0.039915
292183,29,45,7,11,3,17,0.18,0.499988,0.399992,0.333322,...,0.0,1.0,4.0,9.0,0.600000,43,55,57,67,-0.266411
360529,24,28,5,5,4,10,0.40,0.666644,0.666644,0.999950,...,0.0,1.0,0.0,5.0,1.600000,85,92,69,86,0.061085
327759,79,49,15,11,8,26,0.31,0.599988,0.428565,0.799984,...,0.0,1.0,4.0,12.0,0.720000,59,61,63,80,-0.083856


In [59]:
len(w2v_model.wv.key_to_index)

33445

In [60]:
w2v_model.vector_size

100

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate

# Vocabulary size and embedding dimension
vocab_size = len(w2v_model.wv.key_to_index) + 1  # Adding 1 for padding index
embedding_dim = 100

# Input layers
q1_input = Input(shape=(100,))  # Shape for Q1 word indices
q2_input = Input(shape=(100,))  # Shape for Q2 word indices
features_input = Input(shape=(22,))  # Shape for engineered features

# Embedding layer shared for both questions
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=100)
q1_embedded = embedding_layer(q1_input)
q2_embedded = embedding_layer(q2_input)

# LSTM layers
lstm_layer = LSTM(units=128, dropout=0.2, recurrent_dropout=0.2)
q1_lstm = lstm_layer(q1_embedded)
q2_lstm = lstm_layer(q2_embedded)

# Concatenate LSTM outputs and engineered features
concatenated = Concatenate()([q1_lstm, q2_lstm, features_input])

# Dense layers
dense_1 = Dense(units=128, activation='relu')(concatenated)
dense_2 = Dense(units=256, activation='relu')(dense_1)
dense_3 = Dense(units=128, activation='relu')(dense_2)

# Output layer
output = Dense(units=1, activation='sigmoid')(dense_3)

# Define the model
model = Model(inputs=[q1_input, q2_input, features_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Assuming `X` is your input data and `y` is your target
# Split your data accordingly
X_features = X[:, :22]
X_q1 = X[:, 22:123]  # First 100 columns
X_q2 = X[:, 123:]  # Next 100 columns
  # Last 22 columns

# Train the model
model.fit([X_q1, X_q2, X_features], y, epochs=5, batch_size=32, validation_split=0.2)


In [76]:
# Define the maximum sequence length (after padding)
max_seq_length = 100  # Set based on your data

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(w2v_model.wv.key_to_index)+1, output_dim=w2v_model.vector_size, input_length=222))
model.add(Bidirectional(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
from tensorflow.keras.optimizers import Adam
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

model.build(input_shape=(None, max_seq_length))

# Print model summary
print(model.summary())

None


In [62]:

X = new_final_df.iloc[:,1:].values
y = new_final_df.iloc[:,0].values

In [83]:
X[:, :22]

array([[ 75.,  76.,  13., ...,  99.,  99.,  99.],
       [ 48.,  56.,  13., ...,  67.,  65.,  74.],
       [104., 119.,  28., ...,  29.,  34.,  43.],
       ...,
       [ 24.,  28.,   5., ...,  92.,  69.,  86.],
       [ 79.,  49.,  15., ...,  61.,  63.,  80.],
       [ 38.,  49.,   8., ...,  66.,  60.,  76.]])

In [68]:
X.shape

(50000, 222)

In [75]:
y.shape

(50000,)

In [77]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)
# Fit the model
model.fit(X, y, epochs=20, batch_size=32, validation_split=0.2,callbacks=[early_stopping, reduce_lr])

Epoch 1/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 137ms/step - accuracy: 0.6560 - loss: 0.5852 - val_accuracy: 0.6870 - val_loss: 0.5317 - learning_rate: 0.0010
Epoch 2/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 137ms/step - accuracy: 0.6811 - loss: 0.5366 - val_accuracy: 0.6941 - val_loss: 0.5297 - learning_rate: 0.0010
Epoch 3/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 138ms/step - accuracy: 0.6937 - loss: 0.5251 - val_accuracy: 0.6858 - val_loss: 0.5298 - learning_rate: 0.0010
Epoch 4/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 134ms/step - accuracy: 0.6954 - loss: 0.5232 - val_accuracy: 0.7003 - val_loss: 0.5228 - learning_rate: 0.0010
Epoch 5/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 133ms/step - accuracy: 0.6971 - loss: 0.5219 - val_accuracy: 0.7040 - val_loss: 0.5178 - learning_rate: 0.0010
Epoch 6/20
[1m1250/1250[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x21fa9487090>