In [1]:
!pip install contractions
import warnings
warnings.filterwarnings('ignore')

import gc
import re
import numpy as np
import pandas as pd
import unicodedata
import contractions
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, LSTM
from tensorflow.keras.layers import concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K



In [2]:
train_path = '/kaggle/input/quora-question-pairs/train.csv.zip'
test_path = '/kaggle/input/quora-question-pairs/test.csv'
embeddings_path = '/kaggle/input/glove-840b-300d/glove.840B.300d.txt'

In [3]:
embeddings_dict = {}
embedding_size = 300

with open(embeddings_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = ''.join(values[:-embedding_size])   
        embeddings_dict[word] = np.asarray(values[-embedding_size:], dtype='float32')

print(f'{len(embeddings_dict)} embedding vectors are read successfully')

2195892 embedding vectors are read successfully


In [28]:
train_df = pd.read_csv(train_path, encoding='utf-8')
train_df = train_df.fillna('Empty')
train_labels = train_df.is_duplicate.values

In [5]:
test_df = pd.read_csv(test_path, encoding='utf-8')
test_df = test_df.fillna('Empty')
test_ids = test_df.test_id.values

#  Pre prosessing AND Feature Extraction  

In [6]:
def text_preprocess(text, remove_stopwords=False, stem_words=False):
    text = text.lower().split()

    pattern = re.compile('<.*?>')
    text = pattern.sub(r'', str(text))
    
    pattern = re.compile(r'https?://\S+|www\.\S+')
    text = pattern.sub(r'', str(text))
    
    text = text.translate(text.maketrans("\n\t\r", "   "))
    
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore') \
                                              .decode('utf-8', 'ignore')
    text = text.strip()
    
    text = contractions.fix(text)
    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    text = special_char_pattern.sub(" \\1 ", text)    
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text).strip()
    
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        text = [w for w in text if not w in stop_words]
    
    text = " ".join(text)
    
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    return(text)


In [7]:
train_preprocessed_q1 = [] 
train_preprocessed_q2 = []

for text in train_df.question1.values:
    train_preprocessed_q1.append(text_preprocess(text, remove_stopwords=False, stem_words=False))
    
for text in train_df.question2.values:
    train_preprocessed_q2.append(text_preprocess(text, remove_stopwords=False, stem_words=False))

In [8]:
test_preprocessed_q1 = []
test_preprocessed_q2 = []

for text in test_df.question1.values:
    test_preprocessed_q1.append(text_preprocess(text, remove_stopwords=False, stem_words=False))
    
for text in test_df.question2.values:
    test_preprocessed_q2.append(text_preprocess(text, remove_stopwords=False, stem_words=False))

In [9]:
max_tokens = 200000
tokenizer = Tokenizer(num_words=max_tokens)
tokenizer.fit_on_texts(train_preprocessed_q1 + train_preprocessed_q2 + test_preprocessed_q1 + test_preprocessed_q2)

train_sequences_1 = tokenizer.texts_to_sequences(train_preprocessed_q1)
train_sequences_2 = tokenizer.texts_to_sequences(train_preprocessed_q2)
test_sequences_1 = tokenizer.texts_to_sequences(test_preprocessed_q1)
test_sequences_2 = tokenizer.texts_to_sequences(test_preprocessed_q2)
word_index = tokenizer.word_index
print('{} unique tokens are found'.format(len(word_index)))

36 unique tokens are found


In [10]:
del train_preprocessed_q1
del train_preprocessed_q2
del test_preprocessed_q1
del test_preprocessed_q2
gc.collect()

0

In [11]:
max_seq_len = 60
train_embeddings_q1 = pad_sequences(train_sequences_1, maxlen=max_seq_len)
train_embeddings_q2 = pad_sequences(train_sequences_2, maxlen=max_seq_len)
print('Shape of train embeddings: ', train_embeddings_q1.shape)
print('Shape of train labels: ', train_labels.shape)

test_embeddings_q1 = pad_sequences(test_sequences_1, maxlen=max_seq_len)
test_embeddings_q2 = pad_sequences(test_sequences_2, maxlen=max_seq_len)
print('Shape of test embeddings: ', test_embeddings_q2.shape)
print('Shape of test ids: ', test_ids.shape)

Shape of train embeddings:  (404290, 60)
Shape of train labels:  (404290,)
Shape of test embeddings:  (2345796, 60)
Shape of test ids:  (2345796,)


In [12]:
del train_sequences_1
del train_sequences_2
del test_sequences_1
del test_sequences_2
gc.collect()

0

In [13]:
# graph features
questions = pd.concat([train_df[['question1', 'question2']],
                       test_df[['question1', 'question2']]],
                       axis=0).reset_index(drop='index')

q_adj_list = defaultdict(set)
for i in range(questions.shape[0]):
    q_adj_list[questions.question1[i]].add(questions.question2[i])
    q_adj_list[questions.question2[i]].add(questions.question1[i])

In [14]:
def q1_freq(row):
    """Number of adjacent nodes."""
    return (len(q_adj_list[row['question1']]))
    
def q2_freq(row):
    """Number of adjacent nodes."""
    return (len(q_adj_list[row['question2']]))
    
def q1_q2_intersect(row):
    """Number of adjacent nodes common to q1 and q2."""
    return (len(set(q_adj_list[row['question1']]).intersection(set(q_adj_list[row['question2']]))))

In [15]:
len(q_adj_list["Which one dissolve in water quikly sugar, salt... "])

0

In [35]:
train_df['q1_q2_intersect'] = train_df.apply(q1_q2_intersect, axis=1)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1)

test_df['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1)
test_df['q1_freq'] = test_df.apply(q1_freq, axis=1)
test_df['q2_freq'] = test_df.apply(q2_freq, axis=1)

train_graph_feat = train_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']].copy()
test_graph_feat = test_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']].copy()

In [17]:
# del train_df
# del test_df
# gc.collect()

0

In [33]:
# # 1) length of the bothe question
# train_df['q1_len'] = train_df['question1'].str.len()
# train_df['q2_len'] = train_df['question2'].str.len()


# 2) Add number of words columns 

train_df['q1_words'] = train_df['question1'].apply(lambda row: len(row.split(" ")))
train_df['q2_words'] = train_df['question2'].apply(lambda row: len(row.split(" ")))


# 3) Number of same words in the pair of questions
def common_words(row):
    w1 = set(map(lambda word : word.lower().strip(),row['question1'].split(" ")))
    w2 = set(map(lambda word : word.lower().strip(),row['question2'].split(" ")))
    return len(w1 & w2)

train_df['Common_words'] = train_df.apply(common_words,axis=1)
train_df.head()

# 4)total Number of  words in the all of 2 questions

def word_total(row):
    q1 = set(map(lambda word : word.lower().strip(),row['question1'].split(" ")))
    q2 = set(map(lambda word : word.lower().strip(),row['question2'].split(" ")))

    return len(q1)+len(q2)
train_df['total_words'] = train_df.apply(word_total,axis=1)


# for word share
def word_share(row):
    return round(row['Common_words']/row['total_words'],2)
train_df['word_share'] = train_df.apply(word_share,axis=1)

# ML model

In [36]:
x = train_df[[ 'q1_q2_intersect','q1_freq','q2_freq', 'q1_words','q2_words',  'word_share']]

In [37]:
y=train_df.is_duplicate

In [38]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=41
                                                    )

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_leaf=5)
rf.fit(x_train,y_train)

y_pred = rf.predict(x_test)

In [40]:

# Make predictions on the training data
y_train_pred = rf.predict(x_train)

# Calculate the accuracy
accuracy = accuracy_score(y_train, y_train_pred)
accuracy

0.8642505371501141

In [41]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.86      0.89     90575
           1       0.74      0.85      0.79     42841

    accuracy                           0.86    133416
   macro avg       0.83      0.86      0.84    133416
weighted avg       0.87      0.86      0.86    133416



# lstm

In [19]:
num_words = min(max_tokens, len(word_index))+1
embedding_matrix = np.zeros((num_words, embedding_size))

for word, i in word_index.items():
    embedding_vector = embeddings_dict.get(word)
    
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [20]:
del embeddings_dict
gc.collect()

488

In [21]:
scaler = StandardScaler()
scaler.fit(np.vstack((train_graph_feat, test_graph_feat)))
train_graph_feat = scaler.transform(train_graph_feat)
test_graph_feat = scaler.transform(test_graph_feat)

In [22]:
val_ratio = 0.1
perm = np.random.permutation(len(train_embeddings_q1))
idx_train = perm[:int(len(train_embeddings_q1)*(1-val_ratio))]
idx_val = perm[int(len(train_embeddings_q1)*(1-val_ratio)):]

data_1_train = np.vstack((train_embeddings_q1[idx_train], train_embeddings_q2[idx_train]))
data_2_train = np.vstack((train_embeddings_q2[idx_train], train_embeddings_q1[idx_train]))
graph_train = np.vstack((train_graph_feat[idx_train], train_graph_feat[idx_train]))
labels_train = np.concatenate((train_labels[idx_train], train_labels[idx_train]))

data_1_val = np.vstack((train_embeddings_q1[idx_val], train_embeddings_q2[idx_val]))
data_2_val = np.vstack((train_embeddings_q2[idx_val], train_embeddings_q1[idx_val]))
graph_val = np.vstack((train_graph_feat[idx_val], train_graph_feat[idx_val]))
labels_val = np.concatenate((train_labels[idx_val], train_labels[idx_val]))

In [23]:
weight_val = np.ones(len(labels_val))
weight_val *= 0.471544715
weight_val[labels_val == 0] = 1.309033281

In [24]:
emb_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_size,
    weights=[embedding_matrix],
    input_length=max_seq_len,
    trainable=False
)    

lstm_layer = LSTM(200, dropout=0.15, recurrent_dropout=0.15)

seq1 = Input(shape=(max_seq_len,), dtype='int32')
seq2 = Input(shape=(max_seq_len,), dtype='int32')

emb1 = emb_layer(seq1)
emb2 = emb_layer(seq2)

lstm_a = lstm_layer(emb1)
lstm_b = lstm_layer(emb2)

graph_inp = Input(shape=(train_graph_feat.shape[1],))
graph_dense = Dense(75, activation='relu')(graph_inp)

merged = concatenate([lstm_a, lstm_b, graph_dense])
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)

merged = Dense(150, activation='relu')(merged)
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)

preds = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[seq1, seq2, graph_inp], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

In [25]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
best_model_path = 'lstm.h5' 
class_weight = {0: 1.309033281, 1: 0.471544715}

model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, graph_train], labels_train, \
        validation_data=([data_1_val, data_2_val, graph_val], labels_val, weight_val), \
        epochs=15, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [26]:
model.load_weights(best_model_path)
best_val_score = min(hist.history['val_loss'])
best_val_acc = max(hist.history['val_acc'])
print(best_val_score, best_val_acc)

0.21095313131809235 0.8736426830291748


In [27]:
predictions = model.predict([test_embeddings_q1, test_embeddings_q2, test_graph_feat], batch_size=2048, verbose=1)
predictions += model.predict([test_embeddings_q2, test_embeddings_q1, test_graph_feat], batch_size=2048, verbose=1)
predictions /= 2



# DL model

In [29]:
dtrain=np.hstack((train_embeddings_q1,train_embeddings_q2))
dtrain=pad_sequences(dtrain,maxlen=116)

print("dtrain" ,dtrain.shape)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dtrain,
                                                    train_labels,
                                                    test_size=0.33,
                                                    random_state=41
                                                    )

T=dtrain.shape[1]
word2idx=tokenizer.word_index
V=len(word2idx)
D=20

dtrain (404290, 116)


In [32]:
from keras.models import Model
from keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense, Activation, BatchNormalization, Input, Embedding, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
from keras import regularizers

# Define regularization and early stopping parameters
l2_value = 0.02  # L2 regularization strength
dropout_rate = 0.5  # Dropout rate
patience = 3  # Number of epochs with no improvement before stopping

# Model architecture
i = Input(shape=(T,))
x = Embedding(V+1, D)(i)
x = Conv1D(32, 3, activation='relu', kernel_regularizer=regularizers.l2(l2_value))(x)
x = MaxPooling1D(4)(x)
x = Dropout(dropout_rate)(x)
x = Conv1D(64, 3, activation='relu', kernel_regularizer=regularizers.l2(l2_value))(x)
x = MaxPooling1D(4)(x)
x = Dropout(dropout_rate)(x)
x = Conv1D(128, 3, activation='relu', kernel_regularizer=regularizers.l2(l2_value))(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(l2_value))(x)

model = Model(i, x)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=patience, verbose=1)

# Train the model with early stopping
model.fit(x_train, y_train,
          epochs=100,
          validation_data=(x_test, y_test),
          callbacks=[early_stopping])

# Print model summary
model.summary()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: early stopping
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 116)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 116, 20)           740       
                                                                 
 conv1d (Conv1D)             (None, 114, 32)           1952      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 28, 32)           0         
 )                                                               
                                                                 
 dropout_2 (Dropout)         (None, 28, 32)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 26, 6