In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping



In [24]:
df = pd.read_csv('/kaggle/input/quora-question-pairs/train.csv.zip')

df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [25]:
# df = pd.read_csv('final Quora Question Pairs.csv')


In [26]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [4]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404282,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404283,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404284,404287,537928,537929,What is one coin?,What's this coin?,0
404285,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [27]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['question1'] + df['question2'])
vocab_size = len(tokenizer.word_index) + 1


In [7]:
import pickle
# saving
with open('Tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
vocab_size

96493

In [29]:
sequences1 = tokenizer.texts_to_sequences(df['question1'])
sequences2 = tokenizer.texts_to_sequences(df['question2'])

In [30]:
max_sequence_length = 50
padded_sequences1 = pad_sequences(sequences1, maxlen=max_sequence_length)
padded_sequences2 = pad_sequences(sequences2, maxlen=max_sequence_length)

In [31]:
X = np.hstack([padded_sequences1, padded_sequences2])
y = df['is_duplicate'].values

In [32]:
# pip install --upgrade tensorflow


In [34]:
embedding_dim = 50
lstm_units = 100

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length * 2))
model.add(SpatialDropout1D(0.2))  # Spatial Dropout for sequences
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))  # Regular Dropout layer
model.add(Dense(units=1, activation='sigmoid'))


In [35]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [36]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           4824650   
                                                                 
 spatial_dropout1d (Spatial  (None, 100, 50)           0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  (None, 100, 200)          120800    
 al)                                                             
                                                                 
 global_max_pooling1d (Glob  (None, 200)               0         
 alMaxPooling1D)                                                 
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                        

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, callbacks=[early_stopping])


In [None]:
model.save('Quora_question_pairs.h5')

In [None]:
# Assuming you have a new set of questions for which you want to predict duplicates
new_question1 = ["who is the prime minister of Bangladesh?"]
new_question2 = ["who is the prime minister of Pakistan?"]

# Tokenize and pad the new sequences
new_sequences1 = tokenizer.texts_to_sequences(new_question1)
new_sequences2 = tokenizer.texts_to_sequences(new_question2)

new_padded_sequences1 = pad_sequences(new_sequences1, maxlen=max_sequence_length)
new_padded_sequences2 = pad_sequences(new_sequences2, maxlen=max_sequence_length)

# Concatenate the sequences
new_X = np.hstack([new_padded_sequences1, new_padded_sequences2])

# Make predictions
predictions = model.predict(new_X)

# The predictions will be probabilities, you may want to threshold them to get binary results
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)

print("Predictions:", binary_predictions.flatten())


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions on the test set
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
