In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training CSV straight from the zipped file shipped with the dataset.
train_csv_path = '../input/quora-question-pairs/train.csv.zip'
df = pd.read_csv(train_csv_path)

In [None]:
# Labels: every column from position 5 onward, kept as a 2-D array.
# NOTE(review): presumably this is the single duplicate-flag column - confirm
# against the CSV header.
Y_train = df.iloc[:,5:].values

MAX_NB_WORDS = 200000  # vocabulary cap handed to the Tokenizer
MAX_SEQ_LEN = 30       # every question is padded/truncated to this many tokens

# Convert each question column to plain strings exactly once.
# Missing questions are replaced with '' first: a bare astype(str) would turn
# NaN into the literal word "nan", which would then be tokenized like real text.
q1_texts = df['question1'].fillna('').astype(str).tolist()
q2_texts = df['question2'].fillna('').astype(str).tolist()

# One tokenizer fitted on both columns so the two questions share a word index.
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(q1_texts + q2_texts)

# Integer-encode each question and pad (at the end) to a fixed length.
X_train_q1 = tokenizer.texts_to_sequences(q1_texts)
X_train_q1 = pad_sequences(X_train_q1, maxlen = MAX_SEQ_LEN, padding='post')

X_train_q2 = tokenizer.texts_to_sequences(q2_texts)
X_train_q2 = pad_sequences(X_train_q2, maxlen = MAX_SEQ_LEN, padding='post')

# word -> integer id mapping, used later to build the embedding matrix.
word_index = tokenizer.word_index

In [None]:
# Parse the GloVe text file into a {token: float32 vector} lookup.
# Each non-empty line is "<token> <v1> <v2> ..." separated by whitespace.
embedding_index = {}
# The encoding is given explicitly so decoding does not depend on the platform
# default (NOTE(review): the file is presumably UTF-8 - confirm).
# The `with` block closes the file, so no manual close() is needed.
with open('../input/glove6b200d/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vectors

In [None]:
# One 200-wide row per tokenizer id (plus row 0, which no word maps to).
# Rows start as uniform random values and are overwritten with the pretrained
# vector whenever the word is present in embedding_index.
embedding_matrix = np.random.random((len(word_index)+1, 200))
for token, row in word_index.items():
    if token in embedding_index:
        embedding_matrix[row] = embedding_index[token]

In [None]:
# Model for Q1
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
# Encoder for the first question: embedding lookup (initialised from
# embedding_matrix), two stacked LSTMs that both return the full sequence,
# then two Dense layers applied on top of the sequence output.
model_q1 = tf.keras.Sequential([
    Embedding(
        input_dim = len(word_index)+1,
        output_dim = 200,
        weights = [embedding_matrix],
        input_length = 30,
    ),
    LSTM(128, activation = 'tanh', return_sequences = True),
    LSTM(128, activation = 'tanh', return_sequences = True),
    Dense(32, activation = 'relu'),
    Dense(8, activation = 'sigmoid'),
])


In [None]:
# Encoder for the second question: same layer stack as the first-question
# encoder, but built as a separate model with its own layers.
model_q2 = tf.keras.Sequential([
    Embedding(
        input_dim = len(word_index)+1,
        output_dim = 200,
        weights = [embedding_matrix],
        input_length = 30,
    ),
    LSTM(128, activation = 'tanh', return_sequences = True),
    LSTM(128, activation = 'tanh', return_sequences = True),
    Dense(32, activation = 'relu'),
    Dense(8, activation = 'sigmoid'),
])


In [None]:
# Combine the two encoders by element-wise multiplication of their outputs,
# then pass the result through a small classification head that ends in a
# single sigmoid unit.
mergedOut = Multiply()([model_q1.output, model_q2.output])
head_layers = (
    Flatten(),
    Dense(128, activation = 'relu'),
    Dropout(0.4),
    Dense(8, activation = 'relu'),
    Dense(1, activation = 'sigmoid'),
)
for head_layer in head_layers:
    mergedOut = head_layer(mergedOut)

# Two-input model: one padded sequence per question, one probability out.
new_model = Model(inputs = [model_q1.input, model_q2.input], outputs = mergedOut)
new_model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Print the summary of the combined two-input model built above.
new_model.summary()

In [None]:
# Hold out the first 40000 rows of each array as the test set and keep the
# remainder for training. The training names are rebound to their own slices,
# so running this cell a second time would carve off another 40000 rows.
X_test_q1, X_train_q1 = X_train_q1[:40000], X_train_q1[40000:]
X_test_q2, X_train_q2 = X_train_q2[:40000], X_train_q2[40000:]
Y_test, Y_train = Y_train[:40000], Y_train[40000:]

# Shapes of what is left for training.
for remaining in (X_train_q2, X_train_q1, Y_train):
    print(remaining.shape)

In [None]:
# Take the first column of every held-out label row as a 1-D array.
# A vectorized column slice replaces the per-row Python loop; np.array makes
# an independent copy, as the original list-based construction did.
y_test = np.array(Y_test[:, 0])
print(y_test)

In [None]:
# Train the combined model for 10 epochs in batches of 1024.
# The held-out arrays are passed as validation data; the same arrays are used
# again for the final evaluation further down.
history = new_model.fit(
    x = [X_train_q1, X_train_q2],
    y = Y_train,
    batch_size = 1024,
    epochs = 10,
    validation_data = ([X_test_q1, X_test_q2], y_test),
)

In [None]:
# Show the shapes of the held-out arrays, then report accuracy on them.
for held_out in (X_test_q2, X_test_q1, y_test):
    print(held_out.shape)

# evaluate() returns the loss followed by the compiled metrics; only the
# accuracy is reported here.
_, accuracy = new_model.evaluate(x = [X_test_q1, X_test_q2], y = y_test)
print('Accuracy: %.2f' % (100 * accuracy))

In [None]:
import sklearn.metrics as metrics
# Score the held-out question pairs.
# predict() runs the data through the model in mini-batches and returns a
# NumPy array; calling the model object directly would push every held-out row
# through in a single forward pass, which can exhaust memory.
# verbose=0 keeps the cell output limited to the two metric lines below.
y_pred = new_model.predict([X_test_q1, X_test_q2], batch_size = 1024, verbose = 0)

# The model ends in a single sigmoid unit, so column 0 holds the probability.
y_pred = np.array(y_pred[:, 0])

# Threshold at 0.5 to get hard 0/1 labels (vectorized instead of a Python loop).
test = (y_pred >= 0.5).astype(int)

print('Accuracy score for test: ', metrics.accuracy_score(y_test,test))
print('F1 score for test: ', metrics.f1_score(y_test, test))
