Importing the required packages

In [1]:
import tensorflow as tf

import tensorflow_hub as hub

import keras

from keras import backend as K

import numpy as np

import matplotlib.pyplot as plt

from keras.layers.experimental.preprocessing import TextVectorization

from keras.models import Sequential, Model

from keras.layers import Conv1D, MaxPooling1D, Dense, Activation, Input, LSTM, Embedding,Lambda, Bidirectional

from keras.layers.advanced_activations import LeakyReLU

import pandas as pd

import math

from sklearn.model_selection import train_test_split

Loading the dataset

In [2]:
!curl -LO https://raw.githubusercontent.com/MohamadMerchant/SNLI/master/data.tar.gz
!tar -xvzf data.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11.1M  100 11.1M    0     0  12.0M      0 --:--:-- --:--:-- --:--:-- 12.0M
SNLI_Corpus/
SNLI_Corpus/snli_1.0_dev.csv
SNLI_Corpus/snli_1.0_train.csv
SNLI_Corpus/snli_1.0_test.csv


In [2]:
df_train = pd.read_csv("SNLI_Corpus/snli_1.0_train.csv")
df_dev = pd.read_csv("SNLI_Corpus/snli_1.0_dev.csv")
df_test = pd.read_csv("SNLI_Corpus/snli_1.0_test.csv")

Preprocessing the Data:


~ Dropping rows missing values

~ Extracting one-hot vectors from the similarity column 

In [3]:
len(df_train)

550152

In [4]:
df_train.isnull().sum()

similarity    0
sentence1     0
sentence2     6
dtype: int64

In [5]:
df_train.dropna(axis=0, inplace=True)

In [6]:
df_train.similarity.value_counts()

entailment       183414
contradiction    183185
neutral          182762
-                   785
Name: similarity, dtype: int64

In [7]:
df_train = (df_train[df_train.similarity != "-"].sample(frac=1.0, random_state=42).reset_index(drop=True))

In [8]:
df_train["label"] = df_train["similarity"].apply(lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2)
y_train = tf.keras.utils.to_categorical(df_train.label, num_classes=3)

In [9]:
len(df_train)

549361

In [10]:
df_train.head()

Unnamed: 0,similarity,sentence1,sentence2,label
0,contradiction,A woman plays a violin outdoors.,The ball room dancer slipped on a banana peel.,0
1,entailment,The red panted cyclist is amongst nature.,The cyclist is outdoors.,1
2,contradiction,A bicyclist is doing a trick in midair.,The bicycle is slowly rolling down the straight.,0
3,entailment,Two motorcyclists racing neck and neck around ...,The two motorcyclists are racing each other.,1
4,contradiction,A middle-age man in black suit speaking into t...,a guy is dancing on a table,0


In [11]:
len(df_dev)

10000

In [12]:
df_dev.isnull().sum()

similarity    0
sentence1     0
sentence2     0
dtype: int64

In [13]:
df_dev.similarity.value_counts()

entailment       3329
contradiction    3278
neutral          3235
-                 158
Name: similarity, dtype: int64

In [14]:
df_dev = (df_dev[df_dev.similarity != "-"].sample(frac=1.0, random_state=42).reset_index(drop=True))

In [15]:
df_dev["label"] = df_dev["similarity"].apply(lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2)
y_dev = tf.keras.utils.to_categorical(df_dev.label, num_classes=3)

In [16]:
len(df_dev)

9842

In [17]:
df_dev.head()

Unnamed: 0,similarity,sentence1,sentence2,label
0,contradiction,A balding man with a checkered shirt and khaki...,A man run in playground.,0
1,contradiction,The Raiders complete the pass while the Dolphi...,The player is alone asleep in the bathtub.,0
2,contradiction,A lady sitting on a bench that is against a bu...,Nobody is sitting,0
3,contradiction,a man is swimming inside of a pool,there is a person drowning.,0
4,contradiction,A boy with a basketballs glowers at the camera.,The boy is smiling,0


In [18]:
len(df_test)

10000

In [19]:
df_test.isnull().sum()

similarity    0
sentence1     0
sentence2     0
dtype: int64

In [20]:
df_test.similarity.value_counts()

entailment       3368
contradiction    3237
neutral          3219
-                 176
Name: similarity, dtype: int64

In [21]:
df_test = (df_test[df_test.similarity != "-"].sample(frac=1.0, random_state=42).reset_index(drop=True))

In [22]:
df_test["label"] = df_test["similarity"].apply(lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2)
y_test = tf.keras.utils.to_categorical(df_test.label, num_classes=3)

In [23]:
len(df_test)

9824

In [24]:
df_test.head()

Unnamed: 0,similarity,sentence1,sentence2,label
0,entailment,"A person in sunglasses, a scarf, a shirt, and ...",A person standing,1
1,neutral,A person in a green robe sits on a couch with ...,a man sits on a couch,2
2,neutral,Three women are sitting on a green bench looki...,The three women are best friends.,2
3,contradiction,This is when daddy taught her how to ride her ...,the daddy is dead,0
4,contradiction,"There are five singers on a stage, three women...",The performers are playing bagpipes.,0


One can try to extract these similarity scores by using Pre-Trained models like BERT and Universal Sentence Encoder to extract sentence level embeddings and calculating a similarity metric, for eg the cosine similarity, and scaling it to the output range.


However, the drawback of this approach is that:

~ BERT can handle a maximum of 512 tokens in one text and if the input is larger than that it needs to be cropped in some way which might result in loss of information usefull for determining semantic meaning.

~ In case of Universal Sentence Encoder as the input text becomes larger the encoding becomes less representative and this again might result in loss of information important for determining semantic meaning.


Therefore, i demonstrate a supervised learning model (making use of pretrained GolVe embeddings, any other embedding of choice may be used) that can scale to any size of the paragraph. 

The model is in essence a CNN+BiLSTM Siamese Network, followed by calculation of L1 distance that is fed into a Dense layer for prediction.

Finding the maximum length of the input sentence/paragraph

In [25]:
MAX_LEN = 0
for sentence in list(df_train['sentence1']) + list(df_train['sentence2']):
  if len(sentence)>MAX_LEN:
    MAX_LEN = len(sentence)
print(MAX_LEN)

425


Generating the vocabulary for the model.
Defining a text vectorizer for tokenization, vectorization of input sentence.

In [26]:
vectorizer = TextVectorization(output_sequence_length = MAX_LEN)

text_ds = tf.data.Dataset.from_tensor_slices(list(df_train['sentence1']) + list(df_train['sentence2'])).batch(10)

vectorizer.adapt(text_ds)

In [27]:
#print length of vocabulary
len(vectorizer.get_vocabulary())

35972

Generating mapping from word to index of word in the vocabulary

In [28]:
vocabulary = vectorizer.get_vocabulary()

word_index = dict(zip(vocabulary, range(len(vocabulary))))

Downloading and extracting the Pre-Trained GloVe embeddings

In [70]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip -q glove.840B.300d.zip

--2021-06-05 10:23:24--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2021-06-05 10:23:24--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2021-06-05 10:23:24--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [29]:
path_to_glove_file = "glove.840B.300d.txt"

Generating a mapping from word to GloVe embedding of the word

In [None]:
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coef = coefs
        coefs = np.fromstring(coefs, "f", sep=" ")
        if coefs.shape == (0,) and word not in embeddings_index:
          for i in range(len(coef)):
            try:
              x = int(coef[i])
              break
            except:
              pass
          coefs = np.fromstring(coef[i:], "f", sep=" ")
        if coefs.shape != (0,):
          embeddings_index[word] = coefs

Preparing the embedding matrix (matrix[i] holds the embedding of i'th word in the vocabulary). Words in the vocabulary not found in the Pre-Trained GloVe embeddings are marked as all zeros.

In [None]:
num_tokens = len(vocabulary) + 2
embedding_dim = 300
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

Defining the Semantic Text Similarity extraction model

In [None]:
def make_STS_model():

  #Defining the inputs to the model
  input_left = Input(shape=(None,), dtype="int64")
  input_right = Input(shape=(None,), dtype="int64")

  embedding_layer = Embedding(num_tokens, embedding_dim, embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable=False)
  #Defining the siamese identical subnetwork

  #BiLSTM part
  lstm_sub_net = Sequential()
  lstm_sub_net(Input(shape=(None,)))
  lstm_sub_net.add(Bidirectional(LSTM(200,return_sequences=True), input_shape = (None,350)))
  lstm_sub_net.add(Bidirectional(LSTM(200)))

  #CNN part
  sub_net = Sequential()
  sub_net.add(Conv1D(50, 9, activation="relu", padding='same', input_shape = (None, 300)))

  #Printing the siamese subnetwork summary
  print("Siamese CNN subnetwork summary:")
  print(sub_net.summary())

  print("Siamese BiLSTM subnetwork summary:")
  print(lstm_sub_net.summary())

  #Extracting embeddings of each input
  input_encoded_left = embedding_layer(input_left)
  input_encoded_right = embedding_layer(input_right)

  #Extracting encoding of text1 and text2 from subnetwork
  left_encoding = sub_net(input_encoded_left)
  right_encoding = sub_net(input_encoded_right)
  left_encoding = lstm_sub_net(tf.concat([input_encoded_left, left_encoding], axis = 2))
  right_encoding = lstm_sub_net(tf.concat([input_encoded_right, right_encoding], axis = 2))
    
  #Defining the predictor network for combining the siamese subnetwork results
  L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))
  L1_distance = L1_layer([left_encoding, right_encoding])
  predictor = Sequential()
  predictor.add(Dense(3))
  predictor.add(Activation('softmax'))
  prediction = predictor(L1_distance)

  #encapsulating the whole model
  model = Model(inputs=[input_left,input_right], outputs=prediction)

  #returning the model
  return model

In [None]:
model = make_STS_model()

print("Model summary:")
print(model.summary())

Preprocessing the data (converting words to word indices) and preparing the data for feeding into the model

In [None]:
X_train_1 = vectorizer(np.array([[s] for s in list(df_train['sentence1'])])).numpy()
X_train_2 = vectorizer(np.array([[s] for s in list(df_train['sentence2'])])).numpy()
X_train = [X_train_1, X_train_2]

X_dev_1 = vectorizer(np.array([[s] for s in list(df_dev['sentence1'])])).numpy()
X_dev_2 = vectorizer(np.array([[s] for s in list(df_dev['sentence2'])])).numpy()
X_dev = [X_dev_1, X_dev_2]

X_test_1 = vectorizer(np.array([[s] for s in list(df_test['sentence1'])])).numpy()
X_test_2 = vectorizer(np.array([[s] for s in list(df_test['sentence2'])])).numpy()
X_test = [X_test_1, X_test_2]

Compiling the model (defining loss function, optimizer and performance metrics)

In [None]:
model.compile(loss="categorical_crossentropy", optimizer=keras.optimizers.Adamax(learning_rate=0.01), metrics=["acc"])

Defining helper functions for learning rate scheduling

In [None]:
def lr_scheduler(epoch):
    return 0.01 * 0.9 ** epoch

class LrHistory(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs={}):
        print("Learning rate:", K.get_value(model.optimizer.lr))
model_filename = 'cifar.{0:03d}.hdf5'

Training the model

In [None]:
model.fit(
    X_train, y_train, 
    batch_size=8, 
    epochs=5,
    callbacks=[keras.callbacks.LearningRateScheduler(lr_scheduler),LrHistory()],
    validation_data=(X_dev,y_dev),
    shuffle=True,
    verbose=1,
    initial_epoch = 0
)

Saving the model

In [None]:
model.save('drive/MyDrive/AA/'+model_filename)

Loading the model

In [None]:
model = keras.models.load_model('drive/MyDrive/AA/'+model_filename)

Preparing data for evaluation and prediction

In [None]:
X_1 = vectorizer(np.array([[s] for s in list(df['text1'])])).numpy()
X_2 = vectorizer(np.array([[s] for s in list(df['text2'])])).numpy()
X = [X_1, X_2]

y = np.expand_dims(np.array(labels), axis=1)

Evaluating the model

In [None]:
model.evaluate(X, y, batch_size = 10)



[0.0010039163753390312, 0.0010039163753390312, 3.8174126148223877, 1.0]

As can be seen, there is approximately 3.82% error in prediction after training for 3 epochs. The model has potential for much better performance if trained for a greater number of epochs.

Prediction using the trained model

In [None]:
y_predicted = model.predict(X, batch_size = 10, verbose = 1)



Saving the results as DataFrame

In [None]:
result_supervised_df = pd.DataFrame(zip(list(df['Unique_ID']),list(np.array(y_predicted).squeeze())), columns = ['Unique_ID', ' Similarity_Score'])

result_supervised_df.head()

Unnamed: 0,Unique_ID,Similarity_Score
0,0,0.57488
1,1,0.596193
2,2,0.684598
3,3,0.641111
4,4,0.607438


Saving the DataFrame to CSV file

In [None]:
result_supervised_df.to_csv('drive/MyDrive/supervised_approach_results.csv', index= False)