In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [2]:
# google drive mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data=pd.read_csv("/content/drive/MyDrive/STSbenchmark/input/train.csv")
data['score'] = data['score'] / 5.0 # normalize from 0 to 1
data.head(20)

Unnamed: 0,split,genre,dataset,year,sid,score,sentence1,sentence2
0,train,main-captions,MSRvid,2012test,1,1.0,A plane is taking off.,An air plane is taking off.
1,train,main-captions,MSRvid,2012test,4,0.76,A man is playing a large flute.,A man is playing a flute.
2,train,main-captions,MSRvid,2012test,5,0.76,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,train,main-captions,MSRvid,2012test,6,0.52,Three men are playing chess.,Two men are playing chess.
4,train,main-captions,MSRvid,2012test,9,0.85,A man is playing the cello.,A man seated is playing the cello.
5,train,main-captions,MSRvid,2012test,11,0.85,Some men are fighting.,Two men are fighting.
6,train,main-captions,MSRvid,2012test,12,0.1,A man is smoking.,A man is skating.
7,train,main-captions,MSRvid,2012test,13,0.32,The man is playing the piano.,The man is playing the guitar.
8,train,main-captions,MSRvid,2012test,14,0.44,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...
9,train,main-captions,MSRvid,2012test,16,1.0,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.


In [4]:
# Split the data into training and validation sets
train_data = data.sample(frac=0.8, random_state=42)
val_data = data.drop(train_data.index)

In [5]:
# Preprocess the data
sentences1 = train_data['sentence1'].values
sentences2 = train_data['sentence2'].values
labels = train_data['score'].values

In [6]:
# Tokenize the sentences and convert them to sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(np.concatenate((sentences1, sentences2), axis=0))
sequences1 = tokenizer.texts_to_sequences(sentences1)
sequences2 = tokenizer.texts_to_sequences(sentences2)

In [7]:
# Pad the sequences to have the same length
max_sequence_length = max(max(len(seq) for seq in sequences1), max(len(seq) for seq in sequences2))
padded_sequences1 = tf.keras.preprocessing.sequence.pad_sequences(sequences1, maxlen=max_sequence_length)
padded_sequences2 = tf.keras.preprocessing.sequence.pad_sequences(sequences2, maxlen=max_sequence_length)

In [8]:
from tensorflow.keras.layers import Reshape
from tensorflow.keras import backend as K

def jaccard_similarity_loss(x):
    # Compute Jaccard similarity between the vectors
    intersection = K.sum(K.minimum(x[0], x[1]), axis=1)
    union = K.sum(K.maximum(x[0], x[1]), axis=1)
    jaccard_similarities = intersection / union
    return K.reshape(jaccard_similarities, (-1, 1))

def siamese_rnn(vocab_size, embedding_dim, max_sequence_length):
    # Shared embedding layer
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_sequence_length)

    # Shared LSTM layer
    lstm_layer = LSTM(128)

    # Input layers
    input_1 = Input(shape=(max_sequence_length,))
    input_2 = Input(shape=(max_sequence_length,))

    # Apply embedding and LSTM
    encoded_1 = embedding_layer(input_1)
    encoded_1 = lstm_layer(encoded_1)

    encoded_2 = embedding_layer(input_2)
    encoded_2 = lstm_layer(encoded_2)

    # Jaccard similarity layer
    similarity = Lambda(jaccard_similarity_loss)([encoded_1, encoded_2])

    # Create the model
    model = Model(inputs=[input_1, input_2], outputs=similarity)
    return model

In [9]:
# Create the Siamese RNN model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
model = siamese_rnn(vocab_size, embedding_dim, max_sequence_length)

In [10]:
# Compile the model
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))

# Train the model
batch_size = 64
epochs = 10
model.fit([padded_sequences1, padded_sequences2], labels,
          batch_size=batch_size, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x781808d62530>

In [11]:
def preprocess_text(text, tokenizer, max_sequence_length):
    # Apply any necessary preprocessing steps to the input text
    # For example, tokenization, padding, etc.
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_sequence_length)
    return padded_sequence

def predict_similarity(input_text_1, input_text_2, model, tokenizer, max_sequence_length, threshold):
    # Preprocess the input texts
    input_seq_1 = preprocess_text(input_text_1, tokenizer, max_sequence_length)
    input_seq_2 = preprocess_text(input_text_2, tokenizer, max_sequence_length)

    # Perform the prediction
    prediction = model.predict([input_seq_1, input_seq_2])

    # Compare the predicted similarity score to the threshold
    print("Similarity Score:", prediction[0][0])
    if prediction >= threshold:
        return "Similar"
    else:
        return "Dissimilar"

In [12]:
import sklearn.metrics
# Preprocess the validation data
val_sentences1 = val_data['sentence1'].values
val_sentences2 = val_data['sentence2'].values
val_labels = val_data['score'].values
print(val_labels[:3])

val_sequences1 = tokenizer.texts_to_sequences(val_sentences1)
val_sequences2 = tokenizer.texts_to_sequences(val_sentences2)

val_padded_sequences1 = tf.keras.preprocessing.sequence.pad_sequences(val_sequences1, maxlen=max_sequence_length)
val_padded_sequences2 = tf.keras.preprocessing.sequence.pad_sequences(val_sequences2, maxlen=max_sequence_length)

# Iterate over different threshold values and evaluate F1 score on the validation set
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
best_auc_score = 0.0
scores=[]

fpr_list = []
tpr_list = []

for threshold in thresholds:
    y_pred = [predict_similarity(s1, s2, model, tokenizer, max_sequence_length, threshold) for s1, s2 in zip(val_sentences1, val_sentences2)]
    y_pred_labels = [1 if p == "Similar" else 0 for p in y_pred]
    val_labels_binary = [1 if score >= threshold else 0 for score in val_labels]

    fpr, tpr, _ = sklearn.metrics.roc_curve(val_labels_binary, y_pred_labels)
    fpr_list.append(fpr)
    tpr_list.append(tpr)

    # Calculate AUC using the current threshold
    auc_score = sklearn.metrics.auc(fpr, tpr)
    scores.append(auc_score)

    # Update best threshold if the current AUC-ROC score is higher
    if auc_score > best_auc_score:
        best_auc_score = auc_score
        best_threshold = threshold

print(scores)
print("Best Threshold based on AUC-ROC:", best_threshold)
print("AUC-ROC Score:", best_auc_score)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Similarity Score: 0.9068502
Similarity Score: 0.5339142
Similarity Score: 0.7001614
Similarity Score: 0.82562345
Similarity Score: 0.22123134
Similarity Score: 1.0
Similarity Score: 0.61676157
Similarity Score: 0.6980363
Similarity Score: 0.3860786
Similarity Score: 0.53847754
Similarity Score: 1.0
Similarity Score: 0.36579302
Similarity Score: 0.48032597
Similarity Score: 0.58547515
Similarity Score: 0.49046314
Similarity Score: 0.85922253
Similarity Score: 0.8809597
Similarity Score: 0.43442002
Similarity Score: 0.4966488
Similarity Score: 0.5048833
Similarity Score: 0.5318565
Similarity Score: 0.5578401
Similarity Score: 0.4499001
Similarity Score: 0.81278586
Similarity Score: 0.53909767
Similarity Score: 0.6268062
Similarity Score: 0.52107793
Similarity Score: 0.84535307
Similarity Score: 0.35909903
Similarity Score: 0.63010716
Similarity Score: 0.93405455
Similarity Score: 0.58415663
Similarity Score: 0.22108077
Simi