In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV read
# later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training CSV from the mounted drive, then divide the score column
# by 5.0 so that it is normalized to the 0..1 range.
data = pd.read_csv("/content/drive/MyDrive/STSbenchmark/input/train.csv")
data = data.assign(score=data['score'] / 5.0)
data.head(20)

Unnamed: 0,split,genre,dataset,year,sid,score,sentence1,sentence2
0,train,main-captions,MSRvid,2012test,1,1.0,A plane is taking off.,An air plane is taking off.
1,train,main-captions,MSRvid,2012test,4,0.76,A man is playing a large flute.,A man is playing a flute.
2,train,main-captions,MSRvid,2012test,5,0.76,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,train,main-captions,MSRvid,2012test,6,0.52,Three men are playing chess.,Two men are playing chess.
4,train,main-captions,MSRvid,2012test,9,0.85,A man is playing the cello.,A man seated is playing the cello.
5,train,main-captions,MSRvid,2012test,11,0.85,Some men are fighting.,Two men are fighting.
6,train,main-captions,MSRvid,2012test,12,0.1,A man is smoking.,A man is skating.
7,train,main-captions,MSRvid,2012test,13,0.32,The man is playing the piano.,The man is playing the guitar.
8,train,main-captions,MSRvid,2012test,14,0.44,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...
9,train,main-captions,MSRvid,2012test,16,1.0,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.


In [None]:
# Randomly pick 80% of the rows (fixed seed) for training; every row whose
# index was not sampled becomes the validation set.
train_data = data.sample(frac=0.8, random_state=42)
held_out_mask = ~data.index.isin(train_data.index)
val_data = data[held_out_mask]

In [None]:
# Pull the two sentence columns and the normalized score column out of the
# training frame as arrays.
sentences1, sentences2, labels = (
    train_data[column].values for column in ('sentence1', 'sentence2', 'score')
)

In [None]:
# Fit one tokenizer on the training sentences from both columns, then turn
# each column into sequences with that shared tokenizer.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
all_training_sentences = np.concatenate([sentences1, sentences2])
tokenizer.fit_on_texts(all_training_sentences)
sequences1, sequences2 = (
    tokenizer.texts_to_sequences(texts) for texts in (sentences1, sentences2)
)

In [None]:
# Pad both sides of every pair to one common length: the longest sequence
# found in either training column.
longest_in_first = max(map(len, sequences1))
longest_in_second = max(map(len, sequences2))
max_sequence_length = max(longest_in_first, longest_in_second)

padded_sequences1, padded_sequences2 = (
    tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=max_sequence_length)
    for seqs in (sequences1, sequences2)
)

In [None]:
from tensorflow.keras.layers import Reshape
from tensorflow.keras import backend as K

def pearson_correlation_loss(x):
    """Compute the Pearson correlation between two batches of vectors, row by row.

    Despite its name, `siamese_rnn` uses this function as the body of a
    `Lambda` layer that produces the model output; it is not passed to
    `compile` as a loss.

    Args:
        x: An indexable pair; `x[0]` and `x[1]` are the two inputs. Every
            reduction runs along axis 1 with `keepdims=True`.

    Returns:
        The covariance of the two inputs divided by the product of their
        standard deviations, with `K.epsilon()` added to the denominator to
        avoid division by zero. Axis 1 is kept with size 1.
    """
    first, second = x[0], x[1]

    # Center each input on its own per-row mean.
    first_centered = first - K.mean(first, axis=1, keepdims=True)
    second_centered = second - K.mean(second, axis=1, keepdims=True)

    covariance = K.mean(first_centered * second_centered, axis=1, keepdims=True)
    std_product = K.std(first, axis=1, keepdims=True) * K.std(second, axis=1, keepdims=True)

    return covariance / (std_product + K.epsilon())


def siamese_rnn(vocab_size, embedding_dim, max_sequence_length, lstm_units=128):
    """Build a Siamese network that scores a pair of token-id sequences.

    Both inputs go through the same Embedding layer and the same LSTM layer
    (the weights are shared between the two branches). The two LSTM outputs
    are then combined by `pearson_correlation_loss` inside a `Lambda` layer,
    and that correlation is the model output.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        max_sequence_length: Length of each input sequence.
        lstm_units: Output size of the shared LSTM. Defaults to 128, the
            value that used to be hard-coded.

    Returns:
        An uncompiled `Model` taking `[input_1, input_2]` and returning the
        correlation tensor.
    """
    # One embedding and one LSTM instance, reused for both branches so the
    # two sentences are encoded by identical weights.
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_sequence_length)
    lstm_layer = LSTM(lstm_units)

    # Input layers
    input_1 = Input(shape=(max_sequence_length,))
    input_2 = Input(shape=(max_sequence_length,))

    # Encode each side with the shared layers
    encoded_1 = lstm_layer(embedding_layer(input_1))
    encoded_2 = lstm_layer(embedding_layer(input_2))

    # Pearson correlation coefficient layer
    correlation = Lambda(pearson_correlation_loss)([encoded_1, encoded_2])

    return Model(inputs=[input_1, input_2], outputs=correlation)

In [None]:
# Build the Siamese RNN. The vocabulary size is the tokenizer's word count
# plus one (presumably because index 0 is reserved for padding - confirm).
embedding_dim = 100
vocab_size = 1 + len(tokenizer.word_index)
model = siamese_rnn(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_sequence_length=max_sequence_length,
)

In [None]:
# Training configuration
batch_size = 64
epochs = 10

# Regress the model output onto the normalized scores with MSE and Adam.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Fit on the padded training pairs.
model.fit(
    x=[padded_sequences1, padded_sequences2],
    y=labels,
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78152cc75390>

In [None]:
def preprocess_text(text, tokenizer, max_sequence_length):
    """Convert one raw text into a padded sequence ready for the model.

    Args:
        text: The text to encode.
        tokenizer: Object exposing `texts_to_sequences`; it is called with a
            one-element list containing `text`.
        max_sequence_length: Passed to `pad_sequences` as `maxlen`.

    Returns:
        The result of `pad_sequences` for the single encoded text.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    return tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=max_sequence_length)

def predict_similarity(input_text_1, input_text_2, model, tokenizer, max_sequence_length, threshold):
    """Classify one pair of texts as "Similar" or "Dissimilar".

    Both texts are encoded with `preprocess_text`, scored by `model.predict`,
    and the resulting score is printed and compared against `threshold`.

    Args:
        input_text_1: First text of the pair.
        input_text_2: Second text of the pair.
        model: Object whose `predict` accepts the two encoded inputs as a list.
        tokenizer: Tokenizer forwarded to `preprocess_text`.
        max_sequence_length: Padding length forwarded to `preprocess_text`.
        threshold: Scores greater than or equal to this value count as similar.

    Returns:
        "Similar" if the score is >= `threshold`, otherwise "Dissimilar".
    """
    # Preprocess the input texts
    input_seq_1 = preprocess_text(input_text_1, tokenizer, max_sequence_length)
    input_seq_2 = preprocess_text(input_text_2, tokenizer, max_sequence_length)

    # Perform the prediction
    prediction = model.predict([input_seq_1, input_seq_2])

    # Only one pair was submitted, so take the single score out of the batch
    # and compare that scalar rather than the whole prediction array.
    score = prediction[0][0]
    print("Similarity Score:", score)

    return "Similar" if score >= threshold else "Dissimilar"

In [None]:
# Preprocess the validation data with the tokenizer and padding length that
# were fitted on the training set.
val_sentences1 = val_data['sentence1'].values
val_sentences2 = val_data['sentence2'].values
val_labels = val_data['score'].values

val_sequences1 = tokenizer.texts_to_sequences(val_sentences1)
val_sequences2 = tokenizer.texts_to_sequences(val_sentences2)

val_padded_sequences1 = tf.keras.preprocessing.sequence.pad_sequences(val_sequences1, maxlen=max_sequence_length)
val_padded_sequences2 = tf.keras.preprocessing.sequence.pad_sequences(val_sequences2, maxlen=max_sequence_length)

# Score every validation pair exactly once, in batches. The model output does
# not depend on the decision threshold, so predicting inside the threshold
# loop (one pair at a time) only repeated the same work for each threshold.
val_scores = np.asarray(
    model.predict([val_padded_sequences1, val_padded_sequences2], verbose=0)
).ravel()

# The labels are continuous (score / 5.0), so exact `== 1` / `== 0` checks
# would leave almost every pair out of the counts. Binarize them instead.
# NOTE(review): 0.5 is an assumed cutoff for "similar" - adjust to the task.
label_cutoff = 0.5
val_is_similar = val_labels >= label_cutoff

# Sweep the decision threshold and keep the one with the best F1 score.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
best_f1_score = 0.0
best_threshold = 0.0
scores = []

for threshold in thresholds:
    predicted_similar = val_scores >= threshold

    true_positives = int(np.sum(predicted_similar & val_is_similar))
    false_positives = int(np.sum(predicted_similar & ~val_is_similar))
    false_negatives = int(np.sum(~predicted_similar & val_is_similar))

    # A small epsilon in each denominator avoids division by zero.
    precision = true_positives / (true_positives + false_positives + 1e-7)
    recall = true_positives / (true_positives + false_negatives + 1e-7)
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-7)

    scores.append(f1_score)

    if f1_score > best_f1_score:
        best_f1_score = f1_score
        best_threshold = threshold

print("F1 Scores:", scores)
print("Best F1 Score:", best_f1_score)
print("Best Threshold:", best_threshold)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Similarity Score: 0.95076424
Similarity Score: 0.4590271
Similarity Score: 0.8360086
Similarity Score: 0.89035356
Similarity Score: 0.2780411
Similarity Score: 0.99916583
Similarity Score: 0.55330324
Similarity Score: 0.80383474
Similarity Score: 0.25871688
Similarity Score: 0.6781986
Similarity Score: 0.99898714
Similarity Score: 0.47029072
Similarity Score: 0.5315572
Similarity Score: 0.59547716
Similarity Score: 0.537544
Similarity Score: 0.9558389
Similarity Score: 0.9500131
Similarity Score: 0.69464284
Similarity Score: 0.6794083
Similarity Score: 0.6169964
Similarity Score: 0.5329593
Similarity Score: 0.6747968
Similarity Score: 0.37453994
Similarity Score: 0.95438015
Similarity Score: 0.5557718
Similarity Score: 0.39763004
Similarity Score: 0.8493989
Similarity Score: 0.98965544
Similarity Score: 0.4104184
Similarity Score: 0.6733608
Similarity Score: 0.96538794
Similarity Score: 0.53476787
Similarity Score: 0.3685