In [None]:
import os
# Configure PyTorch's CUDA caching allocator. This setting concerns GPU memory, not CPU memory.
# The cell sits before the torch-based imports so the value is in place when CUDA is initialised.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'max_split_size_mb:4096' #intended to avoid CUDA out-of-memory errors caused by allocator fragmentation

In [None]:
!pip install sentence-transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample, SentencesDataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv
import pandas as pd

In [None]:
# Folder holding the three STS benchmark split files.
STS_DIR = "/content/drive/MyDrive/Colab Notebooks/stsbenchmark"


def load_sts_split(split):
    """Read one STS benchmark split and keep only the columns used downstream.

    Args:
        split: Split name used in the file name, e.g. ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A DataFrame with the integer-labelled columns 4 (gold score), 5 and 6
        (the two sentences). The files have no header row.
    """
    return pd.read_table(
        f"{STS_DIR}/sts-{split}.csv",
        on_bad_lines='skip',  # rows with an unexpected number of fields are dropped
        header=None,
        # Treat double quotes as ordinary characters. Previously only the test
        # split was read this way; with the default quoting a stray quote inside
        # a sentence can swallow the following rows, so train/dev were parsed
        # differently from test.
        quoting=csv.QUOTE_NONE,
    )[[4, 5, 6]]


train = load_sts_split("train")
test = load_sts_split("test")
dev = load_sts_split("dev")

In [None]:
# Preview the training split: column 4 is the gold score, columns 5 and 6 are the sentence pair.
train

Unnamed: 0,4,5,6
0,5.00,A plane is taking off.,An air plane is taking off.
1,3.80,A man is playing a large flute.,A man is playing a flute.
2,3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,2.60,Three men are playing chess.,Two men are playing chess.
4,4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...
5504,1.20,"Palestinian hunger striker, Israel reach deal",Palestinian activist detained in Israeli raid
5505,4.80,Assad says Syria will comply with UN arms reso...,Syria's Assad vows to comply with U.N. resolution
5506,4.60,South Korean President Sorry For Ferry Response,S. Korean president 'sorry' for ferry disaster
5507,0.00,Food price hikes raise concerns in Iran,American Chris Horner wins Tour of Spain


In [None]:
# Model identifiers of the two encoders that are fine-tuned and later ensembled.
structbert_model_name = 'bayartsogt/structbert-large'
roberta_model_name = 'sentence-transformers/stsb-roberta-base-v2'

# Training hyper-parameters shared by both fine-tuning runs.
BATCH_SIZE, EPOCHS = 4, 2

# Each fine-tuned model is stored under this folder, in a sub-path named after the model id.
_output_root = '/content/drive/MyDrive/Colab Notebooks/stsbenchmark/output/'
structbert_model_save_path = f"{_output_root}{structbert_model_name}"
roberta_model_save_path = f"{_output_root}{roberta_model_name}"

In [None]:
# Load the StructBERT checkpoint as a SentenceTransformer encoder.
# The log printed below lists checkpoint weights that were not used by the underlying BertModel.
structbertmodel = SentenceTransformer(structbert_model_name) #pre-trained structbert

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/bayartsogt_structbert-large were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /root/.cache/torch/sentence_transformers/bayartsogt_stru

In [None]:
# Load the RoBERTa-base sentence encoder.
# NOTE(review): the checkpoint name contains "stsb", which suggests it has already been
# tuned on the STS benchmark rather than being a plain pre-trained model - confirm.
robertamodel = SentenceTransformer(roberta_model_name) #roberta base sentence encoder

In [None]:
#converting the loaded dataframes into sentence-transformers training examples
def get_samples(df):
    """Turn an STS dataframe into a list of ``InputExample`` objects.

    Args:
        df: Table indexed by the integer column labels 4 (gold score),
            5 (first sentence) and 6 (second sentence).

    Returns:
        A list with one ``InputExample`` per valid row. ``texts`` holds the two
        sentences and ``label`` is the gold score divided by 5.0.

    Rows are skipped when either sentence is not a string or when the score is
    not a usable float (a handful of rows in the CSV files are mistyped).
    """
    dlist = []
    for score, s1, s2 in zip(df[4], df[5], df[6]):
        # isinstance is the idiomatic type check and also accepts float subclasses.
        if not (isinstance(s1, str) and isinstance(s2, str) and isinstance(score, float)):
            continue
        # NaN is a float, so it passes the type check; it would become a NaN label
        # and poison the loss, so drop it explicitly.
        if math.isnan(score):
            continue
        #normalising similarity score
        dlist.append(InputExample(texts=[s1, s2], label=float(score)/5.0))
    return dlist

# Replace each split's dataframe with its list of training examples.
# The names are rebound from DataFrame to list here, so this cell can only be
# re-run after the loading cell has been run again.
train, test, dev = (get_samples(frame) for frame in (train, test, dev))

In [None]:
# Shuffled mini-batches over the training examples; the same loader is passed to both fit() calls below.
train_dataloader = DataLoader(train, shuffle=True, batch_size=BATCH_SIZE)

In [None]:
#using STS evaluation module from the sentence transformers library
#built from the dev split; the same evaluator is passed to both fit() calls below
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev, name='sts-dev')

In [None]:
# Loss for the StructBERT run, bound to that model. Labels are the gold scores divided by 5.0.
train_loss = losses.CosineSimilarityLoss(model=structbertmodel)

Fine-tuning StructBERT

StructBERT is pre-trained on both single-sentence and sentence-pair tasks so that it better captures inner- and inter-sentence structure.

In [None]:
# Fine-tune StructBERT on the training pairs.
structbertmodel.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,  # dev-set evaluator
    epochs=EPOCHS,
    evaluation_steps=1000,  # evaluator cadence, in training steps
    output_path=structbert_model_save_path,  # reloaded from here for testing
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1377 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1377 [00:00<?, ?it/s]

In [None]:
# Store a second copy of the in-memory model next to the fit() output path.
# NOTE(review): SentenceTransformer.save writes a directory of config and weight files,
# so despite the ".pt" suffix this path is presumably a folder, not a single PyTorch file - confirm.
structbertmodel.save(structbert_model_save_path+".pt") #saving the model to disk

In [None]:
# Reload StructBERT from the path fit() wrote to, replacing the in-memory model.
structbertmodel = SentenceTransformer(structbert_model_save_path)
# Evaluator over the held-out test examples.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test, name='sts-test')

In [None]:
#calculating the Spearman correlation between the predicted scores and ground truth scores
#on the test split; output_path is passed so the evaluator can write its results there
test_evaluator(structbertmodel, output_path=structbert_model_save_path)

0.8624807586393146

In [None]:
#testing trained encoder: smoke test that returns one embedding row per input sentence
structbertmodel.encode(["first sentence", "second sentence"])

array([[ 0.06496187, -0.1636802 ,  0.08490301, ..., -0.05080017,
         0.41919583,  0.59681857],
       [ 0.6488059 , -0.19953668,  0.53966045, ..., -0.1571952 ,
         0.5590603 ,  0.25005576]], dtype=float32)

Fine-tuning RoBERTa

The RoBERTa authors found BERT to be severely undertrained and made it more robust by training it further.

In [None]:
# Rebind train_loss to a fresh loss tied to the RoBERTa model (it previously pointed at StructBERT).
train_loss = losses.CosineSimilarityLoss(model=robertamodel)

In [None]:
# Fine-tune RoBERTa with the same data loader, evaluator and schedule as the StructBERT run.
robertamodel.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,  # dev-set evaluator
    epochs=EPOCHS,
    evaluation_steps=1000,  # evaluator cadence, in training steps
    output_path=roberta_model_save_path,  # reloaded from here for testing
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1377 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1377 [00:00<?, ?it/s]

In [None]:
# Store a second copy of the in-memory model next to the fit() output path.
# NOTE(review): SentenceTransformer.save writes a directory of config and weight files,
# so despite the ".pt" suffix this path is presumably a folder, not a single PyTorch file - confirm.
robertamodel.save(roberta_model_save_path+".pt")  #saving the model to disk

In [None]:
# Reload RoBERTa from the path fit() wrote to, replacing the in-memory model.
robertamodel = SentenceTransformer(roberta_model_save_path)
# Evaluator over the held-out test examples (rebuilt; same construction as for StructBERT).
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test, name='sts-test')

In [None]:
#calculating the Spearman correlation between the predicted scores and ground truth scores
#on the test split; output_path is passed so the evaluator can write its results there
test_evaluator(robertamodel, output_path=roberta_model_save_path)

0.8895914982090101

In [None]:
# Smoke test of the fine-tuned encoder: returns one embedding row per input sentence.
robertamodel.encode(["first sentence", "second sentence"])

array([[-0.8311274 , -0.27338457,  0.7927528 , ..., -0.87827265,
         0.5500811 , -0.79679465],
       [-0.8113476 ,  0.19782084,  0.68522835, ..., -0.9067161 ,
         0.7473745 , -0.8184176 ]], dtype=float32)

Taking ensemble model predictions

In [None]:
# Re-read the raw test split: `test` was rebound to a list of InputExample
# objects earlier in the notebook, so the dataframe has to be loaded again.
test = pd.read_table("/content/drive/MyDrive/Colab Notebooks/stsbenchmark/sts-test.csv", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)[[4, 5, 6]]

# Build plain dicts for the ensemble loop, keeping only fully valid rows.
testset = []
for score, s1, s2 in zip(test[4], test[5], test[6]):
    # All three fields must be valid (`and`, matching get_samples). The previous
    # `or` let through rows with a missing sentence or a non-float score, which
    # would then be fed to encode() / float().
    if not (isinstance(s1, str) and isinstance(s2, str) and isinstance(score, float)):
        continue
    # NaN passes the float check but would make the rank correlation undefined.
    if math.isnan(score):
        continue
    testset.append({"s1": s1,
                    "s2": s2,
                    "score": float(score)/5.0  # same /5.0 scaling as the training labels
                    })

In [None]:
import numpy as np
from numpy.linalg import norm
def cos_sim(A, B):
    """Return the cosine similarity of the vectors ``A`` and ``B``.

    Computed as the dot product divided by the product of the two norms.
    """
    dot_product = np.dot(A, B)
    norm_product = norm(A) * norm(B)
    return dot_product / norm_product

In [None]:
# Score every test pair with both fine-tuned encoders and average the two similarities.
predictions = []
for sample in testset:
    sentence_pair = [sample['s1'], sample['s2']]

    # embed the pair with each model (index 0 = first sentence, index 1 = second)
    sb_vecs = structbertmodel.encode(sentence_pair)
    rb_vecs = robertamodel.encode(sentence_pair)

    # per-model cosine similarity between the two sentence embeddings
    sb_score = cos_sim(sb_vecs[0], sb_vecs[1])
    rb_score = cos_sim(rb_vecs[0], rb_vecs[1])

    # ensemble prediction is the unweighted mean of the two model scores
    predictions.append({"pred": (sb_score + rb_score)/2,
                        "gt": sample['score']})
    

In [None]:
from scipy.stats import spearmanr
# Spearman rank correlation between the ensemble predictions and the gold scores.
ensemble_scores = [row['pred'] for row in predictions]
gold_scores = [row['gt'] for row in predictions]
spearmanr(ensemble_scores, gold_scores)

SpearmanrResult(correlation=0.895600166273939, pvalue=0.0)