In [None]:
! pip install -q transformers datasets

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/looseversion-1.3.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/lightning_utilities-0.12.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/nvfuser-0.2.25a0+6627725-py3.12-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/pyth

# Import Libraries

In [3]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AdamW
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from scipy.stats import spearmanr

  from .autonotebook import tqdm as notebook_tqdm


# Datasets

In [4]:
def get_sts_dataset(dataset_name, split=0.3):
    if dataset_name == 'STS-B':
        dataset_name = 'stsbenchmark'
    elif dataset_name == 'SICK-R':
        dataset_name = 'sickr'
    dataset = load_dataset(f'mteb/{dataset_name.lower()}-sts', split='test')
    dataset = dataset.rename_column('score', 'labels')
    dataset_split = dataset.train_test_split(test_size=split)
    train_dataset, test_dataset = dataset_split['train'], dataset_split['test']
    return train_dataset, test_dataset

sts_datasets = ['STS-B', 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'SICK-R']

In [5]:
class STSDataset(torch.utils.data.Dataset):
    def __init__(self, sentence1, sentence2, label):
        self.sentence1 = sentence1
        self.sentence2 = sentence2
        self.label = label

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        return self.sentence1[idx], self.sentence2[idx], self.label[idx]

# Model

In [6]:
model_id = 'bert-base-uncased'

# Loss Functions

## Embedding Losses

#### Cosine Similarity MSE Loss

##### Normalization

In [7]:
def divided_by_maximum(labels):
    return labels / torch.max(labels)

In [8]:
def sigmoid(labels):
    labels = np.array(labels)
    return 1 / (1 + np.exp(-labels))

In [9]:
def norm_function(norm, labels):
    return globals()[norm](labels)

##### Loss

In [10]:
def cosine_similarity_mse_loss(embedding1, embedding2, labels):
    # Calculating the cosine similarity between the pairs of embeddings...
    cos_sim = F.cosine_similarity(embedding1, embedding2)

    # MSE loss...
    squared_difference = (labels - cos_sim) ** 2
    loss = squared_difference.mean()

    return loss

In [11]:
def cosine_similarity_mse_norm(embedding1, embedding2, labels, norm):
    labels_norm = norm_function(norm, labels)
    # Calculating the cosine similarity between the pairs of embeddings...
    cos_sim = F.cosine_similarity(embedding1, embedding2)

    # MSE loss...
    squared_difference = (labels_norm - cos_sim) ** 2
    loss = squared_difference.mean()

    return loss

#### CoSENT Loss

In [12]:
def cosent_loss(embedding1, embedding2, labels, tau=20.0):
    # Input preparation...
    labels = (labels[:, None] < labels[None, :]).float()

    # Normalization of Logits...
    embedding1 = F.normalize(embedding1, p=2, dim=1)
    embedding2 = F.normalize(embedding2, p=2, dim=1)

    # Cosine Similarity Calculation...
    # The dot product of these pairs gives the cosine similarity, scaled by a factor of tau to control the sharpness of similarity scores...
    y_pred = torch.sum(embedding1 * embedding2, dim=1) * tau

    # Pairwise cosine similarity difference calculation...
    y_pred = y_pred[:, None] - y_pred[None, :]

    y_pred = (y_pred - (1 - labels) * 1e12).view(-1)

    zero = torch.Tensor([0]).to(y_pred.device)
    y_pred = torch.concat((zero, y_pred), dim=0)
    return torch.logsumexp(y_pred, dim=0)

#### In-Batch Negatives Loss

In [13]:
def categorical_crossentropy(y_true, y_pred):
    return -(F.log_softmax(y_pred, dim=1) * y_true).sum(dim=1)

def in_batch_negative_loss(embedding1, embedding2, labels, tau=20.0, negative_weights=0.0):
    device = labels.device
    y_pred = torch.empty((2 * embedding1.shape[0], embedding1.shape[1]), device=device)
    y_pred[0::2] = embedding1
    y_pred[1::2] = embedding2
    y_true = labels.repeat_interleave(2).unsqueeze(1)

    def make_target_matrix(y_true):
        idxs = torch.arange(0, y_pred.shape[0]).int().to(device)
        y_true = y_true.int()
        idxs_1 = idxs[None, :]
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]

        idxs_1 *= y_true.T
        idxs_1 += (y_true.T == 0).int() * -2

        idxs_2 *= y_true
        idxs_2 += (y_true == 0).int() * -1

        y_true = (idxs_1 == idxs_2).float()
        return y_true

    neg_mask = make_target_matrix(y_true == 0)

    y_true = make_target_matrix(y_true)

    y_pred = F.normalize(y_pred, dim=1, p=2)
    similarities = y_pred @ y_pred.T
    similarities = similarities - torch.eye(y_pred.shape[0]).to(device) * 1e12
    similarities = similarities * tau

    if negative_weights > 0:
        similarities += neg_mask * negative_weights

    return categorical_crossentropy(y_true, similarities).mean()

#### Angle Loss

In [14]:
def angle_loss(embedding1, embedding2, labels, tau=1.0):
    # Input preparation...
    labels = (labels[:, None] < labels[None, :]).float()

    # Chunking into real and imaginary parts...
    y_pred_re1, y_pred_im1 = torch.chunk(embedding1, 2, dim=1)
    y_pred_re2, y_pred_im2 = torch.chunk(embedding2, 2, dim=1)

    a = y_pred_re1
    b = y_pred_im1
    c = y_pred_re2
    d = y_pred_im2

    z = torch.sum(c**2 + d**2, dim=1, keepdim=True)
    re = (a * c + b * d) / z
    im = (b * c - a * d) / z

    dz = torch.sum(a**2 + b**2, dim=1, keepdim=True)**0.5
    dw = torch.sum(c**2 + d**2, dim=1, keepdim=True)**0.5
    re /= (dz / dw)
    im /= (dz / dw)

    y_pred = torch.concat((re, im), dim=1)
    y_pred = torch.abs(torch.sum(y_pred, dim=1)) * tau
    y_pred = y_pred[:, None] - y_pred[None, :]
    y_pred = (y_pred - (1 - labels) * 1e12).view(-1)
    zero = torch.Tensor([0]).to(y_pred.device)
    y_pred = torch.concat((zero, y_pred), dim=0)
    return torch.logsumexp(y_pred, dim=0)

#### Combination of CoSENT, In-Batch Negatives and Angle Losses

In [15]:
def cosent_ibn_angle(embedding1, embedding2, labels, w_cosent=1, w_ibn=1, w_angle=1, tau_cosent=20.0, tau_ibn=20.0, tau_angle=1.0):
    return w_cosent * cosent_loss(embedding1, embedding2, labels, tau_cosent) + w_ibn * in_batch_negative_loss(embedding1, embedding2, labels, tau_ibn) + w_angle * angle_loss(embedding1, embedding2, labels, tau_angle)

## Loss List

In [None]:
losses = [
    {'loss_name': 'without_ft', 'loss_type': 'emb', 'loss_kwargs': {}},
    {'loss_name': 'cosine_similarity_mse_loss', 'loss_type': 'emb', 'loss_kwargs': {}},
    {'loss_name': 'cosine_similarity_mse_norm', 'loss_type': 'emb', 'loss_kwargs': {'norm': 'divided_by_maximum'}},
    {'loss_name': 'cosent_loss', 'loss_type': 'emb', 'loss_kwargs': {'tau': 20.0}},
    {'loss_name': 'in_batch_negative_loss', 'loss_type': 'emb', 'loss_kwargs': {'tau': 20.0}},
    {'loss_name': 'angle_loss', 'loss_type': 'emb', 'loss_kwargs': {'tau': 1.0}},
    {'loss_name': 'cosent_ibn_angle', 'loss_type': 'emb', 'loss_kwargs': {'w_cosent': 1, 'w_ibn': 1, 'w_angle': 1, 'tau_cosent': 20.0, 'tau_ibn': 20.0, 'tau_angle': 1.0}}
]

# Training

### Training Preparation

#### Device Setting

In [18]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#### Dataset Preparation

In [20]:
def prepare_dataset(dataset):
    # Dataset Import...
    ds = get_sts_dataset(dataset)

    # Random Split...
    train_test_split = ds.train_test_split(test_size=0.30)
    train_dataset = train_test_split['train']
    test_dataset = train_test_split['test']
    return train_dataset, test_dataset

#### Model and Tokenizer Preparation

In [21]:
def get_model_tokenizer(model_id):
    model = AutoModel.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model.to(device)
    return model, tokenizer

#### Embedding Extraction

In [22]:
def extract_embeddings(model, tokenizer, device, sentences, to_numpy=False):
    encodings = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True).to(device)
    # [CLS] token embedding...
    embeddings = model(**encodings).last_hidden_state[:, 0, :]

    if to_numpy:
        embeddings = embeddings.cpu().detach().numpy()
    return embeddings

#### Train Driver

In [23]:
def train(model, tokenizer, dataset, batch_size, epochs=10, loss_name='cosine_similarity_mse_loss', **loss_kwargs):
    # Optimizer setting...
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Training loop...
    num_epochs = epochs
    model.train()
    for epoch in range(num_epochs):
        # print(f"Epoch {epoch + 1}/{num_epochs}")
        data_loader = DataLoader(STSDataset(dataset['sentence1'], dataset['sentence2'], dataset['labels']), batch_size=batch_size, shuffle=True)

        for sentence1_texts, sentence2_texts, labels in tqdm(data_loader, desc="Training", leave=False):
            labels = labels.to(device)

            # [CLS] token embedding...
            sentence1_embeddings = extract_embeddings(model, tokenizer, device, sentence1_texts)
            sentence2_embeddings = extract_embeddings(model, tokenizer, device, sentence2_texts)

            # Embedding Loss...
            loss = globals()[loss_name](sentence1_embeddings, sentence2_embeddings, labels, **loss_kwargs)
            if loss == 0.0:
                continue

            # Backpropagation...
            loss.backward()

            # Updating weights...
            optimizer.step()
            optimizer.zero_grad()
    return model

#### Evaluation Driver

In [24]:
def calculate_cosine_similarity(embeddings_1, embeddings_2):
    cosine_similarity = F.cosine_similarity(embeddings_1, embeddings_2, dim=1)
    return cosine_similarity

In [25]:
def calculate_Spearman_rank_correlation_coefficient(scores, scores_actual):
    sc, _ = spearmanr(scores, scores_actual)
    return sc

In [26]:
def evaluate_sts(model, tokenizer, test_dataset, batch_size):
    model.eval()

    test_dataset = STSDataset(test_dataset['sentence1'], test_dataset['sentence2'], test_dataset['labels'])
    data_loader = DataLoader(test_dataset, batch_size=batch_size)
    all_embeddings1 = []
    all_embeddings2 = []
    all_labels = []

    with torch.no_grad():
        for sentences1, sentences2, labels in tqdm(data_loader, desc="Extracting embeddings", leave=False):
            embeddings1 = extract_embeddings(model, tokenizer, device, sentences1)
            embeddings2 = extract_embeddings(model, tokenizer, device, sentences2)
            all_embeddings1.append(embeddings1.cpu())
            all_embeddings2.append(embeddings2.cpu())
            all_labels.append(labels.cpu())

    data_embeddings1 = torch.cat(all_embeddings1)
    data_embeddings2 = torch.cat(all_embeddings2)
    data_labels = torch.cat(all_labels)
    data_labels_np = data_labels.numpy()

    cosine_similarities = calculate_cosine_similarity(data_embeddings1, data_embeddings2)
    spearman = calculate_Spearman_rank_correlation_coefficient(cosine_similarities, data_labels_np)
    return spearman

### Loop

In [27]:
total_runs = 3
batch_size = 60
spearman_list = []
for loss in losses:
    loss_name = loss['loss_name']
    loss_type = loss['loss_type']
    loss_kwargs = loss['loss_kwargs']

    for dataset in sts_datasets:
        print(f'Running: {loss_name} on {dataset}')
        total_spearman = 0.

        for loop_count in range(0, total_runs):
            # Dataset Preparation...
            train_dataset, test_dataset = get_sts_dataset(dataset)

            # Model Preparation...
            model, tokenizer = get_model_tokenizer(model_id)

            # Training Loop...
            if loss_name != 'without_ft':
                model = train(model, tokenizer, train_dataset, batch_size, epochs=5, loss_name=loss_name, **loss_kwargs)

            # Evaluation loop...
            spearman = evaluate_sts(model, tokenizer, test_dataset, batch_size)
            # print(f'Loop {loop_count} spearman - {spearman}')
            total_spearman += spearman
        spearman_list.append({'loss': loss_name, 'dataset': dataset, 'spearman': total_spearman / total_runs})

Running: without_ft on STS-B


Generating train split: 100%|██████████| 5749/5749 [00:00<00:00, 213172.91 examples/s]
Generating validation split: 100%|██████████| 1500/1500 [00:00<00:00, 292530.62 examples/s]
Generating test split: 100%|██████████| 1379/1379 [00:00<00:00, 353196.46 examples/s]
                                                                    

Running: without_ft on STS12


Generating train split: 100%|██████████| 2234/2234 [00:00<00:00, 481430.16 examples/s]
Generating test split: 100%|██████████| 3108/3108 [00:00<00:00, 672646.90 examples/s]
                                                                      

Running: without_ft on STS13


Generating test split: 100%|██████████| 1500/1500 [00:00<00:00, 393905.33 examples/s]
                                                                    

Running: without_ft on STS14


Generating test split: 100%|██████████| 3750/3750 [00:00<00:00, 723455.22 examples/s]
                                                                      

Running: without_ft on STS15


Generating test split: 100%|██████████| 3000/3000 [00:00<00:00, 630754.02 examples/s]
                                                                      

Running: without_ft on STS16


Generating test split: 100%|██████████| 1186/1186 [00:00<00:00, 298245.97 examples/s]
                                                                    

Running: without_ft on SICK-R


Generating test split: 100%|██████████| 9927/9927 [00:00<00:00, 1442018.97 examples/s]
                                                                      

Running: cosent_ibn_angle on STS-B


                                                                    

Running: cosent_ibn_angle on STS12


                                                                      

Running: cosent_ibn_angle on STS13


                                                                    

Running: cosent_ibn_angle on STS14


                                                                      

Running: cosent_ibn_angle on STS15


                                                                      

Running: cosent_ibn_angle on STS16


                                                                    

Running: cosent_ibn_angle on SICK-R


                                                                      

In [28]:
spearman_list

[{'loss': 'without_ft', 'dataset': 'STS-B', 'spearman': 0.26643627551205923},
 {'loss': 'without_ft', 'dataset': 'STS12', 'spearman': 0.2420585603055578},
 {'loss': 'without_ft', 'dataset': 'STS13', 'spearman': 0.3438723953590479},
 {'loss': 'without_ft', 'dataset': 'STS14', 'spearman': 0.19262198854266815},
 {'loss': 'without_ft', 'dataset': 'STS15', 'spearman': 0.36896620968589905},
 {'loss': 'without_ft', 'dataset': 'STS16', 'spearman': 0.40072446843573833},
 {'loss': 'without_ft', 'dataset': 'SICK-R', 'spearman': 0.4221451404383869},
 {'loss': 'cosent_ibn_angle',
  'dataset': 'STS-B',
  'spearman': 0.8088552574843441},
 {'loss': 'cosent_ibn_angle',
  'dataset': 'STS12',
  'spearman': 0.8095812061620951},
 {'loss': 'cosent_ibn_angle',
  'dataset': 'STS13',
  'spearman': 0.8428907284097938},
 {'loss': 'cosent_ibn_angle',
  'dataset': 'STS14',
  'spearman': 0.8231900385888695},
 {'loss': 'cosent_ibn_angle',
  'dataset': 'STS15',
  'spearman': 0.888469043734321},
 {'loss': 'cosent_ibn_