In [2]:
! pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Import Libraries

In [1]:
import gc
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset, concatenate_datasets, Dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, TaskType
from scipy.stats import spearmanr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding, AdamW

# Datasets

In [2]:
def select_n_nli(nli_dataset, n):
    """Draw a class-balanced, shuffled subset of about ``n`` NLI examples.

    Args:
        nli_dataset: Dataset object exposing ``filter``/``shuffle``/``select``
            with a ``labels`` column in which 1 marks positive examples and
            0 marks negative examples.
        n: Requested total number of examples; half are drawn from each class.

    Returns:
        The shuffled concatenation of the sampled positive and negative rows.
        If either class holds fewer than ``n // 2`` rows, both classes are
        capped at the size of the smaller one so the result stays balanced
        instead of raising an out-of-range error in ``select``.
    """
    positive_samples = nli_dataset.filter(lambda example: example['labels'] == 1)
    negative_samples = nli_dataset.filter(lambda example: example['labels'] == 0)

    # Never request more rows than the scarcer class can provide.
    per_class = max(0, min(int(n) // 2, len(positive_samples), len(negative_samples)))

    # Fixed seeds keep the sampled subset identical across runs.
    positive_samples = positive_samples.shuffle(seed=42).select(range(per_class))
    negative_samples = negative_samples.shuffle(seed=42).select(range(per_class))
    balanced_dataset = concatenate_datasets([positive_samples, negative_samples]).shuffle(seed=42)
    return balanced_dataset

In [3]:
def get_nli_dataset(n=10000, type='pair', exclude_neutral=True):
    """Load the training split of ``sentence-transformers/all-nli``.

    Args:
        n: Number of examples to return, or ``None`` for the whole split.
            For ``'pair'``/``'clf'`` the subset is class-balanced via
            ``select_n_nli``; for ``'triplet'`` the first ``n`` rows are taken.
        type: ``'pair'`` or ``'clf'`` for (premise, hypothesis, labels) rows,
            ``'triplet'`` for (anchor, positive, negative) rows.
        exclude_neutral: For ``'pair'``/``'clf'``, drop neutral examples so
            only labels 0 and 1 remain.

    Returns:
        The selected training dataset. For ``'pair'``/``'clf'`` it carries an
        integer ``labels`` column encoded as 0 = contradiction,
        1 = entailment, 2 = neutral.

    Raises:
        ValueError: If ``type`` is not one of the supported values.
    """
    if type == 'pair' or type == 'clf':
        train_split = load_dataset('sentence-transformers/all-nli', 'pair-class')['train']

        # Target encoding: contradiction is the least similar (0) and
        # entailment the most similar (1); neutral gets 2 so it can be dropped.
        target_ids = {'contradiction': 0, 'entailment': 1, 'neutral': 2}

        # Prefer the class names declared on the column itself. Otherwise fall
        # back to the usual NLI encoding.
        # NOTE(review): the fallback order (0=entailment, 1=neutral,
        # 2=contradiction) is taken from the dataset card - confirm.
        source_names = getattr(train_split.features['label'], 'names', None)
        if not source_names or set(source_names) != set(target_ids):
            source_names = ['entailment', 'neutral', 'contradiction']
        label_mapping = {source_id: target_ids[name] for source_id, name in enumerate(source_names)}

        def map_labels(example):
            # Write the remapped integer into a fresh ``labels`` column; the
            # old ``label`` column is removed by ``map`` below.
            return {'labels': label_mapping[example['label']]}

        train_split = train_split.map(map_labels, remove_columns=['label'])
        if exclude_neutral:
            train_split = train_split.filter(lambda example: example['labels'] != target_ids['neutral'])
        return train_split if n is None else select_n_nli(train_split, n)
    elif type == 'triplet':
        train_split = load_dataset('sentence-transformers/all-nli', 'triplet')['train']
        if n is None:
            return train_split
        # Clamp so a request larger than the split does not raise.
        return train_split.select(range(min(n, len(train_split))))

    raise ValueError(f"Unsupported dataset type: {type!r} (expected 'pair', 'clf' or 'triplet')")

In [4]:
class NLIPairDataset(torch.utils.data.Dataset):
    """Map-style dataset over index-aligned premise / hypothesis / label sequences."""

    def __init__(self, premises, hypotheses, labels):
        # The three sequences are stored as given (no copy).
        self.premises, self.hypotheses, self.labels = premises, hypotheses, labels

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One example as a (premise, hypothesis, label) tuple.
        columns = (self.premises, self.hypotheses, self.labels)
        return tuple(column[idx] for column in columns)

In [5]:
class NLITripletDataset(torch.utils.data.Dataset):
    """Map-style dataset over index-aligned anchor / positive / negative sequences."""

    def __init__(self, anchor, positive, negative):
        # The three sequences are stored as given (no copy).
        self.anchor, self.positive, self.negative = anchor, positive, negative

    def __len__(self):
        # The anchor sequence defines the dataset size.
        return len(self.anchor)

    def __getitem__(self, idx):
        # One example as an (anchor, positive, negative) tuple.
        columns = (self.anchor, self.positive, self.negative)
        return tuple(column[idx] for column in columns)

In [37]:
def get_senteval_dataset(dataset_name):
    """Load one SentEval task and return its train and test splits merged.

    Args:
        dataset_name: Task suffix appended to ``rahulsikder223/SentEval-``.

    Returns:
        The train split followed by the test split, with the ``label`` column
        renamed to ``labels``.
    """
    splits = load_dataset(f'rahulsikder223/SentEval-{dataset_name}')
    merged = concatenate_datasets([splits['train'], splits['test']])
    return merged.rename_column("label", "labels")

# SentEval tasks iterated over during evaluation.
senteval_datasets = ['CR', 'MPQA', 'MR', 'SUBJ']

In [38]:
def get_sts_dataset(dataset_name):
    """Load the test split of an STS benchmark from the ``mteb`` hub namespace.

    Args:
        dataset_name: Benchmark name such as ``'STS12'``; ``'STS-B'`` and
            ``'SICK-R'`` are translated to their hub spellings first.

    Returns:
        The test split with the ``score`` column renamed to ``labels``.
    """
    # Benchmarks whose hub repository name differs from the display name.
    hub_aliases = {'STS-B': 'stsbenchmark', 'SICK-R': 'sickr'}
    hub_name = hub_aliases.get(dataset_name, dataset_name).lower()

    test_split = load_dataset(f'mteb/{hub_name}-sts', split='test')
    return test_split.rename_column('score', 'labels')

# STS benchmarks iterated over during evaluation.
sts_datasets = ['STS-B', 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'SICK-R']

# Model

In [8]:
# Read the Hugging Face access token from the HF_TOKEN environment variable so
# no credential is ever stored in the notebook. When the variable is unset,
# None is passed.
# NOTE(review): login(token=None) is expected to fall back to the interactive
# prompt - confirm against the installed huggingface_hub version.
login(token=os.environ.get("HF_TOKEN"))
model_id = 'meta-llama/Llama-3.2-1B'

# Loss Functions

## Classification Losses - Single sentence column and sentiment label

### Cross-Entropy Loss

In [9]:
def cross_entropy_loss(logits, labels):
    """Mean cross-entropy between ``logits`` and integer class ``labels``."""
    return F.cross_entropy(logits, labels)

### Label Smoothing Cross-Entropy Loss

In [10]:
def label_smoothing_cross_entropy_loss(logits, labels, smoothing=0.1):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other classes.

    Args:
        logits: Class scores, indexed as (example, class).
        labels: Integer class index per example.
        smoothing: Probability mass moved away from the true class.

    Returns:
        Scalar tensor: the mean over examples of the smoothed cross-entropy.
    """
    log_probs = F.log_softmax(logits, dim=-1)
    num_classes = log_probs.size(1)

    on_target = 1.0 - smoothing
    off_target = smoothing / (num_classes - 1)

    # Start with the off-target mass everywhere, then overwrite the true class.
    true_dist = log_probs.new_full(log_probs.shape, off_target)
    true_dist.scatter_(1, labels.unsqueeze(1), on_target)

    per_example = -(true_dist * log_probs).sum(dim=-1)
    return per_example.mean()

## Embedding Losses

### Triplets - Anchor, Positive and Negative sentences (3 sentence columns, no label)

#### Triplet Loss

In [11]:
def triplet_loss(anchors, positives, negatives, margin=1.0):
    """Margin-based triplet loss on Euclidean distances.

    Args:
        anchors, positives, negatives: Row-aligned embedding batches.
        margin: Amount by which the negative must be farther than the positive.

    Returns:
        Mean over the batch of ``max(d(a, p) - d(a, n) + margin, 0)``, or the
        plain float ``0.0`` when any of the three inputs is ``None``.
    """
    if any(batch is None for batch in (anchors, positives, negatives)):
        return 0.0

    d_pos = F.pairwise_distance(anchors, positives)
    d_neg = F.pairwise_distance(anchors, negatives)
    hinge = (d_pos - d_neg + margin).clamp(min=0.0)
    return hinge.mean()

#### Hard Triplet Loss

In [12]:
def compute_pairwise_distances(embeddings):
    """Euclidean distance between every pair of rows of ``embeddings``.

    Uses ``||x||^2 - 2 x.y + ||y||^2``. Negative values from rounding are
    clamped to zero, and a small constant is added before the square root.

    Returns:
        Square matrix whose entry (i, j) is the distance between rows i and j.
    """
    gram = embeddings @ embeddings.t()
    sq_norms = gram.diagonal()
    squared = sq_norms[None, :] - 2.0 * gram + sq_norms[:, None]
    return (squared.clamp(min=0.0) + 1e-16).sqrt()

In [13]:
def hard_triplet_loss(anchor, positive, negative, margin=1.0):
    """Triplet loss with in-batch hardest-negative mining.

    For every anchor the positive distance is the Euclidean distance to its
    own positive, and the negative distance is the distance to the *closest*
    negative found anywhere in the batch (the hardest one).

    The previous implementation took pairwise distances between the
    difference vectors ``anchor - positive`` / ``anchor - negative``. Its
    row-wise minimum always landed on the (near-zero) diagonal, so the
    negatives never influenced the loss.

    Args:
        anchor, positive, negative: Row-aligned embedding batches, indexed as
            (example, feature).
        margin: Required gap between the positive and the hardest negative
            distance.

    Returns:
        Scalar tensor: the mean over anchors of
        ``max(d(a_i, p_i) - min_j d(a_i, n_j) + margin, 0)``.
    """
    # The small constant keeps the square-root gradient finite at zero distance.
    eps = 1e-16

    # Distance from each anchor to its own positive.
    pos_dist = torch.sqrt(((anchor - positive) ** 2).sum(dim=1) + eps)

    # Distance from each anchor (rows) to every negative in the batch (columns).
    anchor_to_negative = anchor.unsqueeze(1) - negative.unsqueeze(0)
    neg_dist = torch.sqrt((anchor_to_negative ** 2).sum(dim=2) + eps)

    # Hardest negative: the one closest to the anchor.
    hardest_negative_dist, _ = neg_dist.min(dim=1)

    loss = torch.clamp(pos_dist - hardest_negative_dist + margin, min=0.0)
    return loss.mean()

### Pairs - 2 sentence columns and label

#### Cosine Similarity Mean Squared Error Loss

In [14]:
def cosine_similarity_mse_loss(embedding1, embedding2, labels):
    """Mean squared error between ``labels`` and the row-wise cosine similarity.

    Args:
        embedding1, embedding2: Row-aligned embedding batches.
        labels: Target similarity per row.

    Returns:
        Scalar tensor with the mean squared difference.
    """
    residual = labels - F.cosine_similarity(embedding1, embedding2)
    return residual.pow(2).mean()

#### CoSENT Loss

In [15]:
def cosent_loss(embedding1, embedding2, labels, tau=20.0):
    """CoSENT ranking loss over all ordered pairs of examples in the batch.

    Entry (i, j) takes part in the loss when ``labels[i] < labels[j]``; it
    contributes ``tau * (cos_i - cos_j)`` to a log-sum-exp that also contains
    a constant 0 term.

    Args:
        embedding1, embedding2: Row-aligned embedding batches.
        labels: Similarity label per row.
        tau: Scale applied to the cosine similarities.

    Returns:
        Scalar tensor with the loss.
    """
    # 1.0 where row i is labelled strictly lower than row j.
    should_rank = (labels.unsqueeze(1) < labels.unsqueeze(0)).float()

    # Cosine similarity as the dot product of unit vectors, scaled by tau.
    unit1 = F.normalize(embedding1, p=2, dim=1)
    unit2 = F.normalize(embedding2, p=2, dim=1)
    scaled_cos = (unit1 * unit2).sum(dim=1) * tau

    # Difference cos_i - cos_j for every (i, j); inactive entries are pushed
    # to a huge negative value so they vanish inside the log-sum-exp.
    pairwise_gap = scaled_cos.unsqueeze(1) - scaled_cos.unsqueeze(0)
    masked_gap = (pairwise_gap - (1 - should_rank) * 1e12).view(-1)

    # The leading zero supplies the "+1" inside log(1 + sum(exp(...))).
    zero = torch.zeros(1, device=masked_gap.device)
    return torch.logsumexp(torch.cat((zero, masked_gap), dim=0), dim=0)

#### In-Batch Negatives Loss

In [16]:
def categorical_crossentropy(y_true, y_pred):
    """Per-row cross-entropy between target weights ``y_true`` and logits ``y_pred``.

    Returns one value per row (no reduction over the batch).
    """
    log_probs = F.log_softmax(y_pred, dim=1)
    return (log_probs * y_true).sum(dim=1).neg()

def in_batch_negative_loss(embedding1, embedding2, labels, tau=20.0, negative_weights=0.0):
    """Contrastive loss that treats other rows in the batch as negatives.

    The two embedding batches are interleaved into one matrix (row 2k from
    ``embedding1``, row 2k+1 from ``embedding2``). All-pairs cosine
    similarities are scaled by ``tau`` and scored with
    ``categorical_crossentropy`` against a target matrix that marks, for each
    row of a pair with a non-zero label, the column of its partner row.

    Args:
        embedding1, embedding2: Row-aligned embedding batches.
            # assumes both are (batch, dim) - TODO confirm
        labels: Label per pair; pairs whose label is 0 get an all-zero target
            row. # assumes labels are 0/1 - TODO confirm
        tau: Scale applied to the similarity matrix.
        negative_weights: When > 0, this value is added to the similarity
            entries that link the two rows of a label-0 pair.

    Returns:
        Scalar tensor: the mean of the per-row cross-entropy values.
    """
    device = labels.device

    # Interleave the two batches: even rows <- embedding1, odd rows <- embedding2.
    # NOTE(review): the buffer is created with torch's default dtype; confirm
    # it matches the dtype of the embeddings.
    y_pred = torch.empty((2 * embedding1.shape[0], embedding1.shape[1]), device=device)
    y_pred[0::2] = embedding1
    y_pred[1::2] = embedding2
    # Each label is repeated for both rows of its pair and shaped as a column.
    y_true = labels.repeat_interleave(2).unsqueeze(1)

    def make_target_matrix(y_true):
        # Builds a square 0/1 matrix with a 1 at (i, j) when the masked column
        # index of j equals the masked partner index of i.
        idxs = torch.arange(0, y_pred.shape[0]).int().to(device)
        y_true = y_true.int()
        # idxs_1 is a view of idxs, so the in-place ops below also modify idxs;
        # idxs_2 is computed first from the unmodified values.
        idxs_1 = idxs[None, :]
        # Partner index: i + 1 for even i, i - 1 for odd i.
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]

        # Rows whose y_true is 0 get sentinel -2 here ...
        idxs_1 *= y_true.T
        idxs_1 += (y_true.T == 0).int() * -2

        # ... and sentinel -1 here, so the two sentinels can never be equal.
        idxs_2 *= y_true
        idxs_2 += (y_true == 0).int() * -1

        y_true = (idxs_1 == idxs_2).float()
        return y_true

    # Same construction applied to the inverted labels: marks partner
    # positions of the label-0 pairs.
    neg_mask = make_target_matrix(y_true == 0)

    y_true = make_target_matrix(y_true)

    # Cosine similarity between all rows; the diagonal (self-similarity) is
    # pushed to a huge negative value before scaling.
    y_pred = F.normalize(y_pred, dim=1, p=2)
    similarities = y_pred @ y_pred.T
    similarities = similarities - torch.eye(y_pred.shape[0]).to(device) * 1e12
    similarities = similarities * tau

    if negative_weights > 0:
        similarities += neg_mask * negative_weights

    return categorical_crossentropy(y_true, similarities).mean()

#### Angle Loss

In [17]:
def angle_loss(embedding1, embedding2, labels, tau=1.0):
    """Ranking loss on a complex-division score between two embeddings.

    Each embedding is split in half along the feature axis into a "real" and
    an "imaginary" part. The first embedding is divided by the second as
    complex numbers and the quotient is rescaled by the ratio of the two
    norms. The absolute value of the summed components, times ``tau``, is the
    per-row score, which is ranked exactly as in the CoSENT loss: entry (i, j)
    is active when ``labels[i] < labels[j]``.

    Args:
        embedding1, embedding2: Row-aligned embedding batches with an even
            number of features.
        labels: Similarity label per row.
        tau: Scale applied to the score.

    Returns:
        Scalar tensor with the loss.
    """
    # 1.0 where row i is labelled strictly lower than row j.
    order_mask = (labels.unsqueeze(1) < labels.unsqueeze(0)).float()

    # Real / imaginary halves of both embeddings.
    re1, im1 = torch.chunk(embedding1, 2, dim=1)
    re2, im2 = torch.chunk(embedding2, 2, dim=1)

    # Complex division (re1 + i*im1) / (re2 + i*im2), with the squared norm of
    # the divisor summed over the whole row.
    sq_norm2 = torch.sum(re2**2 + im2**2, dim=1, keepdim=True)
    quot_re = (re1 * re2 + im1 * im2) / sq_norm2
    quot_im = (im1 * re2 - re1 * im2) / sq_norm2

    # Rescale the quotient by the ratio of the two row norms.
    norm1 = torch.sum(re1**2 + im1**2, dim=1, keepdim=True)**0.5
    norm2 = torch.sum(re2**2 + im2**2, dim=1, keepdim=True)**0.5
    norm_ratio = norm1 / norm2
    quot_re = quot_re / norm_ratio
    quot_im = quot_im / norm_ratio

    score = torch.abs(torch.sum(torch.cat((quot_re, quot_im), dim=1), dim=1)) * tau

    # Pairwise score differences; inactive entries vanish in the log-sum-exp.
    gap = score.unsqueeze(1) - score.unsqueeze(0)
    gap = (gap - (1 - order_mask) * 1e12).view(-1)

    # The leading zero supplies the "+1" inside log(1 + sum(exp(...))).
    zero = torch.zeros(1, device=gap.device)
    return torch.logsumexp(torch.cat((zero, gap), dim=0), dim=0)

#### Combination of CoSENT, In-Batch Negatives and Angle Losses

In [18]:
def cosent_ibn_angle(embedding1, embedding2, labels, w_cosent=1, w_ibn=1, w_angle=1, tau_cosent=20.0, tau_ibn=20.0, tau_angle=1.0):
    """Weighted sum of the CoSENT, in-batch-negative and angle losses.

    Args:
        embedding1, embedding2: Row-aligned embedding batches.
        labels: Label per row, forwarded unchanged to all three losses.
        w_cosent, w_ibn, w_angle: Weight of each loss term.
        tau_cosent, tau_ibn, tau_angle: ``tau`` passed to each loss term.

    Returns:
        ``w_cosent * cosent + w_ibn * ibn + w_angle * angle``.
    """
    cosent_term = w_cosent * cosent_loss(embedding1, embedding2, labels, tau_cosent)
    ibn_term = w_ibn * in_batch_negative_loss(embedding1, embedding2, labels, tau_ibn)
    angle_term = w_angle * angle_loss(embedding1, embedding2, labels, tau_angle)
    return cosent_term + ibn_term + angle_term

## Loss List

In [31]:
# Experiment grid. Each entry gives the name of a loss function defined in
# this notebook, the data layout it consumes ('clf', 'pair' or 'triplet') and
# the keyword arguments forwarded to it.
losses = [
    # Classification losses
    {
        'loss_name': 'cross_entropy_loss',
        'loss_type': 'clf',
        'loss_kwargs': {},
    },
    {
        'loss_name': 'label_smoothing_cross_entropy_loss',
        'loss_type': 'clf',
        'loss_kwargs': {'smoothing': 0.1},
    },
    # Triplet losses
    {
        'loss_name': 'triplet_loss',
        'loss_type': 'triplet',
        'loss_kwargs': {'margin': 5},
    },
    {
        'loss_name': 'hard_triplet_loss',
        'loss_type': 'triplet',
        'loss_kwargs': {'margin': 5},
    },
    # Pair losses
    {
        'loss_name': 'cosine_similarity_mse_loss',
        'loss_type': 'pair',
        'loss_kwargs': {},
    },
    {
        'loss_name': 'cosent_loss',
        'loss_type': 'pair',
        'loss_kwargs': {'tau': 20.0},
    },
    {
        'loss_name': 'in_batch_negative_loss',
        'loss_type': 'pair',
        'loss_kwargs': {'tau': 20.0},
    },
    {
        'loss_name': 'angle_loss',
        'loss_type': 'pair',
        'loss_kwargs': {'tau': 1.0},
    },
    {
        'loss_name': 'cosent_ibn_angle',
        'loss_type': 'pair',
        'loss_kwargs': {
            'w_cosent': 1,
            'w_ibn': 1,
            'w_angle': 1,
            'tau_cosent': 20.0,
            'tau_ibn': 20.0,
            'tau_angle': 1.0,
        },
    },
]

# Training

### Training Preparation

#### Device Setting

In [20]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Model and Tokenizer Preparation

In [21]:
def get_model_tokenizer(model_id, type='clf', apply_lora=True, num_labels=2):
    """Load a tokenizer and model for ``model_id`` and optionally wrap it with LoRA.

    Args:
        model_id: Hugging Face model identifier.
        type: ``'clf'`` loads a sequence-classification model with
            ``num_labels`` outputs; any other value loads a causal LM that is
            used as an embedding backbone.
        apply_lora: When True, wrap the model with LoRA adapters and freeze
            every other weight.
        num_labels: Number of output classes for the ``'clf'`` model.

    Returns:
        ``(model, tokenizer)``; the model has been moved to the notebook-level
        ``device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token  # Set the EOS token as the padding token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    if type == 'clf':
        model = AutoModelForSequenceClassification.from_pretrained(model_id, device_map='auto', num_labels=num_labels)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_id)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    model.to(device)

    if apply_lora:
        # Only build the adapter configuration when it is actually used.
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS if type == 'clf' else TaskType.FEATURE_EXTRACTION,
            r=88,
            lora_alpha=16,
            lora_dropout=0.1
        )
        model = get_peft_model(model, lora_config)

        # Freeze everything except the LoRA adapters and the modules PEFT
        # exposes through ``modules_to_save``.
        # NOTE(review): for SEQ_CLS, PEFT is expected to register the freshly
        # initialised classification head under ``modules_to_save``; matching
        # on "lora" alone would freeze that head at its random
        # initialisation - confirm against the installed PEFT version.
        for name, param in model.named_parameters():
            param.requires_grad = ("lora" in name) or ("modules_to_save" in name)
        print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    return model, tokenizer

#### Embedding Extraction

In [22]:
def extract_embeddings(model, tokenizer, device, sentences, to_numpy=False):
    """Embed ``sentences`` by mean-pooling the model's last hidden layer.

    The mean is taken over real tokens only: positions the tokenizer marks as
    padding (attention mask 0) are excluded. A sentence's embedding therefore
    does not depend on how much padding its batch neighbours force onto it.

    Args:
        model: Model that accepts ``output_hidden_states=True`` and returns an
            object with a ``hidden_states`` sequence.
        tokenizer: Tokenizer called with ``padding=True, truncation=True`` and
            ``return_tensors='pt'``; its output must provide ``attention_mask``.
        device: Device the tokenized batch is moved to.
        sentences: Sequence of strings to embed.
        to_numpy: Return a detached NumPy array on the CPU instead of a tensor.

    Returns:
        One embedding per sentence, as a tensor (gradients kept) or, with
        ``to_numpy=True``, as a NumPy array.
    """
    encodings = tokenizer(list(sentences), return_tensors='pt', padding=True, truncation=True).to(device)
    last_hidden = model(output_hidden_states=True, return_dict=True, **encodings).hidden_states[-1]

    # Zero out padded positions, then divide by the number of real tokens.
    mask = encodings['attention_mask'].unsqueeze(-1).to(last_hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against an all-padding row
    embeddings = (last_hidden * mask).sum(dim=1) / token_counts

    if to_numpy:
        embeddings = embeddings.cpu().detach().numpy()
    return embeddings

#### Train Driver

In [23]:
def train(model, tokenizer, dataset, batch_size, loss_type='clf', epochs=10, loss_name='cross_entropy_loss', **loss_kwargs):
    """Fine-tune ``model`` on ``dataset`` with the loss function named ``loss_name``.

    Args:
        model: Model to train (modified in place and returned).
        tokenizer: Tokenizer matching ``model``.
        dataset: Column-addressable dataset. ``'clf'`` and ``'pair'`` read the
            ``premise``, ``hypothesis`` and ``labels`` columns; ``'triplet'``
            reads ``anchor``, ``positive`` and ``negative``.
        batch_size: Mini-batch size.
        loss_type: How a batch is turned into loss inputs:
            ``'clf'`` - premise and hypothesis are tokenized together and the
            model's logits are compared with the labels;
            ``'pair'`` - both texts are embedded separately and passed to the
            loss with the labels;
            ``'triplet'`` - anchor, positive and negative are embedded and
            passed to the loss.
        epochs: Number of passes over the dataset.
        loss_name: Name of a loss function defined at notebook level.
        **loss_kwargs: Extra keyword arguments forwarded to the loss function.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``loss_name`` does not name a callable or ``loss_type``
            is not supported.
    """
    # Resolve the loss once instead of on every batch.
    loss_fn = globals().get(loss_name)
    if not callable(loss_fn):
        raise ValueError(f"Unknown loss function: {loss_name!r}")

    # Build the dataset and loader once; materialising the columns on every
    # epoch is wasted work (the loader reshuffles by itself each epoch).
    if loss_type in ('clf', 'pair'):
        torch_dataset = NLIPairDataset(dataset['premise'], dataset['hypothesis'], dataset['labels'])
    elif loss_type == 'triplet':
        torch_dataset = NLITripletDataset(dataset['anchor'], dataset['positive'], dataset['negative'])
    else:
        raise ValueError(f"Unsupported loss_type: {loss_type!r} (expected 'clf', 'pair' or 'triplet')")
    data_loader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=True)

    # Optimise only the parameters that can receive gradients. torch's AdamW
    # replaces the deprecated transformers.AdamW; eps and weight_decay are set
    # explicitly to that class's former defaults.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0)

    def _batch_loss(batch):
        """Compute the loss for one (first, second, third) batch."""
        first, second, third = batch
        if loss_type == 'clf':
            inputs = tokenizer(list(first), list(second), padding=True, truncation=True, return_tensors="pt").to(device)
            return loss_fn(model(**inputs).logits, third.to(device), **loss_kwargs)

        # Pooled sentence embeddings (see extract_embeddings).
        first_embeddings = extract_embeddings(model, tokenizer, device, first)
        second_embeddings = extract_embeddings(model, tokenizer, device, second)
        if loss_type == 'pair':
            return loss_fn(first_embeddings, second_embeddings, third.to(device), **loss_kwargs)

        third_embeddings = extract_embeddings(model, tokenizer, device, third)
        return loss_fn(first_embeddings, second_embeddings, third_embeddings, **loss_kwargs)

    model.train()
    for epoch in range(epochs):
        for batch in tqdm(data_loader, desc="Training", leave=False):
            loss = _batch_loss(batch)

            # A loss of exactly zero (or a plain number returned by a loss
            # that had nothing to compute) gives no gradient: skip the step.
            if not torch.is_tensor(loss) or loss.item() == 0.0:
                continue

            # Backpropagation...
            loss.backward()

            # Updating weights...
            optimizer.step()
            optimizer.zero_grad()
    return model

#### Evaluation Drivers

In [24]:
def calculate_cosine_similarity(embeddings_1, embeddings_2):
    """Row-wise cosine similarity between two aligned embedding batches."""
    return F.cosine_similarity(embeddings_1, embeddings_2, dim=1)

In [25]:
def calculate_Spearman_rank_correlation_coefficient(scores, scores_actual):
    """Spearman rank correlation between predicted and reference scores.

    Only the correlation coefficient is returned; the p-value is discarded.
    """
    return spearmanr(scores, scores_actual)[0]

In [26]:
class STSDataset(torch.utils.data.Dataset):
    """Map-style dataset over index-aligned sentence pairs and their scores."""

    def __init__(self, sentence1, sentence2, label):
        # The three sequences are stored as given (no copy).
        self.sentence1, self.sentence2, self.label = sentence1, sentence2, label

    def __len__(self):
        # The score sequence defines the dataset size.
        return len(self.label)

    def __getitem__(self, idx):
        # One example as a (sentence1, sentence2, label) tuple.
        columns = (self.sentence1, self.sentence2, self.label)
        return tuple(column[idx] for column in columns)

def evaluate_sts(model, tokenizer, loss, batch_size):
    """Score ``model`` on every benchmark listed in ``sts_datasets``.

    For each benchmark both sentences of every pair are embedded, their
    cosine similarity is computed, and the Spearman rank correlation with the
    reference scores is recorded.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        tokenizer: Tokenizer matching ``model``.
        loss: Identifier stored in each result record under ``'loss'``.
        batch_size: Batch size used while extracting embeddings.

    Returns:
        One ``{'loss', 'dataset', 'spearman'}`` dict per benchmark.
    """
    model.eval()
    results = []

    for benchmark in sts_datasets:
        raw = get_sts_dataset(benchmark)
        pairs = STSDataset(raw['sentence1'], raw['sentence2'], raw['labels'])
        loader = DataLoader(pairs, batch_size=batch_size)

        left_chunks, right_chunks, score_chunks = [], [], []
        with torch.no_grad():
            for left, right, gold in tqdm(loader, desc="Extracting embeddings", leave=False):
                left_chunks.append(extract_embeddings(model, tokenizer, device, left).cpu())
                right_chunks.append(extract_embeddings(model, tokenizer, device, right).cpu())
                score_chunks.append(gold.cpu())

        # Stitch the per-batch pieces back together before scoring.
        predicted = calculate_cosine_similarity(torch.cat(left_chunks), torch.cat(right_chunks))
        reference = torch.cat(score_chunks).numpy()

        correlation = calculate_Spearman_rank_correlation_coefficient(predicted, reference)
        results.append({'loss': loss, 'dataset': benchmark, 'spearman': correlation})
    return results

In [27]:
class SentEvalDataset(torch.utils.data.Dataset):
    """Map-style dataset over index-aligned sentences and class labels."""

    def __init__(self, sentence, label):
        # Both sequences are stored as given (no copy).
        self.sentence, self.label = sentence, label

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.label)

    def __getitem__(self, idx):
        # One example as a (sentence, label) tuple.
        columns = (self.sentence, self.label)
        return tuple(column[idx] for column in columns)

def evaluate_senteval(model, tokenizer, loss, batch_size, test_size=0.3, random_state=42):
    """Probe ``model``'s sentence embeddings on every task in ``senteval_datasets``.

    For each task all sentences are embedded, the embeddings are split into a
    train and a test part, a logistic-regression classifier is fitted on the
    train part, and its accuracy on the test part is recorded.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        tokenizer: Tokenizer matching ``model``.
        loss: Identifier stored in each result record under ``'loss'``.
        batch_size: Batch size used while extracting embeddings.
        test_size: Fraction of the examples held out for testing.
        random_state: Seed for the train/test split. A fixed seed makes the
            reported accuracy reproducible and evaluates every model on the
            same held-out examples; pass ``None`` for a random split.

    Returns:
        One ``{'loss', 'dataset', 'accuracy'}`` dict per task.
    """
    model.eval()
    accuracy_list = []

    for dataset_name in senteval_datasets:
        dataset = get_senteval_dataset(dataset_name)
        dataset = SentEvalDataset(dataset['sentence'], dataset['labels'])
        data_loader = DataLoader(dataset, batch_size=batch_size)
        all_embeddings = []
        all_labels = []

        with torch.no_grad():
            for sentences, labels in tqdm(data_loader, desc="Extracting embeddings", leave=False):
                embeddings = extract_embeddings(model, tokenizer, device, sentences)
                all_embeddings.append(embeddings.cpu())
                all_labels.append(labels.cpu())

        data_embeddings_np = torch.cat(all_embeddings).numpy()
        data_labels_np = torch.cat(all_labels).numpy()

        # Seeded split: without a seed every call would evaluate on a
        # different random subset.
        train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(
            data_embeddings_np, data_labels_np, test_size=test_size, random_state=random_state
        )

        # Training a Logistic Regression classifier on the training embeddings...
        lr_clf = LogisticRegression(max_iter=10000)
        lr_clf.fit(train_embeddings, train_labels)

        # Predicting the labels for the test set...
        test_predictions = lr_clf.predict(test_embeddings)
        accuracy = accuracy_score(test_labels, test_predictions)
        accuracy_list.append({'loss': loss, 'dataset': dataset_name, 'accuracy': accuracy})
    return accuracy_list

### Loop

In [36]:
# Experiment driver: for every loss, fine-tune a fresh model `total_runs`
# times and evaluate each trained model on the STS and SentEval benchmarks.
total_runs = 3
batch_size = 10
ds_length = None  # None -> train on the full NLI training split
senteval_results_list = []
sts_results_list = []
for loss in losses:
    loss_name = loss['loss_name']
    loss_type = loss['loss_type']
    loss_kwargs = loss['loss_kwargs']

    # Dataset Preparation...
    # The dataset depends only on the loss type (its sampling uses fixed
    # seeds), so it is loaded once per loss rather than once per run.
    dataset = get_nli_dataset(n=ds_length, type=loss_type)

    for loop_count in range(0, total_runs):
        # Model Preparation...
        model, tokenizer = get_model_tokenizer(model_id, loss_type)

        # Training Loop...
        model = train(model, tokenizer, dataset, batch_size, loss_type, epochs=10, loss_name=loss_name, **loss_kwargs)

        # Evaluation loop...
        sts_results = evaluate_sts(model, tokenizer, loss_name, batch_size)
        senteval_results = evaluate_senteval(model, tokenizer, loss_name, batch_size)

        # Record the results of EVERY run; appending after the run loop would
        # keep only the last run and discard the others.
        senteval_results_list.append(senteval_results)
        sts_results_list.append(sts_results)

        # Release the model before the next run loads a fresh one.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()