In [None]:
! pip install datasets ipywidgets transformers accelerate -U

## Imports

In [1]:
from typing import Dict, List, Optional, Union, Any, Tuple
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForCausalLM
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
from scipy.stats import spearmanr
import numpy as np
from sklearn.model_selection import train_test_split
import gzip
import csv
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [2]:
torch.cuda.is_available()

True

## Tokenizer

The Tokenizer method, for now the maximum length is fixed to 512...

In [3]:
class CustomDataTokenizer:
    def __init__(self, tokenizer, is_classification=True, max_length = 512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_classification = is_classification

    def __call__(self, data: Dict) -> Dict:
        # For Classification Task: Only Text1 is required...
        text_columns = ['text1']
        if not self.is_classification:
            text_columns.append('text2')

        tokens_list = []
        for text_column in text_columns:
            # Tokenization happens here to get in the form which is accepted in the Objective Function...
            tokens_list.append(self.tokenizer(data[text_column], max_length=self.max_length, truncation=True))

        token = {}
        seperate_ids = []
        for i, t in enumerate(tokens_list):
            for key, val in t.items():
                if i == 0:
                    token[key] = val
                else:
                    token[key] += val
                if key == 'input_ids':
                    seperate_ids += [i] * len(val)

        token['labels'] = [int(data['label']) if 'label' in data else -1]
        token['seperate_ids'] = seperate_ids

        return token

## Losses

The loss functions:

y_true and y_pred must be zigzag style, such as [x[0][0], x[0][1], x[1][0], x[1][1], ...], where (x[0][0], x[0][1]) stands for a pair.

In [5]:
def categorical_crossentropy(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:
    return -(F.log_softmax(y_pred, dim=1) * y_true).sum(dim=1)

def cosine_loss(y_true: torch.Tensor, y_pred: torch.Tensor, tau: float=20.0) -> torch.Tensor:
    y_true = y_true[::2, 0]
    y_true = (y_true[:, None] < y_true[None, :]).float()
    y_pred = F.normalize(y_pred, p=2, dim=1)
    y_pred = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * tau
    y_pred = y_pred[:, None] - y_pred[None, :]
    y_pred = (y_pred - (1 - y_true) * 1e12).view(-1)
    zero = torch.Tensor([0]).to(y_pred.device)
    y_pred = torch.concat((zero, y_pred), dim=0)
    return torch.logsumexp(y_pred, dim=0)

def angle_loss(y_true: torch.Tensor, y_pred: torch.Tensor, tau: float=1.0):
    y_true = y_true[::2, 0]
    y_true = (y_true[:, None] < y_true[None, :]).float()

    y_pred_re, y_pred_im = torch.chunk(y_pred, 2, dim=1)
    a = y_pred_re[::2]
    b = y_pred_im[::2]
    c = y_pred_re[1::2]
    d = y_pred_im[1::2]

    z = torch.sum(c**2 + d**2, dim=1, keepdim=True)
    re = (a * c + b * d) / z
    im = (b * c - a * d) / z

    dz = torch.sum(a**2 + b**2, dim=1, keepdim=True)**0.5
    dw = torch.sum(c**2 + d**2, dim=1, keepdim=True)**0.5
    re /= (dz / dw)
    im /= (dz / dw)

    y_pred = torch.concat((re, im), dim=1)
    y_pred = torch.abs(torch.sum(y_pred, dim=1)) * tau
    y_pred = y_pred[:, None] - y_pred[None, :]
    y_pred = (y_pred - (1 - y_true) * 1e12).view(-1)
    zero = torch.Tensor([0]).to(y_pred.device)
    y_pred = torch.concat((zero, y_pred), dim=0)
    return torch.logsumexp(y_pred, dim=0)

def in_batch_negative_loss(y_true: torch.Tensor,
                           y_pred: torch.Tensor,
                           tau: float=20.0,
                           negative_weights: float=0.0) -> torch.Tensor:
    device = y_true.device

    def make_target_matrix(y_true: torch.Tensor):
        idxs = torch.arange(0, y_pred.shape[0]).int().to(device)
        y_true = y_true.int()
        idxs_1 = idxs[None, :]
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]

        idxs_1 *= y_true.T
        idxs_1 += (y_true.T == 0).int() * -2

        idxs_2 *= y_true
        idxs_2 += (y_true == 0).int() * -1

        y_true = (idxs_1 == idxs_2).float()
        return y_true

    neg_mask = make_target_matrix(y_true == 0)

    y_true = make_target_matrix(y_true)

    y_pred = F.normalize(y_pred, dim=1, p=2)
    similarities = y_pred @ y_pred.T
    similarities = similarities - torch.eye(y_pred.shape[0]).to(device) * 1e12
    similarities = similarities * tau

    if negative_weights > 0:
        similarities += neg_mask * negative_weights

    return categorical_crossentropy(y_true, similarities).mean()

def contrastive_pairwise_comb_loss(y_true: torch.Tensor, y_pred: torch.Tensor, tau: float=1.0, margin=1) -> torch.Tensor:
    y_true_pairs = y_true[::2, 0]
    y_true_pairs = (y_true_pairs[:, None] < y_true_pairs[None, :]).float()
    y_pred = F.normalize(y_pred, p=2, dim=1)

    cosine_sim = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * tau

    # Contrastive Loss...
    pos_pairs = y_true_pairs * torch.pow(1 - cosine_sim, 2)
    neg_pairs = (1 - y_true_pairs) * torch.pow(torch.clamp(cosine_sim - margin, min=0.0), 2)
    contrastive_loss = torch.mean(pos_pairs + neg_pairs)

    # Pairwise Ranking Loss...
    y_pred_diff = cosine_sim[:, None] - cosine_sim[None, :]
    y_true_diff = y_true_pairs[:, None] - y_true_pairs[None, :]
    mask = (y_true_diff > 0).float()
    pairwise_ranking_loss = torch.mean(mask * F.relu(1 - y_pred_diff))

    combined_loss_value = contrastive_loss + pairwise_ranking_loss

    return combined_loss_value

### Combined Loss Function of Contrastive Loss and Pairwise Ranking Loss

The combined loss function aims to improve the performance of sentence embeddings by incorporating contrastive loss, pairwise ranking loss, and ensuring the embeddings are normalized. Here's a detailed explanation of each step in the loss function:

#### 1. Input Preparation

```python
y_true_pairs = y_true[::2, 0]
y_true_pairs = (y_true_pairs[:, None] < y_true_pairs[None, :]).float()
```

- **Purpose**: This step prepares the ground truth labels (`y_true`) for use in the loss calculations.
- **Details**:
  - The `y_true` tensor is expected to be in a zigzag style, where pairs are alternately positioned.
  - The `::2` indexing selects every other element, assuming pairs are positioned adjacently.
  - The `<` operator creates a matrix where each element indicates if the row's pair score is less than the column's pair score. This matrix helps in identifying the positive and negative pairs for ranking.

#### 2. Embedding Normalization

```python
y_pred = F.normalize(y_pred, p=2, dim=1)
```

- **Purpose**: Normalize the embeddings to ensure stable and meaningful cosine similarity calculations.
- **Details**:
  - Normalization ensures that the embeddings lie on the unit sphere, making the cosine similarity a reliable measure of similarity.
  - `p=2` specifies L2 normalization, and `dim=1` normalizes along the feature dimension.

#### 3. Cosine Similarity Calculation

```python
cosine_sim = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * tau
```

- **Purpose**: Calculate the cosine similarity between paired embeddings.
- **Details**:
  - `y_pred[::2]` and `y_pred[1::2]` select alternating embeddings, assuming they are paired.
  - The dot product of these pairs gives the cosine similarity, scaled by a factor of `tau` to control the sharpness of similarity scores.

#### 4. Contrastive Loss

```python
pos_pairs = y_true_pairs * torch.pow(1 - cosine_sim, 2)
neg_pairs = (1 - y_true_pairs) * torch.pow(torch.clamp(cosine_sim - margin, min=0.0), 2)
contrastive_loss = torch.mean(pos_pairs + neg_pairs)
```

- **Purpose**: Ensure that similar pairs are close and dissimilar pairs are separated by at least the margin.
- **Details**:
  - `pos_pairs` calculates the penalty for similar pairs based on how close their cosine similarity is to 1.
  - `neg_pairs` calculates the penalty for dissimilar pairs, ensuring their similarity is less than the margin.
  - `torch.clamp` ensures the penalty is only applied when the similarity is within the margin.
  - The mean of these penalties gives the overall contrastive loss.

#### 5. Pairwise Ranking Loss

```python
y_pred_diff = cosine_sim[:, None] - cosine_sim[None, :]
y_true_diff = y_true_pairs[:, None] - y_true_pairs[None, :]
mask = (y_true_diff > 0).float()
pairwise_ranking_loss = torch.mean(mask * F.relu(1 - y_pred_diff))
```

- **Purpose**: Directly optimize the rank order of pairs to improve the Spearman's rank correlation coefficient.
- **Details**:
  - `y_pred_diff` calculates the difference in similarity scores between all pairs.
  - `y_true_diff` calculates the difference in true scores to identify positive differences.
  - `mask` ensures that only positive differences (where the row's score is less than the column's score) are considered.
  - `F.relu(1 - y_pred_diff)` penalizes cases where the rank order is incorrect, ensuring a margin of 1.
  - The mean of these penalties gives the overall pairwise ranking loss.

#### 6. Combined Loss

```python
combined_loss_value = contrastive_loss + pairwise_ranking_loss
```

- **Purpose**: Combine the contrastive and pairwise ranking losses to create a robust training signal.
- **Details**:
  - The final loss value is a sum of the contrastive loss and pairwise ranking loss, balancing between separating embeddings and ensuring correct rank order.

### Comparison with SimCSE

SimCSE (Simple Contrastive Learning of Sentence Embeddings) focuses on using contrastive loss to improve sentence embeddings. Here are some key differences and advantages of the enhanced combined loss function:

1. **Pairwise Ranking Optimization**:
   - **Problem in SimCSE**: SimCSE uses a contrastive loss that encourages similar pairs to be close and dissimilar pairs to be far apart. However, it doesn't directly optimize for the rank order of pairs, which is crucial for tasks like STS where rank correlation is important.
   - **Solution in Enhanced Loss**: The pairwise ranking loss directly optimizes the rank order, improving the Spearman's rank correlation coefficient.

2. **Temperature Scaling (Tau)**:
   - **Problem in SimCSE**: The scaling of similarity scores might not be explicitly controlled, which can affect the sharpness and stability of similarity scores.
   - **Solution in Enhanced Loss**: The use of tau as a scaling factor ensures that the similarity scores are appropriately sharpened, aiding in better differentiation.

3. **Combined Loss Components**:
   - **Problem in SimCSE**: Relying solely on contrastive loss might not capture all the nuances required for rank-based evaluation metrics.
   - **Solution in Enhanced Loss**: Combining contrastive loss with pairwise ranking loss ensures both good separation of embeddings and correct rank order, providing a more robust learning signal.

This enhanced loss function addresses the limitations of SimCSE by integrating additional components that directly optimize the evaluation metric of interest, leading to potentially better performance in tasks requiring high rank correlation.

Combining the loss functions with weights

In [11]:
class TotalLoss:
    def __init__(self,
                w1: float = 1.0,
                w2: float = 1.0,
                w3: float = 1.0,
                w4: float = 1.0,
                cosine_tau: float = 20.0,
                ibn_tau: float = 20,
                angle_tau: float = 1.0,
                con_tau: float = 1.0):
        self.w1 = w1
        self.w2 = w2
        self.w3 = w3
        self.w4 = w4
        self.cosine_tau = cosine_tau
        self.ibn_tau = ibn_tau
        self.angle_tau = angle_tau
        self.con_tau = con_tau

    def __call__(self, labels: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        loss = 0.
        if self.w1 > 0:
            loss += self.w1 * cosine_loss(labels, outputs, self.cosine_tau)
        if self.w2 > 0:
            loss += self.w2 * in_batch_negative_loss(labels, outputs, self.ibn_tau)
        if self.w3 > 0:
            loss += self.w3 * angle_loss(labels, outputs, self.angle_tau)
        if self.w4 > 0:
            loss += self.w4 * contrastive_pairwise_comb_loss(labels, outputs, self.con_tau)
        return loss

## Pooler

The different Pooling methods, using CLS for now, and Padding Strategy 'Left' for now

In [8]:
class Pooler:
    def __init__(self,
                model,
                # ['cls', 'cls_avg', 'last', 'avg', 'max', 'all', 'specific token index']
                pooling_strategy: Optional[Union[int, str]] = 'cls',
                padding_strategy: Optional[str] = 'left'):
        self.model = model
        self.pooling_strategy = pooling_strategy
        self.padding_strategy = padding_strategy

    def __call__(self, inputs) -> Any:
        if self.pooling_strategy == 'last':
            batch_size = inputs['input_ids'].shape[0]
            if self.padding_strategy == 'left':
                sequence_lengths = -1
            else:
                sequence_lengths = inputs["attention_mask"].sum(dim=1) - 1

        outputs = self.model(**inputs).last_hidden_state
        if self.pooling_strategy == 'cls':
            outputs = outputs[:, 0]
        elif self.pooling_strategy == 'cls_avg':
            outputs = (outputs[:, 0] + torch.mean(outputs, dim=1)) / 2.0
        elif self.pooling_strategy == 'last':
            outputs = outputs[torch.arange(batch_size, device=outputs.device), sequence_lengths]
        elif self.pooling_strategy == 'avg':
            outputs = torch.sum(
                outputs * inputs["attention_mask"][:, :, None], dim=1) / torch.sum(inputs["attention_mask"])
        elif self.pooling_strategy == 'max':
            outputs, _ = torch.max(outputs * inputs["attention_mask"][:, :, None], dim=1)
        elif self.pooling_strategy == 'all':
            return outputs
        elif isinstance(self.pooling_strategy, int) or self.pooling_strategy.isnumeric():
            return outputs[:, int(self.pooling_strategy)]
        return outputs

## Trainer

The custom trainer method which extends the Trainer method of Transformers

In [9]:
class CustomTrainer(Trainer):
    def __init__(self, pooler: Pooler, loss_kwargs: Optional[Dict] = None, **kwargs):
        super().__init__(**kwargs)
        self.pooler = pooler
        if loss_kwargs is None:
            loss_kwargs = {}
        self.loss_fct = TotalLoss(**loss_kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels", None)
        outputs = self.pooler(inputs)
        loss = self.loss_fct(labels, outputs)
        return (loss, outputs) if return_outputs else loss

## Data Collator

The custom data collator which works with the trainer

In [10]:
class CustomDataCollator:
    tokenizer = None
    padding = 'longest'
    max_length: Optional[int] = 512
    return_tensors: str = "pt"

    def __init__(self, tokenizer_base):
        self.tokenizer = tokenizer_base

    def __call__(self, features: List[Dict], return_tensors: str = "pt") -> Dict[str, torch.Tensor]:
        if return_tensors is None:
            return_tensors = self.return_tensors
        has_token_type_ids = "token_type_ids" in features[0]

        new_features = []
        for feature in features:
            seperate_ids = feature['seperate_ids']
            input_ids = feature['input_ids']
            attention_mask = feature['attention_mask']
            assert len(seperate_ids) == len(input_ids) == len(attention_mask)

            has_token_type_ids = False
            if "token_type_ids" in feature:
                has_token_type_ids = True
                token_type_ids = feature['token_type_ids']
                assert len(token_type_ids) == len(input_ids)

            max_seperate_id = max(seperate_ids)
            prev_start_idx = 0
            for seperate_id in range(1, max_seperate_id + 1):
                start_idx = seperate_ids.index(seperate_id)

                new_feature = {}
                new_feature['input_ids'] = input_ids[prev_start_idx:start_idx]
                new_feature['attention_mask'] = attention_mask[prev_start_idx:start_idx]
                if has_token_type_ids:
                    new_feature['token_type_ids'] = token_type_ids[prev_start_idx:start_idx]
                new_feature['labels'] = feature['labels']
                new_features.append(new_feature)
                prev_start_idx = start_idx

            new_feature = {}
            new_feature['input_ids'] = input_ids[prev_start_idx:]
            new_feature['attention_mask'] = attention_mask[prev_start_idx:]
            if has_token_type_ids:
                new_feature['token_type_ids'] = token_type_ids[prev_start_idx:]
            new_feature['labels'] = feature['labels']
            new_features.append(new_feature)

        del features
        features = self.tokenizer.pad(
            {'input_ids': [feature['input_ids'] for feature in new_features]},
            padding=self.padding,
            max_length=self.max_length,
            return_tensors=return_tensors,
        )
        features['attention_mask'] = self.tokenizer.pad(
            {'input_ids': [feature['attention_mask'] for feature in new_features]},
            padding=self.padding,
            max_length=self.max_length,
            return_tensors=return_tensors,
        )['input_ids']
        if has_token_type_ids:
            features['token_type_ids'] = self.tokenizer.pad(
                {'input_ids': [feature['token_type_ids'] for feature in new_features]},
                padding=self.padding,
                max_length=self.max_length,
                return_tensors=return_tensors,
            )['input_ids']
        features['labels'] = torch.Tensor([feature['labels'] for feature in new_features])

        return features

## Fit

The fit method which starts the training process, for now a lot of arguments have provided with default value...

In [12]:
def fit(train_ds,
        model_base,
        tokenizer_base,
        batch_size: int = 32,
        output_dir: Optional[str] = 'chk/new_c',
        epochs: int = 3,
        learning_rate: float = 1e-5,
        warmup_steps: int = 1000,
        logging_steps: int = 10,
        eval_steps: Optional[int] = None,
        save_steps: int = 100,
        save_strategy: str = 'steps',
        save_total_limit: int = 10,
        gradient_accumulation_steps: int = 1,
        fp16: Optional[bool] = None,
        argument_kwargs: Optional[Dict] = None,
        trainer_kwargs: Optional[Dict] = None,
        loss_kwargs: Optional[Dict] = None):

    if argument_kwargs is None:
        argument_kwargs = {}
    if trainer_kwargs is None:
        trainer_kwargs = {}
    callbacks = None

    pooler = Pooler(model_base)

    trainer = CustomTrainer(
        pooler=pooler,
        model=model_base,
        train_dataset=train_ds,
        loss_kwargs=loss_kwargs,
        tokenizer=tokenizer_base,
        args=TrainingArguments(
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=warmup_steps,
            num_train_epochs=epochs,
            learning_rate=learning_rate,
            fp16=fp16,
            logging_steps=logging_steps,
            save_strategy=save_strategy,
            eval_steps=eval_steps,
            save_steps=save_steps,
            output_dir=output_dir,
            save_total_limit=save_total_limit,
            load_best_model_at_end=False,
            ddp_find_unused_parameters=None,
            label_names=['labels', 'seperate_ids', 'extra'],
            **argument_kwargs,
        ),
        callbacks=callbacks,
        data_collator=CustomDataCollator(
            tokenizer_base
        ),
        **trainer_kwargs
    )

    trainer.train()
    return model_base, tokenizer_base, pooler

## Embeddings

The encode method to generate embeddings

In [13]:
def encode(inputs: Union[List[str], Tuple[str], List[Dict], str],
            model,
            pooler,
            tokenizer,
            max_length: Optional[int] = 512,
            to_numpy: bool = True,
            device: Optional[Any] = 'cuda:0'):
        if device is None:
            device = 'cpu'
        model.to(device)
        model.eval()

        tokens = tokenizer(
            inputs,
            padding='longest',
            max_length=max_length,
            truncation=True,
            return_tensors='pt')
        tokens.to(device)
        with torch.no_grad():
            output = pooler(tokens)
        if to_numpy:
            return output.float().detach().cpu().numpy()
        return output

# Execution

## Data Import

SentEval Datasets Import

In [14]:
def get_senteval_binary_data(dataset):
    match dataset:
        case 'CR':
            pos_file = open("../Data/custrev.pos", "r")
            neg_file = open("../Data/custrev.neg", "r")
        case 'MPQA':
            pos_file = open("../Data/mpqa.pos", "r")
            neg_file = open("../Data/mpqa.neg", "r")
        case 'MR':
            pos_file = open("../Data/rt-polarity.pos", "r")
            neg_file = open("../Data/rt-polarity.neg", "r")
        case 'SUBJ':
            pos_file = open("../Data/subj.objective", "r")
            neg_file = open("../Data/subj.subjective", "r")

    df_pos = pd.DataFrame()
    pos_content = pos_file.readlines()
    pos_file.close()
    df_pos['sentence'] = pos_content
    labels = np.ones(len(pos_content))
    df_pos['label'] = labels.astype('int')

    df_neg = pd.DataFrame()
    neg_content = neg_file.readlines()
    neg_file.close()
    df_neg['sentence'] = neg_content
    labels = np.zeros(len(neg_content))
    df_neg['label'] = labels.astype('int')

    df = pd.concat([df_pos, df_neg], axis=0, ignore_index=True)
    df['sentence'] = df['sentence'].str.replace('\n', '')
    return df

In [17]:
# SST -> We use the Binary classification...
def get_sst_data():
    train_file = open("../Data/sentiment-train", "r")
    train = train_file.readlines()
    train_file.close()

    sentence_train = []
    label_train = []
    for sentence in train:
        sentence = sentence.strip()
        label = int(sentence[len(sentence) - 1])
        sentence = sentence[:-1].strip()
        sentence_train.append(sentence)
        label_train.append(label)

    df_train = pd.DataFrame({'sentence': sentence_train, 'label': label_train})

    test_file = open("../Data/sentiment-test", "r")
    test = test_file.readlines()
    test_file.close()

    sentence_test = []
    label_test = []
    for sentence in test:
        sentence = sentence.strip()
        label = int(sentence[len(sentence) - 1])
        sentence = sentence[:-1].strip()
        sentence_test.append(sentence)
        label_test.append(label)

    df_test = pd.DataFrame({'sentence': sentence_test, 'label': label_test})
    df = pd.concat([df_train, df_test], axis=0, ignore_index=True)
    return df

In [18]:
def get_senteval_dataset(name):
    match name:
        case 'CR':
            return get_senteval_binary_data('CR')
        case 'MPQA':
            return get_senteval_binary_data('MPQA')
        case 'MR':
            return get_senteval_binary_data('MR')
        case 'SST':
            return get_sst_data()
        case 'SUBJ':
            return get_senteval_binary_data('SUBJ')

In [95]:
senteval_datasets = ['CR']#['CR', 'MPQA', 'MR', 'SUBJ']#, 'SST']

STS Datasets Import

In [20]:
def get_sts_dataset(dataset_name):
    match dataset_name:
        case 'STS-B':
            dataset = load_dataset('mteb/stsbenchmark-sts', split='test')
        case 'STS12':
            dataset = load_dataset('mteb/sts12-sts', split='test')
        case 'STS13':
            dataset = load_dataset('mteb/sts13-sts', split='test')
        case 'STS14':
            dataset = load_dataset('mteb/sts14-sts', split='test')
        case 'STS15':
            dataset = load_dataset('mteb/sts15-sts', split='test')
        case 'STS16':
            dataset = load_dataset('mteb/sts16-sts', split='test')
        case 'SICK-R':
            dataset = load_dataset('mteb/sickr-sts', split='test')
    return dataset

In [88]:
sts_datasets = ['STS12']#['STS-B', 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'SICK-R']

## Objective Function Combinations

Using the 3 provided objective functions and their 7 possible combinations for now

In [59]:
def get_objective_function_weights(combi):
    match combi:
        case 'Cosine': return (1, 0, 0, 0)
        case 'In-Batch Negatives': return (0, 1, 0, 0)
        case 'Angle': return (0, 0, 1, 0)
        case 'Contrastive-Pairwise': return (0, 0, 0, 1)
        case 'Cosine + In-Batch Negatives': return (1, 1, 0, 0)
        case 'Cosine + Angle': return (1, 0, 1, 0)
        case 'Cosine + Contrastive-Pairwise': return (1, 0, 0, 1)
        case 'In-Batch Negatives + Angle': return (0, 1, 1, 0)
        case 'In-Batch Negatives + Contrastive-Pairwise': return (0, 1, 0, 1)
        case 'Angle + Contrastive-Pairwise': return (0, 0, 1, 1)
        case 'Cosine + In-Batch Negatives + Angle': return (1, 1, 1, 0)
        case 'Cosine + In-Batch Negatives + Contrastive-Pairwise': return (1, 1, 0, 1)
        case 'Cosine + Angle + Contrastive-Pairwise': return (1, 0, 1, 1)
        case 'In-Batch Negatives + Angle + Contrastive-Pairwise': return (0, 1, 1, 1)
        case 'Cosine + In-Batch Negatives + Angle + Contrastive-Pairwise': return (1, 1, 1, 1)

In [86]:
objective_functions = [
    # 'Cosine',
    # 'In-Batch Negatives',
    # 'Angle',
    # 'Contrastive-Pairwise',
    # 'Cosine + In-Batch Negatives',
    # 'Cosine + Angle',
    # 'Cosine + Contrastive-Pairwise',
    # 'In-Batch Negatives + Angle',
    # 'In-Batch Negatives + Contrastive-Pairwise',
    # 'Angle + Contrastive-Pairwise',
    # 'Cosine + In-Batch Negatives + Angle',
    # 'Cosine + In-Batch Negatives + Contrastive-Pairwise',
    # 'Cosine + Angle + Contrastive-Pairwise',
    # 'In-Batch Negatives + Angle + Contrastive-Pairwise',
    'Cosine + In-Batch Negatives + Angle + Contrastive-Pairwise'
]

## Language Models

Base Model Selection

In [91]:
models = [
    # 'bert-base-uncased',
    # 'bert-base-cased',
    # 'bert-large-uncased',
    # 'bert-large-cased',
    # 'FacebookAI/roberta-base',
    # 'sentence-transformers/all-mpnet-base-v2',
    'princeton-nlp/sup-simcse-roberta-large',
    # 'xuanye/cosent-similarity-text2vec',
    # 'kornwtp/sup-consert-large'
]

## Driver Functions

### SentEval

In [75]:
def driver_senteval():
    results_matrix = []

    for model in models:
        results_obj_matrix = []
        for objective in objective_functions:
            # Objective Functions Preparation...
            w1_combi, w2_combi, w3_combi, w4_combi = get_objective_function_weights(objective)
            results_obj_ds_matrix = []

            for dataset in senteval_datasets:
                # Model Preparation...
                tokenizer_base = AutoTokenizer.from_pretrained(model)
                model_base = AutoModel.from_pretrained(model)

                # Dataset Preparation...
                df = get_senteval_dataset(dataset)
                ds = Dataset.from_pandas(df)
                ds = ds.rename_column('sentence', 'text1')
                if dataset == 'MR':
                    ds = concatenate_datasets([ds.select(range(0, 231)), ds.select(range(233, 7463))])

                split_ds = ds.train_test_split(test_size=0.3, seed=42)
                ds_train = split_ds['train']
                ds_test = split_ds['test']

                # Tokenization...
                train_ds = ds_train.shuffle().map(CustomDataTokenizer(tokenizer_base), num_proc=8)

                # Model Training...
                model_new, tokenizer_new, pooler_new = fit(
                    train_ds=train_ds,
                    model_base=model_base,
                    tokenizer_base=tokenizer_base,
                    output_dir='chk/c',
                    batch_size=32,
                    epochs=5,
                    learning_rate=2e-5,
                    save_steps=0,
                    eval_steps=100,
                    warmup_steps=0,
                    gradient_accumulation_steps=1,
                    loss_kwargs={
                        'w1': w1_combi,
                        'w2': w2_combi,
                        'w3': w3_combi,
                        'w4': w4_combi,
                        'cosine_tau': 20,
                        'ibn_tau': 20,
                        'angle_tau': 1.0,
                        'con_tau': 1.2
                    },
                    fp16=True,
                    logging_steps=1000
                )

                # Embedding Generation for Train and Test sets... Doing line-by-line embeddings for now...
                emb_train = []
                for sentence in ds_train['text1']:
                    emb_train.append(encode(sentence, model_new, pooler_new, tokenizer_new)[0])

                emb_test = []
                for sentence in ds_test['text1']:
                    emb_test.append(encode(sentence, model_new, pooler_new, tokenizer_new)[0])

                # Conversion into Numpy Array...
                emb_train = np.array(emb_train)
                emb_test = np.array(emb_test)

                # Classification...
                lr = LogisticRegression(max_iter=10000)
                lr.fit(emb_train, ds_train['label'])
                accuracy_score = lr.score(emb_test, ds_test['label'])
                
                base_dir = '../Files/STS_Results/'
                file_name = 'STS_' + dataset + '_' + model[0].replace('/', '-') + '.npy'
                with open(base_dir + file_name, 'wb') as f:
                    np.save(f, accuracy_score)
                
                results_obj_ds_matrix.append(accuracy_score)
            results_obj_matrix.append(results_obj_ds_matrix)
        results_matrix.append(results_obj_matrix)
    return results_matrix

### STS

In [27]:
def calculate_cosine_similarity(sentence1_vec, sentence2_vec):
    cosine_similarity = np.dot(sentence1_vec, sentence2_vec) / (np.linalg.norm(sentence1_vec) * np.linalg.norm(sentence2_vec))
    return cosine_similarity

In [28]:
def calculate_Spearman_rank_correlation_coefficient(scores, scores_actual):
    sc, _ = spearmanr(scores, scores_actual)
    return sc

In [74]:
def driver_sts():
    results_matrix = []

    for model in models:
        results_obj_matrix = []
        for objective in objective_functions:
            # Objective Functions Preparation...
            w1_combi, w2_combi, w3_combi, w4_combi = get_objective_function_weights(objective)
            results_obj_ds_matrix = []

            for dataset in sts_datasets:
                # Model Preparation...
                tokenizer_base = AutoTokenizer.from_pretrained(model)
                model_base = AutoModel.from_pretrained(model)

                # Dataset Preparation...
                ds = get_sts_dataset(dataset)
                ds = ds.rename_column('sentence1', 'text1')
                ds = ds.rename_column('sentence2', 'text2')
                ds = ds.rename_column('score', 'label')

                split_ds = ds.train_test_split(test_size=0.3, seed=42)
                ds_train = split_ds['train']
                ds_test = split_ds['test']

                # Tokenization of train dataset...
                train_ds = ds_train.shuffle().map(CustomDataTokenizer(tokenizer_base, is_classification=False), num_proc=8)

                # Model Training...
                model_new, tokenizer_new, pooler_new = fit(
                    train_ds=train_ds,
                    model_base=model_base,
                    tokenizer_base=tokenizer_base,
                    output_dir='chk/c',
                    batch_size=32,
                    epochs=5,
                    learning_rate=2e-5,
                    save_steps=0,
                    eval_steps=100,
                    warmup_steps=0,
                    gradient_accumulation_steps=1,
                    loss_kwargs={
                        'w1': w1_combi,
                        'w2': w2_combi,
                        'w3': w3_combi,
                        'w4': w4_combi,
                        'cosine_tau': 20,
                        'ibn_tau': 20,
                        'angle_tau': 1.0,
                        'con_tau': 1.2
                    },
                    fp16=True,
                    logging_steps=1000
                )

                # Generating embeddings of STS dataset using the newly trained model...
                emb_sentence_1 = encode(ds_test['text1'], model_new, pooler_new, tokenizer_new) # generating embeddings for test set sentence 1
                emb_sentence_2 = encode(ds_test['text2'], model_new, pooler_new, tokenizer_new) # generating embeddings for test set sentence 2

                # Calculating Spearman for AnglE...
                cos_score = []
                for i in range(emb_sentence_1.shape[0]):
                    cos_score.append(calculate_cosine_similarity(emb_sentence_1[i], emb_sentence_2[i]))

                spearman = calculate_Spearman_rank_correlation_coefficient(cos_score, ds_test['label'])
                
                base_dir = '../Files/STS_Results/'
                file_name = 'STS_' + dataset + '_' + model[0].replace('/', '-') + '.npy'
                with open(base_dir + file_name, 'wb') as f:
                    np.save(f, spearman)
                
                results_obj_ds_matrix.append(spearman)
            results_obj_matrix.append(results_obj_ds_matrix)
        results_matrix.append(results_obj_matrix)
    return results_matrix

### Running

In [92]:
results_matrix_sts = driver_sts()

Map (num_proc=8):   0%|          | 0/2175 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [93]:
results_matrix_sts

[[[0.8604430074444147]]]

In [96]:
results_matrix_senteval = driver_senteval()

Map (num_proc=8):   0%|          | 0/2642 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [97]:
results_matrix_senteval

[[[0.9143865842894969]]]

### Saving the Results

In [None]:
with open('../Results/BERT_SentEval_Results_New_Loss.npy', 'wb') as f:
    np.save(f, results_matrix_senteval)

In [None]:
with open('../Results/BERT_STS_Results_New_Loss.npy', 'wb') as f:
    np.save(f, results_matrix_sts)

# NLI

In [None]:
def load_all_nli(exclude_neutral=True):
    label_mapping = {
        'entailment': 1,  # '0' (entailment)
        'neutral': 1,
        'contradiction': 0   # '2' (contradiction)
    }
    data = []
    with gzip.open('AllNLI.tsv.gz', 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'train' and row['label'] != 'neutral':
                if exclude_neutral and row['label'] == 'neutral':
                    continue
                sent1 = row['sentence1'].strip()
                sent2 = row['sentence2'].strip()
                data.append({'text1': sent1, 'text2': sent2, 'label': label_mapping[row['label']]})
    return data

In [None]:
def preprocess_nli():
    train_data = load_all_nli()
    nli_dataset = {}
    train_ds = Dataset.from_list(train_data)
    nli_dataset['train'] = train_ds
    nli_dataset = DatasetDict(nli_dataset)
    ds_train = nli_dataset['train']
    return ds_train

Training with AnglE losses...

In [None]:
def train_nli(ds_train, model_base, tokenizer_base):
    train_ds = ds_train.shuffle().map(CustomDataTokenizer(tokenizer_base), num_proc=8)
    model_new, tokenizer_new, pooler_new = fit(
        train_ds=train_ds,
        model_base=model_base,
        tokenizer_base=tokenizer_base,
        output_dir='chk/c',
        batch_size=32,
        epochs=5,
        learning_rate=2e-5,
        save_steps=0,
        eval_steps=100,
        warmup_steps=0,
        gradient_accumulation_steps=1,
        loss_kwargs={
            'w1': 1.0,
            'w2': 1.0,
            'w3': 1.0
            'cosine_tau': 20,
            'ibn_tau': 20,
            'angle_tau': 1.0
        },
        fp16=True,
        logging_steps=1000
    )
    return model_new, tokenizer_new, pooler_new

### Mathematical Comparison: Enhanced Combined Loss vs. SimCSE

#### SimCSE Loss Function

SimCSE uses a contrastive loss to learn sentence embeddings. The mathematical formulation of the contrastive loss in SimCSE is as follows:

**Contrastive Loss (SimCSE)**:
\[
\mathcal{L}_{\text{SimCSE}} = - \log \frac{\exp(\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+))}{\sum_{j=1}^{N} \exp(\text{sim}(\mathbf{h}_i, \mathbf{h}_j))}
\]

- \(\mathbf{h}_i\): Embedding of the anchor sentence.
- \(\mathbf{h}_i^+\): Embedding of the positive pair (similar sentence).
- \(\mathbf{h}_j\): Embeddings of all sentences in the batch (including negatives).
- \(\text{sim}(\mathbf{u}, \mathbf{v})\): Cosine similarity between embeddings \(\mathbf{u}\) and \(\mathbf{v}\).

This loss encourages the cosine similarity between the anchor and positive pair to be high while pushing the similarities between the anchor and all other (negative) samples to be low.

#### Enhanced Combined Loss Function

The enhanced combined loss function incorporates contrastive loss, pairwise ranking loss, and normalization. Here’s the detailed mathematical formulation:

**1. Contrastive Loss**:

\[
\mathcal{L}_{\text{contrastive}} = \frac{1}{N} \sum_{i=1}^{N} \left[ y_i \cdot (1 - \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+))^2 + (1 - y_i) \cdot \max(0, \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{margin})^2 \right]
\]

- \(y_i\): Binary label indicating whether \(\mathbf{h}_i\) and \(\mathbf{h}_i^+\) are similar (1) or dissimilar (0).
- \(\text{margin}\): Margin parameter ensuring dissimilar pairs have at least this distance in similarity.

**2. Pairwise Ranking Loss**:

\[
\mathcal{L}_{\text{ranking}} = \frac{1}{N} \sum_{i,j} \left[ \max(0, 1 - (\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{sim}(\mathbf{h}_j, \mathbf{h}_j^-))) \cdot \mathbb{I}(y_i > y_j) \right]
\]

- \(\mathbb{I}(y_i > y_j)\): Indicator function that is 1 if \(y_i > y_j\) (indicating the rank order is correct), and 0 otherwise.
- \(\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+)\): Similarity score for positive pair.
- \(\text{sim}(\mathbf{h}_j, \mathbf{h}_j^-)\): Similarity score for negative pair.

**3. Combined Loss**:

The final combined loss function integrates the contrastive and pairwise ranking losses:

\[
\mathcal{L}_{\text{combined}} = \mathcal{L}_{\text{contrastive}} + \mathcal{L}_{\text{ranking}}
\]

**4. Normalization and Scaling with Tau**:

Normalization ensures embeddings are on the unit sphere, making cosine similarity a reliable measure. The scaling factor \(\tau\) controls the sharpness of the similarities:

\[
\mathbf{h}_i = \frac{\mathbf{h}_i}{\|\mathbf{h}_i\|_2}
\]
\[
\text{sim}(\mathbf{h}_i, \mathbf{h}_j) = \frac{\mathbf{h}_i \cdot \mathbf{h}_j}{\tau}
\]

### Comparison

#### Objective

- **SimCSE**: Focuses on maximizing similarity between positive pairs and minimizing similarity between negative pairs.
- **Enhanced Combined**: In addition to maximizing/minimizing similarities, it directly optimizes the rank order of pairs.

#### Equation Comparison

- **SimCSE**:
  \[
  \mathcal{L}_{\text{SimCSE}} = - \log \frac{\exp(\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+))}{\sum_{j=1}^{N} \exp(\text{sim}(\mathbf{h}_i, \mathbf{h}_j))}
  \]

- **Enhanced Combined**:
  \[
  \mathcal{L}_{\text{combined}} = \frac{1}{N} \sum_{i=1}^{N} \left[ y_i \cdot (1 - \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+))^2 + (1 - y_i) \cdot \max(0, \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{margin})^2 \right] + \frac{1}{N} \sum_{i,j} \left[ \max(0, 1 - (\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{sim}(\mathbf{h}_j, \mathbf{h}_j^-))) \cdot \mathbb{I}(y_i > y_j) \right]
  \]

#### Optimized Metric

- **SimCSE**: Focuses on contrastive separation, indirectly improving metrics like cosine similarity.
- **Enhanced Combined**: Directly optimizes rank order through pairwise ranking loss, targeting improvements in rank correlation metrics such as Spearman’s rank correlation coefficient.

#### Normalization and Scaling

- **SimCSE**: Uses normalization to ensure meaningful cosine similarities.
- **Enhanced Combined**: Uses normalization and scaling factor \(\tau\) to sharpen similarity scores, aiding in differentiation.

### Conclusion

The enhanced combined loss function addresses a specific limitation in SimCSE by directly optimizing the rank order of similarity scores through the pairwise ranking loss. This makes it more suitable for tasks like STS (Semantic Textual Similarity), where the goal is to achieve high rank correlation, particularly improving metrics like the Spearman’s rank correlation coefficient. The combined approach ensures both good separability of embeddings and correct rank order, providing a more robust learning signal compared to using contrastive loss alone.

### Difference Between Pairwise Ranking Loss and Contrastive Loss

#### Contrastive Loss

Contrastive loss is a distance-based loss function used to learn embeddings such that similar instances are close to each other in the embedding space while dissimilar instances are far apart. It is typically used in tasks where the model needs to distinguish between similar and dissimilar pairs.

**Mathematical Formulation:**

For a given pair of embeddings \((\mathbf{h}_i, \mathbf{h}_i^+)\) (anchor and positive) and a margin \(\text{margin}\):

\[
\mathcal{L}_{\text{contrastive}} = \frac{1}{N} \sum_{i=1}^{N} \left[ y_i \cdot (1 - \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+))^2 + (1 - y_i) \cdot \max(0, \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{margin})^2 \right]
\]

- \(y_i\): Binary label indicating whether \(\mathbf{h}_i\) and \(\mathbf{h}_i^+\) are similar (1) or dissimilar (0).
- \(\text{sim}(\mathbf{u}, \mathbf{v})\): Cosine similarity between embeddings \(\mathbf{u}\) and \(\mathbf{v}\).
- The loss encourages similar pairs to have high similarity (close to 1) and dissimilar pairs to have similarity less than the margin.

#### Pairwise Ranking Loss

Pairwise ranking loss is designed to optimize the ranking order of pairs, which is particularly useful for tasks like information retrieval or ranking, where the relative order of predictions is more important than their absolute values. This loss function penalizes incorrect rank orders directly.

**Mathematical Formulation:**

For pairs of similarity scores \((\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+), \text{sim}(\mathbf{h}_j, \mathbf{h}_j^-))\) and the ground truth ordering \(y_i > y_j\):

\[
\mathcal{L}_{\text{ranking}} = \frac{1}{N} \sum_{i,j} \left[ \max(0, 1 - (\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{sim}(\mathbf{h}_j, \mathbf{h}_j^-))) \cdot \mathbb{I}(y_i > y_j) \right]
\]

- \(\mathbb{I}(y_i > y_j)\): Indicator function that is 1 if the rank order \(y_i > y_j\) is correct, and 0 otherwise.
- The loss penalizes incorrect rank orders, ensuring that the difference between correct and incorrect orders is at least 1.

### Key Differences

1. **Objective**:
   - **Contrastive Loss**: Focuses on minimizing the distance between similar pairs and maximizing the distance between dissimilar pairs based on a margin.
   - **Pairwise Ranking Loss**: Focuses on the relative ordering of pairs, directly optimizing the rank order based on similarity scores.

2. **Penalty Mechanism**:
   - **Contrastive Loss**: Penalizes pairs based on their similarity scores and a predefined margin.
   - **Pairwise Ranking Loss**: Penalizes the relative ranking of similarity scores, ensuring correct ordering by imposing a margin between correct and incorrect pairs.

3. **Application**:
   - **Contrastive Loss**: Often used in metric learning tasks where the goal is to learn a distance metric or embedding space (e.g., face verification).
   - **Pairwise Ranking Loss**: Commonly used in ranking tasks, such as information retrieval, where the order of items is crucial (e.g., search engine results).

4. **Impact on Embedding Space**:
   - **Contrastive Loss**: Ensures that similar items are clustered together and dissimilar items are pushed apart.
   - **Pairwise Ranking Loss**: Ensures that the order of similarity scores aligns with the ground truth ranking, potentially resulting in a more nuanced embedding space that respects relative similarities.

### Enhanced Combined Loss

Combining both losses leverages the strengths of each:

```python
import torch
import torch.nn.functional as F

def enhanced_combined_loss(y_true: torch.Tensor, y_pred: torch.Tensor, margin: float = 1.0, tau: float = 20.0) -> torch.Tensor:
    """
    Compute enhanced combined contrastive, cross-entropy, and pairwise ranking loss

    :param y_true: torch.Tensor, ground truth.
        The y_true must be zigzag style, such as [x[0][0], x[0][1], x[1][0], x[1][1], ...], where (x[0][0], x[0][1]) stands for a pair.
    :param y_pred: torch.Tensor, model output.
        The y_pred must be zigzag style, such as [o[0][0], o[0][1], o[1][0], o[1][1], ...], where (o[0][0], o[0][1]) stands for a pair.
    :param margin: float, margin factor, default 1.0
    :param tau: float, scale factor, default 20

    :return: torch.Tensor, loss value
    """  # NOQA
    
    # Ensure y_true is in the correct format
    y_true_pairs = y_true[::2, 0]
    y_true_pairs = (y_true_pairs[:, None] < y_true_pairs[None, :]).float()
    
    # Normalize predictions to compute cosine similarity
    y_pred = F.normalize(y_pred, p=2, dim=1)
    
    # Compute cosine similarity for pairs and scale by tau
    cosine_sim = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * tau
    
    # Contrastive Loss
    pos_pairs = y_true_pairs * torch.pow(1 - cosine_sim, 2)
    neg_pairs = (1 - y_true_pairs) * torch.pow(torch.clamp(cosine_sim - margin, min=0.0), 2)
    contrastive_loss = torch.mean(pos_pairs + neg_pairs)
    
    # Pairwise Ranking Loss
    # Compute differences in similarity scores for pairwise ranking
    y_pred_diff = cosine_sim[:, None] - cosine_sim[None, :]
    y_true_diff = y_true_pairs[:, None] - y_true_pairs[None, :]
    
    # Create a mask to consider only positive differences
    mask = (y_true_diff > 0).float()
    
    # Compute pairwise ranking loss
    pairwise_ranking_loss = torch.mean(mask * F.relu(1 - y_pred_diff))
    
    # Combined Loss
    combined_loss_value = contrastive_loss + pairwise_ranking_loss
    
    return combined_loss_value


### Difference Between Pairwise Ranking Loss and Contrastive Loss

#### Contrastive Loss

Contrastive loss is a distance-based loss function used to learn embeddings such that similar instances are close to each other in the embedding space while dissimilar instances are far apart. It is typically used in tasks where the model needs to distinguish between similar and dissimilar pairs.

**Mathematical Formulation:**

For a given pair of embeddings $(\mathbf{h}_i, \mathbf{h}_i^+)$ (anchor and positive) and a margin $\text{margin}$:

$$
\mathcal{L}_{\text{contrastive}} = \frac{1}{N} \sum_{i=1}^{N} \left[ y_i \cdot (1 - \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+))^2 + (1 - y_i) \cdot \max(0, \text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{margin})^2 \right]
$$

- $y_i$: Binary label indicating whether $\mathbf{h}_i$ and $\mathbf{h}_i^+$ are similar (1) or dissimilar (0).
- $\text{sim}(\mathbf{u}, \mathbf{v})$: Cosine similarity between embeddings $\mathbf{u}$ and $\mathbf{v}$.
- The loss encourages similar pairs to have high similarity (close to 1) and dissimilar pairs to have similarity less than the margin.

#### Pairwise Ranking Loss

Pairwise ranking loss is designed to optimize the ranking order of pairs, which is particularly useful for tasks like information retrieval or ranking, where the relative order of predictions is more important than their absolute values. This loss function penalizes incorrect rank orders directly.

**Mathematical Formulation:**

For pairs of similarity scores $(\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+), \text{sim}(\mathbf{h}_j, \mathbf{h}_j^-))$ and the ground truth ordering $y_i > y_j$:

$$
\mathcal{L}_{\text{ranking}} = \frac{1}{N} \sum_{i,j} \left[ \max(0, 1 - (\text{sim}(\mathbf{h}_i, \mathbf{h}_i^+) - \text{sim}(\mathbf{h}_j, \mathbf{h}_j^-))) \cdot \mathbb{I}(y_i > y_j) \right]
$$

- $\mathbb{I}(y_i > y_j)$: Indicator function that is 1 if the rank order $y_i > y_j$ is correct, and 0 otherwise.
- The loss penalizes incorrect rank orders, ensuring that the difference between correct and incorrect orders is at least 1.

### Key Differences

1. **Objective**:
   - **Contrastive Loss**: Focuses on minimizing the distance between similar pairs and maximizing the distance between dissimilar pairs based on a margin.
   - **Pairwise Ranking Loss**: Focuses on the relative ordering of pairs, directly optimizing the rank order based on similarity scores.

2. **Penalty Mechanism**:
   - **Contrastive Loss**: Penalizes pairs based on their similarity scores and a predefined margin.
   - **Pairwise Ranking Loss**: Penalizes the relative ranking of similarity scores, ensuring correct ordering by imposing a margin between correct and incorrect pairs.

3. **Application**:
   - **Contrastive Loss**: Often used in metric learning tasks where the goal is to learn a distance metric or embedding space (e.g., face verification).
   - **Pairwise Ranking Loss**: Commonly used in ranking tasks, such as information retrieval, where the order of items is crucial (e.g., search engine results).

4. **Impact on Embedding Space**:
   - **Contrastive Loss**: Ensures that similar items are clustered together and dissimilar items are pushed apart.
   - **Pairwise Ranking Loss**: Ensures that the order of similarity scores aligns with the ground truth ranking, potentially resulting in a more nuanced embedding space that respects relative similarities.

### Enhanced Combined Loss

Combining both losses leverages the strengths of each:

```python
import torch
import torch.nn.functional as F

def enhanced_combined_loss(y_true: torch.Tensor, y_pred: torch.Tensor, margin: float = 1.0, tau: float = 20.0) -> torch.Tensor:
    """
    Compute enhanced combined contrastive, cross-entropy, and pairwise ranking loss

    :param y_true: torch.Tensor, ground truth.
        The y_true must be zigzag style, such as [x[0][0], x[0][1], x[1][0], x[1][1], ...], where (x[0][0], x[0][1]) stands for a pair.
    :param y_pred: torch.Tensor, model output.
        The y_pred must be zigzag style, such as [o[0][0], o[0][1], o[1][0], o[1][1], ...], where (o[0][0], o[0][1]) stands for a pair.
    :param margin: float, margin factor, default 1.0
    :param tau: float, scale factor, default 20

    :return: torch.Tensor, loss value
    """  # NOQA
    
    # Ensure y_true is in the correct format
    y_true_pairs = y_true[::2, 0]
    y_true_pairs = (y_true_pairs[:, None] < y_true_pairs[None, :]).float()
    
    # Normalize predictions to compute cosine similarity
    y_pred = F.normalize(y_pred, p=2, dim=1)
    
    # Compute cosine similarity for pairs and scale by tau
    cosine_sim = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * tau
    
    # Contrastive Loss
    pos_pairs = y_true_pairs * torch.pow(1 - cosine_sim, 2)
    neg_pairs = (1 - y_true_pairs) * torch.pow(torch.clamp(cosine_sim - margin, min=0.0), 2)
    contrastive_loss = torch.mean(pos_pairs + neg_pairs)
    
    # Pairwise Ranking Loss
    # Compute differences in similarity scores for pairwise ranking
    y_pred_diff = cosine_sim[:, None] - cosine_sim[None, :]
    y_true_diff = y_true_pairs[:, None] - y_true_pairs[None, :]
    
    # Create a mask to consider only positive differences
    mask = (y_true_diff > 0).float()
    
    # Compute pairwise ranking loss
    pairwise_ranking_loss = torch.mean(mask * F.relu(1 - y_pred_diff))
    
    # Combined Loss
    combined_loss_value = contrastive_loss + pairwise_ranking_loss
    
    return combined_loss_value