This notebook fine-tunes a Llama 2 model with LoRA on an NLI dataset using different loss functions, then evaluates the result by computing Spearman's rank correlation coefficient on STS datasets and classification accuracy on SentEval datasets. **(In Progress)**

- In this demo, testing is done on the **STS-13** dataset using all three loss functions: **CoSENT**, **In-Batch Negatives** and **Angle**.

- The loss functions CoSENT, In-Batch Negatives and Angle are taken from <a href="https://github.com/SeanLee97/AnglE">AnglE</a>, and the Cosine Similarity Loss is modified from <a href="https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosinesimilarityloss">SBERT</a>.

In [None]:
# Install the notebook's dependencies; accelerate, peft and transformers are pinned to exact versions.
! pip install -q datasets accelerate==0.21.0 peft==0.4.0 transformers==4.31.0 scipy


[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python -m pip install --upgrade pip


### Imports

In [None]:
import os
import re
import sys
import json
import copy
import random
from functools import partial
from typing import Any, Dict, Optional, List, Union, Tuple, Callable
from dataclasses import dataclass

import scipy
import scipy.stats
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM, AutoModel, AutoTokenizer,
    PreTrainedModel, Trainer, TrainingArguments,
    TrainerCallback, BitsAndBytesConfig
)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from peft import (
    get_peft_model, LoraConfig, TaskType, PeftModel,
    prepare_model_for_kbit_training
)
from peft.tuners.lora import LoraLayer
import gzip
import csv

### Losses

In [None]:
def default_cosine_similarity_loss(y_true, y_pred, tau=1):
    """Mean squared error between rescaled gold scores and pair cosine similarity.

    Rows of ``y_pred`` are consumed as interleaved pairs (rows 0/1, 2/3, ...).
    The gold score of each pair is read from column 0 of the pair's even row
    in ``y_true`` and divided by 5.0 before comparison.

    Args:
        y_true: 2-D tensor of gold scores, one row per embedding row.
        y_pred: 2-D tensor of embeddings, two consecutive rows per pair.
        tau: Accepted but not used by this loss.

    Returns:
        Scalar tensor holding the mean squared difference.
    """
    first_of_pair, second_of_pair = y_pred[0::2], y_pred[1::2]

    # Rescale the gold scores by 1/5, then keep one score per pair.
    targets = (y_true / 5.0)[::2, 0]

    similarity = F.cosine_similarity(first_of_pair, second_of_pair)
    return ((targets - similarity) ** 2).mean()

def categorical_crossentropy(y_true: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:
    """Per-row cross entropy between target weights and softmax of the logits.

    Args:
        y_true: Target weights, same shape as ``y_pred``.
        y_pred: Logits; softmax is taken along dim 1.

    Returns:
        1-D tensor with one loss value per row.
    """
    log_probs = F.log_softmax(y_pred, dim=1)
    weighted = log_probs * y_true
    return -weighted.sum(dim=1)

def cosine_loss(y_true: torch.Tensor, y_pred: torch.Tensor, tau: float = 20.0) -> torch.Tensor:
    """Ranking loss over the cosine similarities of interleaved embedding pairs.

    Rows of ``y_pred`` form pairs (0/1, 2/3, ...); each pair's label is taken
    from column 0 of its even row in ``y_true``. For every ordered couple of
    pairs (i, j) where label_i < label_j, the term ``tau * (cos_i - cos_j)``
    enters a log-sum-exp together with a constant 0 term.

    Args:
        y_true: 2-D label tensor, one row per embedding row.
        y_pred: 2-D embedding tensor.
        tau: Multiplier applied to the cosine similarities.

    Returns:
        Scalar loss tensor.
    """
    pair_labels = y_true[::2, 0]
    # order_mask[i, j] is 1.0 where pair i carries a strictly lower label than pair j.
    order_mask = (pair_labels[:, None] < pair_labels[None, :]).float()

    unit_rows = F.normalize(y_pred, p=2, dim=1)
    scaled_cos = torch.sum(unit_rows[::2] * unit_rows[1::2], dim=1) * tau
    pairwise_gap = scaled_cos[:, None] - scaled_cos[None, :]

    # Couples that are not selected by the mask are pushed to -1e12 so they vanish in the log-sum-exp.
    masked_gap = (pairwise_gap - (1 - order_mask) * 1e12).view(-1)

    zero = torch.Tensor([0]).to(masked_gap.device)
    return torch.logsumexp(torch.concat((zero, masked_gap), dim=0), dim=0)

def angle_loss(y_true: torch.Tensor, y_pred: torch.Tensor, tau: float = 1.0):
    """Ranking loss computed from a complex-valued quotient of paired embeddings.

    Each embedding row is split in half along dim 1 into a "real" and an
    "imaginary" part. Rows are paired (0/1, 2/3, ...); for each pair the first
    row is divided by the second as complex numbers, the quotient is rescaled
    by the ratio of the two rows' norms, and the absolute value of the summed
    components (times ``tau``) is used as the pair's score. The scores are then
    ranked against the labels exactly as in a masked log-sum-exp over score
    differences.

    Args:
        y_true: 2-D label tensor; column 0 of every even row is the pair label.
        y_pred: 2-D embedding tensor whose dim 1 is chunked into two halves.
        tau: Multiplier applied to the pair scores.

    Returns:
        Scalar loss tensor.
    """
    pair_labels = y_true[::2, 0]
    # order_mask[i, j] is 1.0 where pair i carries a strictly lower label than pair j.
    order_mask = (pair_labels[:, None] < pair_labels[None, :]).float()

    real_half, imag_half = torch.chunk(y_pred, 2, dim=1)
    a, c = real_half[::2], real_half[1::2]
    b, d = imag_half[::2], imag_half[1::2]

    # Complex division (a + bi) / (c + di), with the squared norm summed over the row.
    w_sq_norm = torch.sum(c**2 + d**2, dim=1, keepdim=True)
    quot_re = (a * c + b * d) / w_sq_norm
    quot_im = (b * c - a * d) / w_sq_norm

    # Divide the quotient by |z| / |w|.
    z_norm = torch.sum(a**2 + b**2, dim=1, keepdim=True) ** 0.5
    w_norm = torch.sum(c**2 + d**2, dim=1, keepdim=True) ** 0.5
    norm_ratio = z_norm / w_norm
    quot_re = quot_re / norm_ratio
    quot_im = quot_im / norm_ratio

    components = torch.concat((quot_re, quot_im), dim=1)
    scores = torch.abs(torch.sum(components, dim=1)) * tau
    pairwise_gap = scores[:, None] - scores[None, :]
    masked_gap = (pairwise_gap - (1 - order_mask) * 1e12).view(-1)

    zero = torch.Tensor([0]).to(masked_gap.device)
    return torch.logsumexp(torch.concat((zero, masked_gap), dim=0), dim=0)

def in_batch_negative_loss(y_true: torch.Tensor,
                           y_pred: torch.Tensor,
                           tau: float = 20.0,
                           negative_weights: float = 0.0) -> torch.Tensor:
    """Cross-entropy loss over in-batch cosine similarities.

    Every row of ``y_pred`` is compared against every other row; the target
    for a row is its pair partner (rows are paired 0/1, 2/3, ...) when the
    labels allow it (see ``make_target_matrix``).

    Args:
        y_true: Label tensor. The use of ``.T`` and broadcasting below
            presumably expects shape (N, 1) with 0/1 values — TODO confirm.
        y_pred: Embedding tensor with N rows.
        tau: Multiplier applied to the similarity matrix.
        negative_weights: When > 0, this amount is added to the similarity
            entries selected by the negative-pair mask.

    Returns:
        Scalar tensor: the mean of the per-row cross entropy.
    """
    device = y_true.device

    def make_target_matrix(y_true: torch.Tensor):
        # Reads y_pred from the enclosing scope (only its row count), so it
        # must be called before/independently of any change to that count.
        idxs = torch.arange(0, y_pred.shape[0]).int().to(device)
        y_true = y_true.int()
        # Row vector of column indices. NOTE: this is a view of idxs, so the
        # in-place ops below also modify idxs; idxs_2 is a fresh tensor that
        # was computed before those ops.
        idxs_1 = idxs[None, :]
        # Column vector of partner indices: even i -> i + 1, odd i -> i - 1.
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]

        # Keep the index where the (int-cast) flag is 1; where it is 0 the
        # entry becomes -2 here and -1 below, so the two sides can never match.
        idxs_1 *= y_true.T
        idxs_1 += (y_true.T == 0).int() * -2

        idxs_2 *= y_true
        idxs_2 += (y_true == 0).int() * -1

        # (N, N) matrix by broadcasting: 1.0 at [i, j] when j is i's partner
        # and neither side was replaced by a sentinel.
        y_true = (idxs_1 == idxs_2).float()
        return y_true

    # Same construction applied to the inverted labels: marks partner
    # positions for rows whose label equals 0.
    neg_mask = make_target_matrix(y_true == 0)

    y_true = make_target_matrix(y_true)

    y_pred = F.normalize(y_pred, dim=1, p=2)
    similarities = y_pred @ y_pred.T
    # Push the diagonal (self-similarity) to a huge negative value so it is
    # effectively excluded from the softmax.
    similarities = similarities - torch.eye(y_pred.shape[0]).to(device) * 1e12
    similarities = similarities * tau

    if negative_weights > 0:
        similarities += neg_mask * negative_weights

    # Rows whose target row is all zeros contribute 0 but still count in the mean.
    return categorical_crossentropy(y_true, similarities).mean()

In [None]:
class TotalLoss:
    """Weighted sum of the cosine, in-batch-negative and angle losses.

    A component is evaluated only when its weight is strictly positive.

    Args:
        w1: Weight of ``cosine_loss``.
        w2: Weight of ``in_batch_negative_loss``.
        w3: Weight of ``angle_loss``.
        cosine_tau: ``tau`` forwarded to ``cosine_loss``.
        ibn_tau: ``tau`` forwarded to ``in_batch_negative_loss``.
        angle_tau: ``tau`` forwarded to ``angle_loss``.
    """

    def __init__(self,
                w1: float = 1.0,
                w2: float = 1.0,
                w3: float = 1.0,
                cosine_tau: float = 20.0,
                ibn_tau: float = 20.0,
                angle_tau: float = 1.0):
        self.w1, self.w2, self.w3 = w1, w2, w3
        self.cosine_tau, self.ibn_tau, self.angle_tau = cosine_tau, ibn_tau, angle_tau

    def __call__(self, labels: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """Return the weighted total; 0.0 when every weight is non-positive."""
        # Thunks keep each component lazy: a disabled loss is never evaluated.
        weighted_components = (
            (self.w1, lambda: cosine_loss(labels, outputs, self.cosine_tau)),
            (self.w2, lambda: in_batch_negative_loss(labels, outputs, self.ibn_tau)),
            (self.w3, lambda: angle_loss(labels, outputs, self.angle_tau)),
        )
        total = 0.
        for weight, compute in weighted_components:
            if weight > 0:
                total += weight * compute()
        return total

### Data Collator

In [None]:
class CustomDataCollator:
    """Split concatenated text features into one row per text and pad them.

    Each incoming feature carries the token ids of one or more texts joined
    end to end, plus a parallel ``seperate_ids`` list giving, for every token,
    the index of the text it belongs to (0 for the first text, 1 for the
    second, ...). The collator cuts every feature back into its texts, so a
    batch of N two-text features becomes 2N interleaved rows, and copies the
    feature's ``labels`` onto each of its rows.

    NOTE: any ``extra`` entry on a feature (e.g. ``end_with_eos``) is ignored.
    """

    tokenizer = None
    padding = 'longest'
    max_length = 512
    return_tensors = "pt"

    def __init__(self, tokenizer_base, max_length=1024):
        """Store the tokenizer used for padding and the ``max_length`` passed to it."""
        self.tokenizer = tokenizer_base
        self.max_length = max_length

    @staticmethod
    def _split_feature(feature: Dict) -> List[Dict]:
        """Cut one concatenated feature into per-text rows at the ``seperate_ids`` boundaries."""
        seperate_ids = feature['seperate_ids']
        input_ids = feature['input_ids']
        attention_mask = feature['attention_mask']
        assert len(seperate_ids) == len(input_ids) == len(attention_mask)

        token_type_ids = None
        if "token_type_ids" in feature:
            token_type_ids = feature['token_type_ids']
            assert len(token_type_ids) == len(input_ids)

        # Text k starts at the first position whose separate id equals k.
        boundaries = [0]
        boundaries += [seperate_ids.index(text_idx) for text_idx in range(1, max(seperate_ids) + 1)]
        boundaries.append(len(input_ids))

        rows = []
        for start, end in zip(boundaries, boundaries[1:]):
            row = {
                'input_ids': input_ids[start:end],
                'attention_mask': attention_mask[start:end],
            }
            if token_type_ids is not None:
                row['token_type_ids'] = token_type_ids[start:end]
            row['labels'] = feature['labels']
            rows.append(row)
        return rows

    def __call__(self, features: List[Dict], return_tensors: Optional[str] = "pt") -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch.

        Args:
            features: Tokenized examples with ``input_ids``, ``attention_mask``,
                ``seperate_ids``, ``labels`` and optionally ``token_type_ids``.
            return_tensors: Tensor format forwarded to ``tokenizer.pad``;
                ``None`` falls back to the class default.

        Returns:
            The padded batch from ``tokenizer.pad`` with a float ``labels``
            tensor added (one label row per split text).

        Raises:
            ValueError: If ``features`` is empty.
        """
        if not features:
            raise ValueError("features must contain at least one example")
        if return_tensors is None:
            return_tensors = self.return_tensors

        rows = [row for feature in features for row in self._split_feature(feature)]

        padded_keys = ['input_ids', 'attention_mask']
        if 'token_type_ids' in rows[-1]:
            padded_keys.append('token_type_ids')

        # Pad everything in a single call so the tokenizer fills the attention
        # mask with 0 and token type ids with its pad type id. Padding those
        # lists disguised as ``input_ids`` would fill them with pad_token_id,
        # which is only correct when that id happens to be 0.
        batch = self.tokenizer.pad(
            {key: [row[key] for row in rows] for key in padded_keys},
            padding=self.padding,
            max_length=self.max_length,
            return_attention_mask=True,
            return_tensors=return_tensors,
        )
        batch['labels'] = torch.Tensor([row['labels'] for row in rows])
        return batch

### Tokenizer

In [None]:
class CustomDataTokenizer:
    """Tokenize one example's text column(s) and concatenate the results.

    ``text1`` is always tokenized; ``text2`` is tokenized as well when
    ``is_classification`` is False. The per-text tokenizer outputs are joined
    key by key, and a ``seperate_ids`` list records for every token the index
    of the text it came from.
    """

    def __init__(self,
                 tokenizer: AutoTokenizer,
                 is_classification=True,
                 max_length: Optional[int] = 512,
                 end_with_eos: bool = False):
        self.tokenizer = tokenizer
        self.is_classification = is_classification
        self.max_length = max_length
        self.end_with_eos = end_with_eos
        self.model_max_length = 512

    def __call__(self, data: Dict, **kwargs) -> Dict:
        """Return the merged tokenization of ``data`` plus labels and bookkeeping.

        The result holds every key produced by the tokenizer (values of all
        texts concatenated), ``labels`` (a one-element list with
        ``int(data['label'])``, or -1 when there is no label), ``seperate_ids``
        and ``extra`` (carrying the ``end_with_eos`` flag).
        """
        columns = ['text1'] if self.is_classification else ['text1', 'text2']
        encodings = [
            self.tokenizer(data[column], max_length=self.max_length, truncation=True)
            for column in columns
        ]

        merged = {}
        seperate_ids = []
        for text_idx, encoding in enumerate(encodings):
            for key, values in encoding.items():
                if text_idx:
                    merged[key] += values
                else:
                    merged[key] = values
                if key == 'input_ids':
                    seperate_ids += [text_idx] * len(values)

        label = int(data['label']) if 'label' in data else -1
        merged['labels'] = [label]
        merged['seperate_ids'] = seperate_ids
        merged['extra'] = {
            'end_with_eos': self.end_with_eos
        }
        return merged

### Pooler

In [None]:
class Pooler:
    """Pick one hidden-state vector per sequence from a chosen model layer.

    With ``padding_strategy == 'left'`` the vector at the last position is
    taken; otherwise the position is ``attention_mask.sum(dim=1) - 1`` for
    each row.
    """

    def __init__(self,
                model,
                padding_strategy: Optional[str] = 'left'):
        self.model = model
        self.padding_strategy = padding_strategy

    def __call__(self, inputs, layer_index=-1) -> Any:
        """Run the model on ``inputs`` and return the pooled vectors of layer ``layer_index``."""
        model_output = self.model(output_hidden_states=True, return_dict=True, **inputs)
        layer_states = model_output.hidden_states[layer_index]

        if self.padding_strategy == 'left':
            # Same position (the last one) for every row.
            token_positions = -1
        else:
            token_positions = inputs["attention_mask"].sum(dim=1) - 1

        row_index = torch.arange(inputs['input_ids'].shape[0], device=layer_states.device)
        return layer_states[row_index, token_positions]

### Trainer

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a ``TotalLoss`` over pooled embeddings.

    Args:
        pooler: Callable that maps a batch of model inputs to embeddings.
        loss_kwargs: Keyword arguments for ``TotalLoss``; ``None`` means defaults.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, pooler: Pooler, loss_kwargs: Optional[Dict] = None, **kwargs):
        super().__init__(**kwargs)
        self.pooler = pooler
        self.loss_fct = TotalLoss(**({} if loss_kwargs is None else loss_kwargs))

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the loss for one batch.

        ``labels`` is removed from ``inputs`` before pooling. The ``model``
        argument is not used here: the embeddings come from ``self.pooler``.
        """
        labels = inputs.pop("labels", None)
        embeddings = self.pooler(inputs)
        loss = self.loss_fct(labels, embeddings)
        if return_outputs:
            return (loss, embeddings)
        return loss

### LoRA Config

In [None]:
# Hyper-parameter bundle for the LoRA fine-tuning run.
# NOTE(review): in the visible code only "lora_r", "lora_alpha" and
# "lora_dropout" are read (by get_peft_config); the remaining keys do not
# appear to be referenced anywhere else — confirm whether they are still needed.
lora_config_obj = {
    # LoRA adapter settings (consumed by get_peft_config).
    "lora_r": 88,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    # Quantization-related settings.
    "use_4bit": True,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "use_nested_quant": False,
    # Training-loop settings.
    "output_dir": "./results",
    "num_train_epochs": 1,
    "fp16": False,
    "bf16": False,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": True,
    "max_grad_norm": 0.3,
    "learning_rate": 2e-5,
    "weight_decay": 0.001,
    "optim": "paged_adamw_32bit",
    "lr_scheduler_type": "cosine",
    "max_steps": -1,
    "warmup_ratio": 0.03,
    "group_by_length": True,
    "save_steps": 0,
    "logging_steps": 25
}

In [None]:
def get_peft_config(lora_config_obj):
    """Build a feature-extraction ``LoraConfig`` from a settings mapping.

    Args:
        lora_config_obj: Mapping providing ``lora_alpha``, ``lora_dropout``
            and ``lora_r``.

    Returns:
        A ``LoraConfig`` with ``bias="none"`` and
        ``task_type=TaskType.FEATURE_EXTRACTION``.
    """
    lora_kwargs = {
        'lora_alpha': lora_config_obj['lora_alpha'],
        'lora_dropout': lora_config_obj['lora_dropout'],
        'r': lora_config_obj['lora_r'],
        'bias': "none",
        'task_type': TaskType.FEATURE_EXTRACTION,
    }
    return LoraConfig(**lora_kwargs)

### Fit

In [None]:
def fit(train_ds,
        model_base,
        tokenizer_base,
        batch_size: int = 32,
        output_dir: Optional[str] = 'chk/new_c',
        epochs: int = 3,
        learning_rate: float = 1e-5,
        warmup_steps: int = 1000,
        logging_steps: int = 10,
        eval_steps: Optional[int] = None,
        save_steps: int = 100,
        save_strategy: str = 'steps',
        save_total_limit: int = 10,
        gradient_accumulation_steps: int = 1,
        fp16: Optional[bool] = None,
        argument_kwargs: Optional[Dict] = None,
        trainer_kwargs: Optional[Dict] = None,
        loss_kwargs: Optional[Dict] = None):
    """Train ``model_base`` on ``train_ds`` with ``CustomTrainer``.

    Args:
        train_ds: Tokenized training dataset.
        model_base: Model to train; also wrapped by the returned pooler.
        tokenizer_base: Tokenizer handed to the trainer and the data collator.
        batch_size: Per-device training batch size.
        output_dir, epochs, learning_rate, warmup_steps, logging_steps,
        eval_steps, save_steps, save_strategy, save_total_limit,
        gradient_accumulation_steps, fp16: Forwarded to ``TrainingArguments``.
        argument_kwargs: Extra ``TrainingArguments`` keyword arguments.
        trainer_kwargs: Extra ``CustomTrainer`` keyword arguments.
        loss_kwargs: Keyword arguments for the trainer's loss.

    Returns:
        Tuple ``(model_base, tokenizer_base, pooler)``.
    """
    extra_training_args = {} if argument_kwargs is None else argument_kwargs
    extra_trainer_args = {} if trainer_kwargs is None else trainer_kwargs

    pooler = Pooler(model_base)

    training_args = TrainingArguments(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        fp16=fp16,
        logging_steps=logging_steps,
        save_strategy=save_strategy,
        eval_steps=eval_steps,
        save_steps=save_steps,
        output_dir=output_dir,
        save_total_limit=save_total_limit,
        load_best_model_at_end=False,
        ddp_find_unused_parameters=None,
        label_names=['labels', 'seperate_ids', 'extra'],
        **extra_training_args,
    )
    collator = CustomDataCollator(
        tokenizer_base,
        max_length=1024
    )

    # No evaluation dataset and no callbacks are attached.
    trainer = CustomTrainer(
        pooler=pooler,
        model=model_base,
        tokenizer=tokenizer_base,
        train_dataset=train_ds,
        eval_dataset=None,
        loss_kwargs=loss_kwargs,
        args=training_args,
        callbacks=None,
        data_collator=collator,
        **extra_trainer_args
    )
    trainer.train()

    return model_base, tokenizer_base, pooler

# Driver

### Dataset

In [None]:
# Fetch the gzipped AllNLI TSV into the working directory; load_all_nli reads 'AllNLI.tsv.gz' from there.
! echo "download AllNLI"
! wget https://sbert.net/datasets/AllNLI.tsv.gz

In [None]:
def load_all_nli(exclude_neutral=True, path='AllNLI.tsv.gz'):
    """Load the training split of the AllNLI TSV archive as text pairs.

    Args:
        exclude_neutral: When True (default) rows labelled ``neutral`` are
            skipped. When False they are kept.
        path: Location of the gzipped, tab-separated AllNLI file. It must have
            ``split``, ``sentence1``, ``sentence2`` and ``label`` columns.

    Returns:
        List of dicts with ``text1``, ``text2`` (whitespace-stripped
        sentences) and an integer ``label``.

    Raises:
        KeyError: If a kept row has a label missing from the mapping.
    """
    # NOTE(review): 'neutral' maps to 1, the same as 'entailment'. This only
    # matters when exclude_neutral is False — confirm that is the intent.
    label_mapping = {
        'entailment': 1,
        'neutral': 1,
        'contradiction': 0
    }
    data = []
    with gzip.open(path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] != 'train':
                continue
            # Previously neutral rows were dropped unconditionally, which made
            # the exclude_neutral flag a no-op.
            if exclude_neutral and row['label'] == 'neutral':
                continue
            data.append({
                'text1': row['sentence1'].strip(),
                'text2': row['sentence2'].strip(),
                'label': label_mapping[row['label']],
            })
    return data

In [None]:
def preprocess_nli():
    """Load the AllNLI training pairs and return them as a ``Dataset``.

    The rows from ``load_all_nli`` are turned into a ``Dataset``, stored as
    the ``train`` split of a ``DatasetDict`` and that split is returned.
    """
    splits = DatasetDict({'train': Dataset.from_list(load_all_nli())})
    return splits['train']

In [None]:
# Build the NLI training dataset once; it is passed to train_nli below.
nli_dataset = preprocess_nli()

### Model and Tokenizer

In [None]:
# Load the tokenizer for the Llama-2-7b checkpoint.
tokenizer_base = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf", trust_remote_code=True)
# NOTE(review): pad_token is set to the EOS token here, but pad_token_id is
# assigned 0 two lines below — presumably the later assignment wins; confirm
# which padding token is actually intended.
tokenizer_base.pad_token = tokenizer_base.eos_token
# Left padding keeps each row's final real token at index -1, which is the
# position Pooler reads with its default 'left' padding strategy.
tokenizer_base.padding_side = "left"
# NOTE(review): padding_value does not look like a standard tokenizer
# attribute — verify that this assignment has any effect.
tokenizer_base.padding_value = 0
tokenizer_base.pad_token_id = 0

# Load the causal-LM weights, letting device_map="auto" decide placement.
model_base = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-2-7b-hf", device_map="auto")
# Config tweaks applied before training: disable the generation cache and set pretraining_tp to 1.
model_base.config.use_cache = False
model_base.config.pretraining_tp = 1



tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Applying LoRA...

In [None]:
# Wrap the base model with LoRA adapters built from lora_config_obj; model_base is rebound to the wrapped model.
model_base = get_peft_model(model_base, get_peft_config(lora_config_obj)) # Using LoRA...

### Train on NLI

In [None]:
def train_nli(model_base, tokenizer_base, nli_dataset, loss_combination=None):
    """Tokenize the NLI pairs and fine-tune ``model_base`` on them.

    The dataset is shuffled, tokenized as text pairs with
    ``CustomDataTokenizer`` (8 worker processes) and passed to ``fit`` with
    all three loss weights set to 1.

    Args:
        model_base: Model to fine-tune.
        tokenizer_base: Tokenizer used for tokenization and padding.
        nli_dataset: Dataset with ``text1``, ``text2`` and ``label`` columns.
        loss_combination: Accepted but not used.

    Returns:
        Tuple ``(model, tokenizer, pooler)`` as returned by ``fit``.
    """
    shuffled = nli_dataset.shuffle()
    train_ds = shuffled.map(CustomDataTokenizer(tokenizer_base, is_classification=False), num_proc=8)

    loss_settings = {
        'w1': 1,
        'w2': 1,
        'w3': 1,
        'cosine_tau': 20,
        'ibn_tau': 20,
        'angle_tau': 1.0
    }
    fit_settings = dict(
        output_dir='chk/c',
        batch_size=5,
        epochs=5,
        learning_rate=2e-5,
        save_steps=0,
        eval_steps=100,
        warmup_steps=0,
        gradient_accumulation_steps=1,
        loss_kwargs=loss_settings,
        fp16=True,
        logging_steps=1000,
    )

    model_new, tokenizer_new, pooler = fit(
        train_ds=train_ds,
        model_base=model_base,
        tokenizer_base=tokenizer_base,
        **fit_settings
    )
    return (model_new, tokenizer_new, pooler)

In [None]:
# Run the NLI fine-tuning; the returned model, tokenizer and pooler are used for the STS evaluation below.
model_ft, tokenizer_ft, pooler_ft = train_nli(model_base, tokenizer_base, nli_dataset)

### Evaluate Spearman's Rank Correlation Coefficient

In [None]:
def calculate_cosine_similarity(sentence1_vec, sentence2_vec):
    """Return the cosine similarity of two vectors.

    Computed as the dot product divided by the product of the two Euclidean
    norms; no special handling is applied to zero-norm inputs.
    """
    dot_product = np.dot(sentence1_vec, sentence2_vec)
    norm_product = np.linalg.norm(sentence1_vec) * np.linalg.norm(sentence2_vec)
    return dot_product / norm_product

In [None]:
def calculate_Spearman_rank_correlation_coefficient(scores, scores_actual):
    """Return Spearman's rank correlation between two score sequences.

    Only the correlation coefficient from ``scipy.stats.spearmanr`` is
    returned; the p-value is discarded.
    """
    result = scipy.stats.spearmanr(scores, scores_actual)
    return result[0]

In [None]:
def encode(inputs: Union[List[str], Tuple[str], List[Dict], str],
            model,
            pooler,
            tokenizer,
            max_length: Optional[int] = 1024,
            to_numpy: bool = True,
            device: Optional[Any] = 'cuda:0'):
    """Embed ``inputs`` with ``pooler`` after tokenizing them.

    Args:
        inputs: Text or batch of texts handed to ``tokenizer``.
        model: Model that is moved to ``device`` and switched to eval mode.
        pooler: Callable mapping the tokenized batch to embeddings.
        tokenizer: Callable tokenizer; called with longest-padding, truncation
            and PyTorch tensors.
        max_length: Truncation length passed to the tokenizer.
        to_numpy: When True, return a float NumPy array on the CPU; otherwise
            return the pooler output unchanged.
        device: Target device; ``None`` means ``'cpu'``.

    Returns:
        The embeddings, as a NumPy array or as the raw pooler output.
    """
    target_device = 'cpu' if device is None else device
    model.to(target_device)
    model.eval()

    batch = tokenizer(
        inputs,
        padding='longest',
        max_length=max_length,
        truncation=True,
        return_tensors='pt')
    # The return value is ignored; the same batch object is pooled afterwards.
    batch.to(target_device)

    with torch.no_grad():
        embeddings = pooler(batch)

    if not to_numpy:
        return embeddings
    return embeddings.float().detach().cpu().numpy()

In [None]:
def generate_embeddings(dataset, model, tokenizer, pooler, is_sts=False):
    """Embed the text column(s) of ``dataset`` one sentence at a time.

    Args:
        dataset: Object indexable by ``'text1'`` (and ``'text2'`` when
            ``is_sts`` is True), each yielding an iterable of sentences.
        model, tokenizer, pooler: Forwarded to ``encode``.
        is_sts: When True, ``text2`` is embedded as well.

    Returns:
        The ``text1`` embedding array, or a tuple ``(text1, text2)`` of arrays
        when ``is_sts`` is True.
    """
    def embed_column(column):
        # encode returns a batch; keep its first (only) row for each sentence.
        vectors = [encode(sentence, model, pooler, tokenizer)[0] for sentence in dataset[column]]
        return np.array(vectors)

    emb_sentence_1 = embed_column('text1')
    if not is_sts:
        return emb_sentence_1
    return (emb_sentence_1, embed_column('text2'))

In [None]:
def test_sts(dataset, model, tokenizer, pooler):
    """Score a sentence-pair dataset by Spearman correlation of cosine similarities.

    Both text columns are embedded, the cosine similarity of each pair is
    computed, and the similarities are correlated with ``dataset['label']``.

    Returns:
        Spearman's rank correlation coefficient.
    """
    first_embeddings, second_embeddings = generate_embeddings(
        dataset, model, tokenizer, pooler, is_sts=True)

    predicted_scores = [
        calculate_cosine_similarity(first_embeddings[row], second_embeddings[row])
        for row in range(first_embeddings.shape[0])
    ]

    return calculate_Spearman_rank_correlation_coefficient(predicted_scores, dataset['label'])

In [None]:
def get_sts_dataset(dataset_name):
    """Load the test split of an STS benchmark and normalise its column names.

    Args:
        dataset_name: One of ``'STS-B'``, ``'STS12'``, ``'STS13'``,
            ``'STS14'``, ``'STS15'``, ``'STS16'`` or ``'SICK-R'``.

    Returns:
        The dataset with ``sentence1``/``sentence2``/``score`` renamed to
        ``text1``/``text2``/``label``.

    Raises:
        ValueError: If ``dataset_name`` is not a supported benchmark name.
    """
    hub_ids = {
        'STS-B': 'mteb/stsbenchmark-sts',
        'STS12': 'mteb/sts12-sts',
        'STS13': 'mteb/sts13-sts',
        'STS14': 'mteb/sts14-sts',
        'STS15': 'mteb/sts15-sts',
        'STS16': 'mteb/sts16-sts',
        'SICK-R': 'mteb/sickr-sts',
    }
    # Fail with a clear message instead of an UnboundLocalError further down.
    if dataset_name not in hub_ids:
        raise ValueError(
            f"Unknown STS dataset {dataset_name!r}; expected one of {sorted(hub_ids)}"
        )

    dataset = load_dataset(hub_ids[dataset_name], split='test')
    for old_name, new_name in (('sentence1', 'text1'), ('sentence2', 'text2'), ('score', 'label')):
        dataset = dataset.rename_column(old_name, new_name)
    return dataset

In [None]:
# Evaluate the fine-tuned model on the STS13 test split.
dataset = get_sts_dataset('STS13')
spearman = test_sts(dataset, model_ft, tokenizer_ft, pooler_ft)

In [None]:
# Display the Spearman correlation computed in the previous cell.
spearman