# Package Installation

In [1]:
!pip install transformers==4.30
!pip install datasets # huggingface dataset
!pip install bitsandbytes
!pip install peft
!pip install accelerate

[0m

In [2]:
! pip install nvidia-smi
import subprocess
if __name__ == "__main__":
    print(subprocess.check_output(['nvidia-smi']).decode('utf-8'))

[0mWed Jun 12 17:11:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-SXM2-16GB            On | 00000000:86:00.0 Off |                    0 |
| N/A   43C    P0               33W / 300W|      4MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                

# Preparation

## Import necessary packages

In [3]:
import os
import sys
import pandas as pd
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import Trainer
import torch.distributed as dist



from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, AutoModel
from transformers import set_seed
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPoolingAndCrossAttentions
from tqdm.notebook import trange, tqdm

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
)

from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
import numpy as np
from dataclasses import dataclass

# Data preparation

## Build set of paired samples

In [4]:
checkpoint_path = "Paper-submission/"

In [5]:
data_train = pd.read_csv(checkpoint_path + "01_train.csv", encoding = "ISO-8859-1")

In [6]:
# Load paper submission data and Journal's aims
data_train = pd.read_csv(checkpoint_path + "01_train.csv", encoding = "ISO-8859-1")
data_aims = pd.read_csv(checkpoint_path + "01_aims.csv", encoding = "ISO-8859-1")

data_train.fillna("", inplace=True)
data_aims.fillna("", inplace=True)

# merge two tables respect to Label and Index
merged_df = pd.merge(data_train[['Title', 'Abstract', 'Keywords', 'Label']], data_aims['Aims'], right_index=True, left_on='Label')

# construct set of pairs for contrastive fine-tuning
train_pairs = pd.DataFrame({'TAK': merged_df['Title'] + ' ' + merged_df['Abstract'] + ' ' + merged_df['Keywords'],
                            'Aims': data_aims['Aims']})
train_pairs.to_csv(checkpoint_path + "train_pairs.csv", index=False)

## Load saved pairs for training

In [7]:
data_args = {
    "train_file": checkpoint_path + "train_pairs.csv",
    "preprocessing_num_workers": None
}
data_files = {
    "train": data_args["train_file"]
}
tokenizer_kwargs = {
    "pretrained_path": 'roberta-base',
    "use_fast": True,
    "max_seq_length": 300,
    "pad_to_max_length": True,
    "truncation": True,
    "return_tensors": None
}

datasets = load_dataset("csv", data_files=data_files)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_kwargs["pretrained_path"],
    use_fast=tokenizer_kwargs["use_fast"]
)
column_names = datasets["train"].column_names

def prepare_features(examples):
    total = len(examples[column_names[0]])
    for idx in range(total):
        if examples[column_names[0]][idx] is None:
            examples[column_names[0]][idx] = " "
        if examples[column_names[1]][idx] is None:
            examples[column_names[1]][idx] = " "
    sentences = examples[column_names[0]] + examples[column_names[1]]
    sent_features = tokenizer(
            sentences,
            max_length=tokenizer_kwargs["max_seq_length"],
            truncation=True,
            padding="max_length" if tokenizer_kwargs["pad_to_max_length"] else False,
            return_tensors=tokenizer_kwargs["return_tensors"]
        )
    features = {}
    for key in sent_features:
        features[key] = [[sent_features[key][i], sent_features[key][i+total]] for i in range(total)]
    return features

train_dataset = datasets["train"].map(
    prepare_features,
    batched=True,
    num_proc=data_args["preprocessing_num_workers"],
    remove_columns=column_names
)

Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/298317 [00:00<?, ? examples/s]

# Model definition

In [8]:
class MLPLayer(nn.Module):
    """
    Head for getting sentence representations over RoBERTa/BERT's CLS representation.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = self.activation(x)

        return x

class Similarity(nn.Module):
    """
    Dot product or cosine similarity
    """

    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, x, y):
        return self.cos(x, y) / self.temp

class AttentionLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.key = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )

        self.query = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )

        self.value = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )

    def forward(self, source_pooler_outputs, target_outputs):
        target_pooler_output_list = []
        for idx, source_pooler_output in enumerate(source_pooler_outputs):
            concated = target_outputs
            num_target_outputs = len(concated)
            concated = torch.stack(concated, dim=1)

            K = self.key(concated)
            V = self.value(concated)
            Q = self.query(source_pooler_output)

            score_list = torch.bmm(K, Q.unsqueeze(dim=-1)).squeeze(dim=-1)
            score_list /= num_target_outputs
            score_list = F.softmax(score_list, dim=-1)
            V = torch.mul(V, score_list.unsqueeze(dim=-1))

            target_pooler_output = torch.sum(V, dim=1)
            target_pooler_output_list.append(target_pooler_output)

        target_pooler_outputs = torch.stack(target_pooler_output_list, dim=1)
        target_pooler_output = torch.mean(target_pooler_outputs, dim=1)

        pooler_output = torch.cat([source_pooler_outputs.pop(-1), target_pooler_output], dim=1)
        return pooler_output


## Pooler layer

In [9]:
class Pooler(nn.Module):
    """
    Parameter-free poolers to get the sentence embedding
    'cls': [CLS] representation with BERT/RoBERTa's MLP pooler.
    'cls_before_pooler': [CLS] representation without the original MLP pooler.
    'avg': average of the last layers' hidden states at each token.
    'avg_top2': average of the last two layers.
    'avg_first_last': average of the first and the last layers.
    """
    def __init__(self, pooler_type):
        super().__init__()
        self.pooler_type = pooler_type
        assert self.pooler_type in ["cls", "cls_before_pooler", "avg", "avg_top2", "avg_first_last"], "unrecognized pooling type %s" % self.pooler_type

    def forward(self, attention_mask, outputs):
        last_hidden = outputs.last_hidden_state
        hidden_states = outputs.hidden_states

        if self.pooler_type in ['cls_before_pooler', 'cls']:
            source_pooled_result = [hidden[:, 0] for hidden in hidden_states]
            target_pooled_result = [((hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)) for hidden in hidden_states]
            return source_pooled_result, target_pooled_result
        elif self.pooler_type == "avg":
            return ((last_hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1))
        elif self.pooler_type == "avg_first_last":
            first_hidden = hidden_states[0]
            last_hidden = hidden_states[-1]
            pooled_result = ((first_hidden + last_hidden) / 2.0 * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)
            return pooled_result
        elif self.pooler_type == "avg_top2":
            second_last_hidden = hidden_states[-2]
            last_hidden = hidden_states[-1]
            pooled_result = ((last_hidden + second_last_hidden) / 2.0 * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)
            return pooled_result
        else:
            raise NotImplementedError

# Model for contrastive leanring training

In [10]:
class ModelForCL(nn.Module):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, model, model_name_or_path, pooler_type):
        super(ModelForCL, self).__init__()
        self.roberta = model
        self.pooler_type = pooler_type
        self.pooler = Pooler(self.pooler_type)

        self.config  = AutoConfig.from_pretrained(model_name_or_path)
        self.attn = AttentionLayer(self.config)
        self.mlp = MLPLayer(self.config)


    def forward(self,
      input_ids=None,
      attention_mask=None,
      token_type_ids=None,
      position_ids=None,
      head_mask=None,
      inputs_embeds=None,
      labels=None,
      output_attentions=None,
      output_hidden_states=None,
      return_dict=None,
      sent_emb=False,
      mlm_input_ids=None,
      mlm_labels=None
      ):
        ori_input_ids = input_ids
        batch_size = input_ids.size(0)
        # Number of sentences in one instance
        # 2: pair instance; 3: pair instance with a hard negative
        num_sent = input_ids.size(1)

        mlm_outputs = None
        # Flatten input for encoding
        input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view((-1, token_type_ids.size(-1))) # (bs * num_sent, len)

        # Get raw embeddings
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=True if self.pooler_type in ['cls', 'avg_top2', 'avg_first_last'] else False,
            return_dict=True,
        )

        source_pooler_output, target_outputs = self.pooler(attention_mask, outputs)
        pooler_output = self.attn(source_pooler_output, target_outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1))) # (bs, num_sent, hidden)

        pooler_output = pooler_output.view((batch_size * num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)
        pooler_output = self.mlp(pooler_output)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)


        return pooler_output

## Contrastive Loss

In [11]:
class SupervisedContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.1):
        super(SupervisedContrastiveLoss, self).__init__()
        self.temperature = temperature
        self.sim = nn.CosineSimilarity()
    def _eval_denom(self, z1, z2):
        cosine_vals = []
        for v in z1:
            cosine_vals.append(self.sim(v.view(1,-1), z2)/self.temperature)
        cos_batch = torch.cat(cosine_vals, dim=0).view(z1.shape[0], -1)
        denom = torch.sum(torch.exp(cos_batch),dim=1)
        return denom
    def _contrastive_loss(self, z1, z2):
        num = torch.exp(self.sim(z1, z2)/self.temperature)
        denom = self._eval_denom(z1, z2)
        loss = -torch.mean(torch.log(num/denom))
        return loss
    def forward(self, z1, z2):
        return self._contrastive_loss(z1, z2)

# Training

In [12]:
from transformers import BitsAndBytesConfig
model = AutoModel.from_pretrained(
    'roberta-base',
    load_in_4bit=True,
    config=None,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=torch.float32,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
    torch_dtype=torch.float32
)

model = prepare_model_for_kbit_training(model)
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    return list(lora_module_names)
target_modules = find_all_linear_names(model)
print(target_modules)

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.1,
    bias="none",
    task_type="classification"
    )

model = get_peft_model(model, config)
from peft.tuners.lora import LoraLayer
for name, module in model.named_modules():
    if isinstance(module, LoraLayer):
        #module = module.to(torch.bfloat16)
        module = module.to(torch.float32)
    if 'norm' in name:
        module = module.to(torch.float32)
    if 'lm_head' in name or 'embed_tokens' in name:
        if hasattr(module, 'weight'):
            #module = module.to(torch.bfloat16)
            module = module.to(torch.float32)
model.print_trainable_parameters()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['dense', 'query', 'key', 'value']
trainable params: 1,339,392 || all params: 125,985,024 || trainable%: 1.0631


In [13]:
model_args = {
    'model': model,
    'model_name_or_path': 'roberta-base',
    'pooler_type': 'cls'
}
model = ModelForCL(**model_args)

In [14]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        x = {
            key: torch.tensor(val) for key, val in self.dataset[idx].items()
        }
        return x
    def __len__(self):
        return len(self.dataset)
dataset = Dataset(train_dataset)
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [15]:
decayRate = 0.86
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=2, gamma=decayRate)

loss_fn = SupervisedContrastiveLoss(0.1)

In [16]:
def batch2device(batch, device):
    for key, value in batch.items():
        batch[key] = batch[key].to(device)
    return batch

# GPU accelerator
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

ModelForCL(
  (roberta): PeftModel(
    (base_model): LoraModel(
      (model): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

In [17]:
epoch = 0
max_epochs = 10
save_path = checkpoint_path + 'saved_model/'
checkpoint_dir = checkpoint_path + "saved_model/"

if not os.path.exists(save_path):
    os.makedirs(save_path)
checkpoints = os.listdir(checkpoint_dir)
if len(checkpoints)!=0:
    checkpoint_name = checkpoints[-1]

    print(f"{checkpoint_name} found. Loading...")
    checkpoint_cl = torch.load(checkpoint_dir + checkpoint_name)
    model.load_state_dict(checkpoint_cl["model_state_dict"])
    min_loss = checkpoint_cl["min_loss"]
    epoch = checkpoint_cl["epoch"]
    print("Model loaded successfully.")
    print(f"Min loss at epoch {epoch}: {min_loss}")
    epoch+=1
else:
    print(f"Starting training from epoch {epoch}")
    min_loss = np.inf

for epoch in range(epoch,max_epochs):


    loop = tqdm(train_dataloader, leave=True)
    train_loss = 0.0


    for batch in loop:
        optimizer.zero_grad()


        if torch.cuda.is_available():
        # Transfer batch of samples to GPU
            inputs = batch2device(batch, device)

        outputs = model(**inputs)

        # Separate representation
        z1, z2 = outputs[:,0], outputs[:,1]


        # backward
        loss = loss_fn(z1, z2)
        loss.backward()
        train_loss += loss.item()
        # Update Weights
        optimizer.step()

        loop.set_description('Epoch: {} - lr:{}'.format(epoch, optimizer.param_groups[0]['lr']))
        loop.set_postfix(loss=loss.item())
    train_loss = train_loss / len(train_dataloader)
    lr_scheduler.step()
    if min_loss > train_loss:
        print(f">> Loss Decreased({min_loss:.6f}--->{train_loss:.6f})")
        min_loss = train_loss
        torch.save({
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "min_loss": min_loss,
            "epoch": epoch
        }, save_path + "Epoch:{:0>2} SupCL-RoBERTa.pth".format(epoch))


Epoch:05 SupCL-RoBERTa.pth found. Loading...
Model loaded successfully.
Min loss at epoch 5: 2.771908940128297


  0%|          | 0/18645 [00:00<?, ?it/s]

>> Loss Decreased(2.771909--->2.771375)


  0%|          | 0/18645 [00:00<?, ?it/s]

>> Loss Decreased(2.771375--->2.770707)


  0%|          | 0/18645 [00:00<?, ?it/s]

>> Loss Decreased(2.770707--->2.769800)


  0%|          | 0/18645 [00:00<?, ?it/s]

>> Loss Decreased(2.769800--->2.769392)
