# Package preparation

In [1]:
# Install the third-party libraries used by this notebook.
# NOTE(review): only transformers is version-pinned here; consider pinning the
# other packages as well so a fresh environment reproduces the same results.
!pip install transformers==4.30
!pip install datasets # huggingface dataset
!pip install bitsandbytes
!pip install peft
!pip install accelerate

[0m

In [2]:

# Print the GPU status table by running the `nvidia-smi` executable and decoding its stdout.
# NOTE(review): the call below goes through subprocess, so the pip package
# installed on the next line is presumably not required for it — confirm.
! pip install nvidia-smi
import subprocess
if __name__ == "__main__":
    print(subprocess.check_output(['nvidia-smi']).decode('utf-8'))

[0mSun Jun 16 07:10:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-SXM2-16GB            On | 00000000:1C:00.0 Off |                    0 |
| N/A   43C    P0               34W / 300W|      4MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                

## Import necessary packages

In [3]:
import os
import sys
import pandas as pd
from typing import List
from numpy import ndarray

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import Trainer
import torch.distributed as dist



from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, AutoModel
from transformers import set_seed
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPoolingAndCrossAttentions
from tqdm.notebook import trange, tqdm

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
)

from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
import numpy as np
from dataclasses import dataclass

## Some useful functions

In [4]:
def sim_matrix(a, b, eps=1e-8):
    """
    Pairwise cosine similarity between the rows of two 2-D tensors.

    Args:
        a: Tensor of shape [n, d].
        b: Tensor of shape [m, d].
        eps: lower bound applied to every row norm before dividing, so that
            all-zero rows do not trigger a division by zero.
    Returns:
        Tensor of shape [n, m]; entry (i, j) is the cosine similarity of a[i] and b[j].
    """
    a_unit = a / a.norm(dim=1, keepdim=True).clamp(min=eps)
    b_unit = b / b.norm(dim=1, keepdim=True).clamp(min=eps)
    return a_unit @ b_unit.t()

def batch2device(batch, device):
    """
    Move every tensor of a batch onto the given device.

    The dictionary is updated in place and the same object is returned.

    Args:
        batch: Dict[str, Tensor], transformer inputs (input_ids, attention_mask)
        device: torch.device, GPU or CPU
    Returns:
        The same dictionary, with each value replaced by its `.to(device)` result.
    """
    for name in list(batch.keys()):
        batch[name] = batch[name].to(device)
    return batch

def count_parameters(model):
    """Return the total number of elements over all parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data preparation

In [5]:
# Root folder of the project; the CSV splits, the contrastive-learning checkpoint
# and the classifier checkpoints are all read from / written to paths built on it.
checkpoint_path = "Paper-submission/"

In [6]:
def _read_split(file_name):
    """Read one CSV from `checkpoint_path` and replace every missing value with an empty string."""
    frame = pd.read_csv(checkpoint_path + file_name, encoding = "ISO-8859-1")
    return frame.fillna("")


# Train / validation / test papers, plus one row per journal aims description
data_train = _read_split("01_train.csv")
data_validate = _read_split("01_validate.csv")
data_test = _read_split("01_test.csv")
data_aims = _read_split("01_aims.csv")

# One class per row of the aims table
n_classes = len(data_aims)

## Feature selection

In [7]:
def _tak_text(frame):
    """Join the Title, Abstract and Keywords columns of each row into one space-separated string."""
    combined = frame['Title'] + " " + frame['Abstract'] + " " + frame['Keywords']
    return combined.tolist()


# Model inputs: one concatenated Title/Abstract/Keywords string per paper
X_train = _tak_text(data_train)
X_valid = _tak_text(data_validate)
X_test = _tak_text(data_test)

# Text of each journal's aims, in the row order of the aims table
X_aims = data_aims["Aims"].tolist()

# Targets: the Label column of each split
Y_train = data_train['Label'].tolist()
Y_validate = data_validate['Label'].tolist()
Y_test = data_test['Label'].tolist()

## Tokenization

In [8]:
# Tokenizer of the roberta-base checkpoint; besides the cells below it is also
# read as a module-level global inside ModelForCL.encode.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")



In [9]:
# All three splits are tokenized with the same settings: truncate / pad every
# text to exactly 300 tokens and return PyTorch tensors.
_encode_kwargs = dict(
    truncation=True,
    padding="max_length",
    max_length=300,
    return_tensors="pt",
)

train_encodings = tokenizer(X_train, **_encode_kwargs)
valid_encodings = tokenizer(X_valid, **_encode_kwargs)
test_encodings = tokenizer(X_test, **_encode_kwargs)

## Data loader

In [10]:
class Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name (e.g. input_ids, attention_mask) to an
            indexable collection holding one entry per sample (tensor or nested list).
        labels: sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of `value`.

        Calling `torch.tensor` on something that is already a tensor emits a
        UserWarning for every item; `clone().detach()` produces the same
        detached copy without the warning.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return `(features, label)` for sample `idx`; features is a dict of tensors."""
        x = {
            key: self._as_tensor(val[idx]) for key, val in self.encodings.items()
        }
        y = torch.tensor(self.labels[idx])
        return x, y

    def __len__(self):
        """Number of samples, defined by the number of labels."""
        return len(self.labels)

In [11]:
# Wrap each split's encodings together with its labels in a Dataset
train_dataset = Dataset(train_encodings, Y_train)
valid_dataset = Dataset(valid_encodings, Y_validate)
test_dataset = Dataset(test_encodings, Y_test)

In [12]:
# Data loaders: training uses batches of 16, validation and test batches of 8.
# Training and validation batches are shuffled; the test order is kept fixed.
_DataLoader = torch.utils.data.DataLoader

train_loader = _DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = _DataLoader(valid_dataset, batch_size=8, shuffle=True)
test_loader = _DataLoader(test_dataset, batch_size=8, shuffle=False)

# Model definition

In [13]:
class MLPLayer(nn.Module):
    """
    Projection head: a linear map from 2 * hidden_size features down to
    hidden_size, followed by a tanh non-linearity.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, features, **kwargs):
        """Apply the dense layer and the tanh; extra keyword arguments are ignored."""
        return self.activation(self.dense(features))

class Similarity(nn.Module):
    """
    Cosine similarity along the last dimension, divided by a temperature.
    """

    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, x, y):
        """Return cos(x, y) / temp, computed over the last dimension."""
        cosine = self.cos(x, y)
        return cosine / self.temp

class AttentionLayer(nn.Module):
    """
    Attend from each source vector over a set of target vectors and fuse the result.

    For every source vector a query is built; keys and values are built from the
    stacked target vectors. The attended target summaries are averaged over all
    sources and concatenated to the last source vector.
    """

    def __init__(self, config):
        super().__init__()
        self.key = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )

        self.query = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )

        self.value = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )

    def forward(self, source_pooler_outputs, target_outputs):
        """
        Args:
            source_pooler_outputs: list of tensors, each [batch, hidden_size]
                (2-D, as required by the batched matrix product below).
                The list is NOT modified by this call.
            target_outputs: list of tensors, each [batch, hidden_size].
        Returns:
            Tensor [batch, 2 * hidden_size]: the last source vector concatenated
            with the attended target summary averaged over all sources.
        """
        num_target_outputs = len(target_outputs)
        # [batch, num_targets, hidden]; stacking is deterministic, so do it once
        stacked_targets = torch.stack(target_outputs, dim=1)

        target_pooler_output_list = []
        for source_pooler_output in source_pooler_outputs:
            # K and V are deliberately recomputed for every source: nn.RReLU draws
            # new random slopes on each call in training mode, so reusing one
            # K/V across sources would change training behavior.
            K = self.key(stacked_targets)
            V = self.value(stacked_targets)
            Q = self.query(source_pooler_output)

            # One score per target: [batch, num_targets]
            score_list = torch.bmm(K, Q.unsqueeze(dim=-1)).squeeze(dim=-1)
            score_list = score_list / num_target_outputs
            score_list = F.softmax(score_list, dim=-1)

            # Attention-weighted sum of the values: [batch, hidden]
            weighted_values = torch.mul(V, score_list.unsqueeze(dim=-1))
            target_pooler_output_list.append(torch.sum(weighted_values, dim=1))

        # Average the attended summaries over all sources
        target_pooler_outputs = torch.stack(target_pooler_output_list, dim=1)
        target_pooler_output = torch.mean(target_pooler_outputs, dim=1)

        # Read the last source by index instead of pop(-1), so the caller's list
        # keeps all its elements and can be reused.
        pooler_output = torch.cat([source_pooler_outputs[-1], target_pooler_output], dim=1)
        return pooler_output


## Pooler layer

In [14]:
class Pooler(nn.Module):
    """
    Parameter-free pooling of encoder outputs into sentence-level vectors.

    'cls' / 'cls_before_pooler': returns two lists with one entry per layer in
        `hidden_states` — the first-token vectors and the attention-masked means.
    'avg': attention-masked mean of the last hidden state.
    'avg_top2': attention-masked mean of the average of the last two layers.
    'avg_first_last': attention-masked mean of the average of the first and last layers.
    """
    def __init__(self, pooler_type):
        super().__init__()
        self.pooler_type = pooler_type
        assert self.pooler_type in ["cls", "cls_before_pooler", "avg", "avg_top2", "avg_first_last"], "unrecognized pooling type %s" % self.pooler_type

    @staticmethod
    def _masked_mean(hidden, attention_mask):
        """Average `hidden` over the sequence axis, counting only positions where the mask is set."""
        return (hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)

    def forward(self, attention_mask, outputs):
        """
        Args:
            attention_mask: mask multiplied into the hidden states before averaging.
            outputs: encoder output exposing `last_hidden_state` and `hidden_states`.
        Returns:
            A `(cls_vectors, masked_means)` pair of per-layer lists for the cls
            modes, otherwise a single pooled tensor.
        """
        last_hidden = outputs.last_hidden_state
        hidden_states = outputs.hidden_states
        mode = self.pooler_type

        if mode in ('cls_before_pooler', 'cls'):
            cls_per_layer = [layer[:, 0] for layer in hidden_states]
            mean_per_layer = [self._masked_mean(layer, attention_mask) for layer in hidden_states]
            return cls_per_layer, mean_per_layer

        if mode == "avg":
            return self._masked_mean(last_hidden, attention_mask)

        if mode == "avg_first_last":
            blended = (hidden_states[0] + hidden_states[-1]) / 2.0
            return self._masked_mean(blended, attention_mask)

        if mode == "avg_top2":
            blended = (hidden_states[-1] + hidden_states[-2]) / 2.0
            return self._masked_mean(blended, attention_mask)

        raise NotImplementedError

## Sentence Embedder

In [15]:
class ModelForCL(nn.Module):
    """
    Sentence embedder: a transformer encoder followed by the Pooler, the
    AttentionLayer and the MLPLayer defined in this notebook.

    Args:
        model: the encoder; it is called with the usual transformer keyword
            arguments and must return an object exposing `last_hidden_state`
            and `hidden_states`.
        model_name_or_path: name or path handed to `AutoConfig.from_pretrained`
            to obtain the config (hidden size) for the attention and MLP heads.
        pooler_type: pooling mode forwarded to `Pooler`.
    """
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    # Pooling modes whose Pooler branch reads `outputs.hidden_states`.
    # 'cls_before_pooler' shares the Pooler branch of 'cls', so it needs the
    # per-layer hidden states as well.
    _POOLERS_NEEDING_HIDDEN_STATES = ('cls', 'cls_before_pooler', 'avg_top2', 'avg_first_last')

    def __init__(self, model, model_name_or_path, pooler_type):
        super(ModelForCL, self).__init__()
        self.roberta = model
        self.pooler_type = pooler_type
        self.pooler = Pooler(self.pooler_type)

        self.config  = AutoConfig.from_pretrained(model_name_or_path)
        self.attn = AttentionLayer(self.config)
        self.mlp = MLPLayer(self.config)


    def forward(self,
      input_ids=None,
      attention_mask=None,
      token_type_ids=None,
      position_ids=None,
      head_mask=None,
      inputs_embeds=None,
      labels=None,
      output_attentions=None,
      output_hidden_states=None,
      return_dict=None,
      sent_emb=False,
      mlm_input_ids=None,
      mlm_labels=None
      ):
        """
        Encode a batch and compute its pooled sentence representation.

        `labels`, `output_hidden_states`, `return_dict`, `sent_emb`,
        `mlm_input_ids` and `mlm_labels` are accepted but not used: whether
        hidden states are requested depends only on `self.pooler_type`, and the
        encoder is always called with `return_dict=True`.

        Returns:
            BaseModelOutputWithPoolingAndCrossAttentions carrying the MLP output
            as `pooler_output` plus the encoder's `last_hidden_state` and
            `hidden_states`.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=self.pooler_type in self._POOLERS_NEEDING_HIDDEN_STATES,
            return_dict=True,
        )

        # NOTE(review): only the cls-style pooling modes return the
        # (source, target) pair unpacked here; the 'avg*' modes return a single
        # tensor from Pooler and would not fit this unpacking — confirm whether
        # those modes are meant to be supported by this class.
        source_pooler_output, target_pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = self.attn(source_pooler_output, target_pooler_output)
        pooler_output = self.mlp(pooler_output)


        return  BaseModelOutputWithPoolingAndCrossAttentions(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states
        )
    def encode(self, sentences: Union[str, List[str]],
               batch_size: int = 8,
               show_progress_bar: bool = None,
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None) -> Union[List[Tensor], ndarray, Tensor]:
        """
        Embed one sentence or a list of sentences without tracking gradients.

        The model is switched to eval mode and moved to `device` as a side
        effect. Tokenization relies on the module-level `tokenizer` and uses a
        fixed length of 300 tokens.

        Args:
            sentences: a single string or a list of strings.
            batch_size: number of sentences encoded per forward pass.
            show_progress_bar: the progress bar is shown only when this is truthy.
            convert_to_numpy: return a numpy array (ignored when
                `convert_to_tensor` is set).
            convert_to_tensor: return one stacked CPU tensor.
            device: target device; defaults to CUDA when available, else CPU.
        Returns:
            The embeddings as a tensor, numpy array or list of CPU tensors; for
            a single-string input only the first embedding is returned.
        """
        self.eval()

        if convert_to_tensor:
            convert_to_numpy = False

        input_was_string = False

        if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1
            sentences = [sentences]
            input_was_string = True

        if device is None:
            device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.to(device)

        all_embeddings = []
        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
            sentence_batch = sentences[start_index: start_index+batch_size]
            features = tokenizer(sentence_batch,
                       padding='max_length',
                       truncation=True,
                       max_length=300,
                       return_tensors='pt').to(device)

            with torch.no_grad():
                out_features = self.forward(**features)
                embeddings = []
                # gather the embedding vectors (one CPU tensor per sentence)
                for row in out_features.pooler_output:
                    embeddings.append(row.cpu())
                all_embeddings.extend(embeddings)
        if convert_to_tensor:
            all_embeddings = torch.vstack(all_embeddings)
        elif convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])

        if input_was_string:
            all_embeddings = all_embeddings[0]
        return all_embeddings

## Load fine-tuned LM

In [16]:
from transformers import BitsAndBytesConfig
# Load the roberta-base encoder with 4-bit NF4 quantization (double quantization
# enabled, float32 compute dtype).
# NOTE(review): load_in_4bit is passed both as a direct keyword and inside
# quantization_config; confirm this combination is still accepted before moving
# off the pinned transformers==4.30.
model = AutoModel.from_pretrained(
    'roberta-base',
    load_in_4bit=True,
    config=None,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=torch.float32,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
    torch_dtype=torch.float32
)

# Prepare the quantized model for k-bit (here 4-bit) fine-tuning with PEFT
model = prepare_model_for_kbit_training(model)
def find_all_linear_names(model):
    """
    Collect the attribute names of all bitsandbytes 4-bit linear layers in `model`.

    Only the last component of each module's dotted name is kept (e.g. 'query'),
    and duplicates are removed.

    Returns:
        List[str] of unique layer names, in no particular order.
    """
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, bnb.nn.Linear4bit)
    }
    return list(leaf_names)
# LoRA adapters are attached to every 4-bit linear layer name found in the model
target_modules = find_all_linear_names(model)
print(target_modules)

# LoRA hyper-parameters: rank 8, alpha 32, dropout 0.1, no bias training.
# NOTE(review): confirm that "classification" is a task_type value PEFT recognizes.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.1,
    bias="none",
    task_type="classification"
    )

model = get_peft_model(model, config)
from peft.tuners.lora import LoraLayer
# Cast the LoRA layers (and any norm / lm_head / embed_tokens modules) to float32.
# NOTE(review): the printed model lists its normalisation modules as "LayerNorm",
# so the case-sensitive 'norm' test below may never match; likewise the loading
# log reports the lm_head weights as unused — confirm these branches are reached.
for name, module in model.named_modules():
    if isinstance(module, LoraLayer):
        #module = module.to(torch.bfloat16)
        module = module.to(torch.float32)
    if 'norm' in name:
        module = module.to(torch.float32)
    if 'lm_head' in name or 'embed_tokens' in name:
        if hasattr(module, 'weight'):
            #module = module.to(torch.bfloat16)
            module = module.to(torch.float32)
# Report how many parameters remain trainable after wrapping with LoRA
model.print_trainable_parameters()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['query', 'value', 'dense', 'key']
trainable params: 1,339,392 || all params: 125,985,024 || trainable%: 1.0631


In [17]:
# Fine-tuned LM checkpoint (by contrastive learning)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint_cl = torch.load(checkpoint_path + "saved_model/Epoch_09 SupCL-RoBERTa.pth")

# Wrap the LoRA-adapted encoder in the sentence embedder, using 'cls' pooling
model_args = {
    'model': model,
    'model_name_or_path': 'roberta-base',
    'pooler_type': 'cls'
}
base_model = ModelForCL(**model_args)
# Restore the weights saved under the "model_state_dict" key of the checkpoint
base_model.load_state_dict(checkpoint_cl["model_state_dict"])

<All keys matched successfully>

## Model for downstream task

In [18]:
class WithAim_Classifier(nn.Module):
    """
    Classifier that combines a paper's encoded text with its similarity to every
    aims embedding.

    The first-token vector of the encoder's last hidden state is projected to
    512 features; the aims embeddings are projected to 512 features by a second
    layer; the cosine similarities between the two projections are appended to
    the paper features and mapped to log-probabilities over the classes.
    """

    def __init__(self, base_model, num_classes):
        super().__init__()
        self.base_model = base_model

        # Branch 1: paper (Title/Abstract/Keywords) features
        self.linear1_1 = nn.Linear(768, 512)
        self.act1_1 = nn.ReLU()
        self.drop1_1 = nn.Dropout(0.1)

        # Branch 2: aims features
        self.linear2_1 = nn.Linear(768, 512)
        self.act2_1 = nn.ReLU()

        # Head over [paper features ; one similarity per aims row]
        self.linear_main_1 = nn.Linear(512+num_classes, num_classes)
        self.act_main_1 = nn.LogSoftmax(dim=1)

    def forward(self, inputs_tak, inputs_aims):
        '''
        Args:
            inputs_tak: (Dict) keyword arguments passed to the base model
                (tokenized Title/Abstract/Keywords batch).
            inputs_aims: (Tensor or None) aims embeddings of shape
                [num_classes, 768]; the number of rows must equal num_classes
                for the concatenation to match the final layer.
        Returns:
            Log-probabilities of shape [bs, num_classes], or the 512-d paper
            features of shape [bs, 512] when inputs_aims is None.
        '''
        encoded = self.base_model(**inputs_tak)
        cls_state = encoded.last_hidden_state[:, 0, :]  # first-token vectors
        tak_feats = self.drop1_1(self.act1_1(self.linear1_1(cls_state)))

        if inputs_aims is None:
            return tak_feats

        aims_feats = self.act2_1(self.linear2_1(inputs_aims))

        # [bs, num_aims] cosine similarities, appended to the paper features
        similarity = sim_matrix(tak_feats, aims_feats)
        fused = torch.cat((tak_feats, similarity), dim=1)

        return self.act_main_1(self.linear_main_1(fused))

In [19]:
# Build the downstream classifier on top of the contrastively fine-tuned embedder.
# Note: from here on `model` refers to the classifier, no longer to the bare encoder.
model = WithAim_Classifier(base_model, n_classes)
model.to(device)

WithAim_Classifier(
  (base_model): ModelForCL(
    (roberta): PeftModel(
      (base_model): LoraModel(
        (model): RobertaModel(
          (embeddings): RobertaEmbeddings(
            (word_embeddings): Embedding(50265, 768, padding_idx=1)
            (position_embeddings): Embedding(514, 768, padding_idx=1)
            (token_type_embeddings): Embedding(1, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): RobertaEncoder(
            (layer): ModuleList(
              (0-11): 12 x RobertaLayer(
                (attention): RobertaAttention(
                  (self): RobertaSelfAttention(
                    (query): lora.Linear4bit(
                      (base_layer): Linear4bit(in_features=768, out_features=768, bias=True)
                      (lora_dropout): ModuleDict(
                        (default): Dropout(p=0.1, inplace=False)
                      )


# Training

First, we encode the Aims & Scopes texts into embedding vectors; these serve as external features during training.

In [20]:
# Embed every aims text once with the base model (encode runs without gradients
# and returns a CPU tensor), then place the result on the active device.
aims_embeddings = base_model.encode(X_aims, show_progress_bar=True, convert_to_tensor=True)
aims_embeddings = aims_embeddings.to(device)

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

## Optimizer and Loss function

In [21]:
# Optimizer: AdamW over all classifier parameters; the learning rate is
# multiplied by 0.96 every time lr_scheduler.step() is called.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.96)

# Loss function: negative log-likelihood, matching the LogSoftmax output of the classifier
loss_fn = nn.NLLLoss().to(device)

## Training settings

In [22]:
# Number of epochs to train for and the k values at which accuracy@k is reported
max_epochs = 10
topks = [1, 3, 5, 10]
# Per-epoch metrics; each "*_acc@k" entry is a dict mapping k -> accuracy
history = {
    "train_loss": [],
    "val_loss": [],
    "train_acc@k": [],
    "val_acc@k": [],
}
# Best validation loss seen so far; a checkpoint is written whenever it improves
min_valid_loss = np.inf

In [23]:
# Folder holding the classifier checkpoints; created on first use
checkpoint_dir = checkpoint_path + "weight/taks/"
os.makedirs(checkpoint_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order and may include unrelated
# entries, so keep only checkpoint files and sort them: the epoch number in the
# file name is zero-padded, hence the last name is the most recent epoch.
checkpoints = sorted(
    name for name in os.listdir(checkpoint_dir) if name.endswith(".pth")
)

if len(checkpoints)!=0:
    checkpoint_name = checkpoints[-1]

    print(f"{checkpoint_name} found. Loading...")
    # map_location places the stored tensors on the active device regardless of
    # the device they were saved from.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint_cl = torch.load(checkpoint_dir + checkpoint_name, map_location=device)
    model.load_state_dict(checkpoint_cl["model_state_dict"])
    optimizer.load_state_dict(checkpoint_cl['optimizer_state_dict'])
    history = checkpoint_cl['history']
    epoch = checkpoint_cl["epoch"]
    train_loss = history["train_loss"][-1]
    # Checkpoints are only written when the validation loss improves, so the
    # last recorded validation loss is the best one so far.
    min_valid_loss = history["val_loss"][-1]
    print("Model loaded successfully.")
    print("\tTraining loss: {}".format(train_loss))
    print("\tValidating loss: {}".format(min_valid_loss))
    print("\n")
    for k in topks:
        print("\tTrain accuracy@{}: {}".format(k, history["train_acc@k"][-1][k]))
    print("\n")
    for k in topks:
        print("\tValidate accuracy@{}: {}".format(k, history["val_acc@k"][-1][k]))
    # Resume with the epoch after the one stored in the checkpoint
    epoch+=1
else:
    epoch = 0
    print(f"Starting training from epoch {epoch}")
    # NOTE(review): min_loss is not read anywhere in the visible notebook
    # (the training loop tracks min_valid_loss) — kept in case other code uses it.
    min_loss = np.inf

Epoch:03 Roberta_TAKS.pth found. Loading...
Model loaded successfully.
	Training loss: 1.566909830270999
	Validating loss: 1.563173109579213


	Train accuracy@1: 0.49676686209636056
	Train accuracy@3: 0.7818562133569323
	Train accuracy@5: 0.8668195241974141
	Train accuracy@10: 0.9425946224988854


	Validate accuracy@1: 0.49208073128789936
	Validate accuracy@3: 0.7812471716897457
	Validate accuracy@5: 0.8677406703472411
	Validate accuracy@10: 0.9435544694844179


## Training loop

In [24]:
# Largest cut-off among the requested k values: one top-k call per batch then
# serves every accuracy@k.
max_k = max(topks)

for epoch in range(epoch,max_epochs):
    train_loss = 0.0
    train_loop = tqdm(train_loader, leave=True)
    # Accuracy@k of the most recent batch (shown in the progress bar only)
    batch_train_accuracy = {k: 0 for k in topks}
    batch_valid_accuracy = {k: 0 for k in topks}
    # Number of correct predictions accumulated over the whole epoch, per split and k
    num_correct_at_k = {
        "train": {k: 0 for k in topks},
        "val": {k: 0 for k in topks}
    }
    # Training
    model.train()

    for features, labels in train_loop:

        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            features, labels = batch2device(features, device), labels.to(device)
        # forward pass
        logits = model(features, aims_embeddings)
        # Clear the gradients
        optimizer.zero_grad()
        # Find the Loss
        loss = loss_fn(logits, labels)
        # Calculate gradients
        loss.backward()
        # Update Weights
        optimizer.step()
        # Calculate accuracy: rank the classes once per batch and compare against
        # the labels in a single vectorized operation. Ranking the log-probabilities
        # directly gives the same order as ranking their exponentials.
        top_classes = torch.topk(logits.detach(), k=min(max_k, logits.size(1)), dim=1).indices
        hits = top_classes.eq(labels.unsqueeze(1))
        nPoints = len(labels)
        for k in topks:
            batch_num_correct = int(hits[:, :k].any(dim=1).sum())
            num_correct_at_k["train"][k] += batch_num_correct # running count over the whole training set
            batch_train_accuracy[k] = batch_num_correct / nPoints
        # Calculate Loss
        train_loss += loss.item()
        train_loop.set_description('Epoch: {0} - lr: {1}, Training'.format(epoch, optimizer.param_groups[0]['lr']))
        train_loop.set_postfix(train_loss=loss.item(),
                               top01=batch_train_accuracy[1],
                               top03=batch_train_accuracy[3],
                               top05=batch_train_accuracy[5],
                               top10=batch_train_accuracy[10])
    train_loss = train_loss/len(train_loader)
    history["train_loss"].append(train_loss)
    history["train_acc@k"].append(
        {k: val/len(X_train) for k, val in num_correct_at_k["train"].items()}
    )

    # Validation
    valid_loss = 0.0
    valid_loop = tqdm(valid_loader, leave=True)
    with torch.no_grad():
        model.eval()
        for features, labels in valid_loop:

            # Transfer Data to GPU if available
            if torch.cuda.is_available():
                features, labels = batch2device(features, device), labels.to(device)
            # Forward pass
            logits = model(features, aims_embeddings)

            # Find the Loss
            loss = loss_fn(logits, labels)
            # Calculate accuracy (same vectorized top-k comparison as in training)
            top_classes = torch.topk(logits, k=min(max_k, logits.size(1)), dim=1).indices
            hits = top_classes.eq(labels.unsqueeze(1))
            nPoints = len(labels)
            for k in topks:
                num_correct = int(hits[:, :k].any(dim=1).sum())
                num_correct_at_k["val"][k] += num_correct # running count over the whole validation set
                batch_valid_accuracy[k] = num_correct / nPoints
            # Calculate Loss
            valid_loss += loss.item()
            valid_loop.set_description('Epoch: {0} - lr: {1}, Validating'.format(epoch, optimizer.param_groups[0]['lr']))
            valid_loop.set_postfix(val_loss=loss.item(),
                                val_top01=batch_valid_accuracy[1],
                                val_top03=batch_valid_accuracy[3],
                                val_top05=batch_valid_accuracy[5],
                                val_top10=batch_valid_accuracy[10])
        valid_loss = valid_loss/len(valid_loader)
        history["val_loss"].append(valid_loss)
        history["val_acc@k"].append(
            {k: val/len(X_valid) for k, val in num_correct_at_k["val"].items()}
        )
        print(f'>> Epoch {epoch} \t\t Training Loss: {train_loss} \t\t Validation Loss: {valid_loss}')
        # Decay the learning rate once per epoch
        lr_scheduler.step()

        # Keep a checkpoint only when the validation loss improves
        if min_valid_loss > valid_loss:
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
            min_valid_loss = valid_loss
            # Saving State Dict
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'history': history,
                    'epoch': epoch
                }, checkpoint_dir + "Epoch:{:0>2} Roberta_TAKS.pth".format(epoch)
            )

  0%|          | 0/18645 [00:00<?, ?it/s]

  key: torch.tensor(val[idx]) for key, val in self.encodings.items()


  0%|          | 0/4144 [00:00<?, ?it/s]

>> Epoch 4 		 Training Loss: 1.4840220895568625 		 Validation Loss: 1.521068972726011
Validation Loss Decreased(1.563173--->1.521069) 	 Saving The Model


  0%|          | 0/18645 [00:00<?, ?it/s]

  0%|          | 0/4144 [00:00<?, ?it/s]

>> Epoch 5 		 Training Loss: 1.4173233794379598 		 Validation Loss: 1.4834763264484905
Validation Loss Decreased(1.521069--->1.483476) 	 Saving The Model


  0%|          | 0/18645 [00:00<?, ?it/s]

  0%|          | 0/4144 [00:00<?, ?it/s]

>> Epoch 6 		 Training Loss: 1.358720708235431 		 Validation Loss: 1.4724285894587752
Validation Loss Decreased(1.483476--->1.472429) 	 Saving The Model


  0%|          | 0/18645 [00:00<?, ?it/s]

  0%|          | 0/4144 [00:00<?, ?it/s]

>> Epoch 7 		 Training Loss: 1.3097900385254349 		 Validation Loss: 1.450219482575403
Validation Loss Decreased(1.472429--->1.450219) 	 Saving The Model


  0%|          | 0/18645 [00:00<?, ?it/s]

  0%|          | 0/4144 [00:00<?, ?it/s]

>> Epoch 8 		 Training Loss: 1.266098761970922 		 Validation Loss: 1.4450662347333485
Validation Loss Decreased(1.450219--->1.445066) 	 Saving The Model


  0%|          | 0/18645 [00:00<?, ?it/s]

  0%|          | 0/4144 [00:00<?, ?it/s]

>> Epoch 9 		 Training Loss: 1.225402362473243 		 Validation Loss: 1.4348648520776262
Validation Loss Decreased(1.445066--->1.434865) 	 Saving The Model


# Testing

In [32]:
# Load the most recent checkpoint for testing. os.listdir returns entries in
# arbitrary order, so keep only checkpoint files and sort them: the epoch in the
# file name is zero-padded, hence the last name is the latest saved epoch.
checkpoints = sorted(
    name for name in os.listdir(checkpoint_dir) if name.endswith(".pth")
)
if not checkpoints:
    raise FileNotFoundError("No .pth checkpoint found in " + checkpoint_dir)
checkpoint_name = checkpoints[-1]
# map_location places the stored tensors on the active device.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
checkpoint = torch.load(checkpoint_dir + checkpoint_name, map_location=device)


# Rebuild the classifier around the same base_model object and restore its weights
model = WithAim_Classifier(base_model, n_classes)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

# Metrics recorded up to (and including) the checkpointed epoch
history = checkpoint['history']

In [33]:
# Loss function
loss_fn = nn.NLLLoss().to(device)

# Test
topks = [1, 3, 5, 10]
max_k = max(topks)
num_correct_at_k = {}
test_loop = tqdm(test_loader, leave=True)
# Number of correct predictions accumulated over the whole test set, per k
num_correct_at_k["test"] = {k: 0 for k in topks}
# Accuracy@k of the most recent batch (shown in the progress bar only)
batch_test_accuracy = {k: 0 for k in topks}
history["test_acc@k"] = []
history["test_loss"] = []
test_loss = 0.0

with torch.no_grad():
    model.eval()
    for features, labels in test_loop:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            features, labels = batch2device(features, device), labels.to(device)
        logits = model(features, aims_embeddings)
        # Find the Loss
        loss = loss_fn(logits, labels)
        # Calculate accuracy: rank the classes once per batch and compare against
        # the labels in a single vectorized operation. Ranking the log-probabilities
        # directly gives the same order as ranking their exponentials.
        top_classes = torch.topk(logits, k=min(max_k, logits.size(1)), dim=1).indices
        hits = top_classes.eq(labels.unsqueeze(1))
        nPoints = len(labels)
        for k in topks:
            num_correct = int(hits[:, :k].any(dim=1).sum())
            num_correct_at_k["test"][k] += num_correct # running count over the whole test set
            batch_test_accuracy[k] = num_correct / nPoints
        # Calculate Loss
        test_loss += loss.item()
        test_loop.set_description('Testing...')
        test_loop.set_postfix(test_loss=loss.item(),
                            test_top01=batch_test_accuracy[1],
                            test_top03=batch_test_accuracy[3],
                            test_top05=batch_test_accuracy[5],
                            test_top10=batch_test_accuracy[10])
    test_loss = test_loss/len(test_loader)
    history["test_loss"].append(test_loss)
    history["test_acc@k"].append(
        {k: val/len(X_test) for k, val in num_correct_at_k["test"].items()}
    )

  0%|          | 0/10381 [00:00<?, ?it/s]

  key: torch.tensor(val[idx]) for key, val in self.encodings.items()


# Final results

In [34]:
# Report the last recorded loss and accuracy@k of every split
print(">> Final results (Best model): ")

_loss_lines = [
    ("\tTraining loss: {}", "train_loss"),
    ("\tValidating loss: {}", "val_loss"),
    ("\tTesting loss: {}", "test_loss"),
]
for _template, _key in _loss_lines:
    print(_template.format(history[_key][-1]))

_accuracy_sections = [
    ("\tTrain accuracy@{}: {}", "train_acc@k"),
    ("\tValidate accuracy@{}: {}", "val_acc@k"),
    ("\tTest accuracy@{}: {}", "test_acc@k"),
]
for _template, _key in _accuracy_sections:
    # Blank separator before each accuracy section
    print("\n")
    for k in topks:
        print(_template.format(k, history[_key][-1][k]))

>> Final results (Best model): 
	Training loss: 1.225402362473243
	Validating loss: 1.4348648520776262
	Testing loss: 1.449772312381854


	Train accuracy@1: 0.5688311427106065
	Train accuracy@3: 0.8536891963917578
	Train accuracy@5: 0.9214124572183282
	Train accuracy@10: 0.9717146525340493


	Validate accuracy@1: 0.5160044649591214
	Validate accuracy@3: 0.8152170633843183
	Validate accuracy@5: 0.8951036292877184
	Validate accuracy@10: 0.9567985036353215


	Test accuracy@1: 0.5142568153357094
	Test accuracy@3: 0.816058183219343
	Test accuracy@5: 0.8925801945862634
	Test accuracy@10: 0.9557846064926308
