<a href="https://colab.research.google.com/github/shubhamitradas/bert_optimization_strategies/blob/main/multi_label_with_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers nvidia-ml-py3 --quiet

[K     |████████████████████████████████| 4.2 MB 5.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 45.9 MB/s 
[K     |████████████████████████████████| 596 kB 50.9 MB/s 
[K     |████████████████████████████████| 86 kB 4.9 MB/s 
[?25h

In [2]:
# Install Weights and Biases
!pip install wandb -q

# Import wandb
import wandb

# Login with your authentication key
wandb.login()

# setup wandb environment variables
#%env WANDB_ENTITY=your-username/your-team-name
%env WANDB_PROJECT=toxic_multilabel_with_trainer

[K     |████████████████████████████████| 1.8 MB 4.8 MB/s 
[K     |████████████████████████████████| 181 kB 63.5 MB/s 
[K     |████████████████████████████████| 145 kB 47.8 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=toxic_multilabel_with_trainer


In [None]:
# Code for TPU packages install
#!curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [5]:
# Preparing for TPU usage
#import torch_xla
#import torch_xla.core.xla_model as xm
#device = xm.xla_device()

In [3]:
import transformers
print(f"Transformers package version: {transformers.__version__}")

Transformers package version: 4.19.4


In [4]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
# Import all libraries
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

# Huggingface transformers
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification,AutoTokenizer,AutoModel,DataCollatorWithPadding
from transformers import AlbertTokenizer, AlbertModel


import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [6]:
checkPoint = 'distilbert-base-uncased' 

In [8]:
import zipfile
from google.colab import drive

drive.mount('/content/drive/')

zip_ref = zipfile.ZipFile("/content/drive/My Drive/toxic_train.csv.zip", 'r')
zip_ref.extractall("/content")
zip_ref.close()

Mounted at /content/drive/


In [9]:
import pandas as pd
df_full = pd.read_csv('/content/train.csv')
df_full.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split


texts = list(df_full["comment_text"])
label_names = df_full.drop(["id", "comment_text"], axis=1).columns
labels = df_full[label_names].values

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)



In [11]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


In [12]:
from transformers import AutoConfig, AutoTokenizer, AutoModel

MAX_LENGTH = 200
BATCH_SIZE = 128
LEARNING_RATE = 1e-05

MODEL_NAME =  checkPoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [13]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


In [14]:
class MultiLabelDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_len):

        self.encoded_inputs = tokenizer(texts, truncation=True, padding=True, 
            max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs_ands_labels = dict()
        inputs_ands_labels['input_ids'] = self.encoded_inputs['input_ids'][index]  
        inputs_ands_labels['attention_mask'] = self.encoded_inputs['attention_mask'][index] 
        inputs_ands_labels['token_type_ids'] = None 
        inputs_ands_labels['labels'] = self.labels[index]
        return inputs_ands_labels


train_dataset = MultiLabelDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
test_dataset = MultiLabelDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)
      
    

In [15]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [16]:
# Creating the customized model, by adding a drop out and a dense layer on top of distill bert to get the final output for the model. 
from torch.nn.functional import binary_cross_entropy_with_logits

class FineTunedBERTClass(torch.nn.Module):
    def __init__(self):
        super(FineTunedBERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(MODEL_NAME)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)

        if labels is None:
            return output
        else:
            return binary_cross_entropy_with_logits(output, labels), output




In [17]:
def model_init():

  model = FineTunedBERTClass()

  model.to(device)
  return model
    

In [18]:
print_gpu_utilization()

GPU memory occupied: 3 MB.


In [19]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

# **Vanilla Training.**

In [20]:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=200,
    run_name="vanilla",
    report_to="wandb",
    seed=123
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
   
)
result = trainer.train()
print_summary(result)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.19.4",
  "vocab_size": 30522
}

https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/huggingf

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
creating metadata file for /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.w

[34m[1mwandb[0m: Currently logged in as: [33mshubhamitra_das[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
200,No log,0.068124,0.725196,0.820544,0.919317
400,No log,0.047816,0.764302,0.866382,0.921761
600,0.145000,0.042882,0.770864,0.86119,0.924174
800,0.145000,0.041407,0.7585,0.835427,0.924675


***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
Saving model checkpoint to tmp/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 2569.19
Samples/second: 49.69
GPU memory occupied: 13984 MB.


## **Gradient Accumulation**

In [19]:
from transformers import Trainer, TrainingArguments

BATCH_SIZE = 32
VALID_BATCH_SIZE = 128
training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    group_by_length = True,
    dataloader_num_workers =4,
    gradient_accumulation_steps=4,
    seed=123
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
   
)
result = trainer.train()
print_summary(result)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
  cpuset_checked))
***** Running training *****
  Num examples = 127656
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 997


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,0.0493,0.036744,0.793663,0.875943,0.929532


Saving model checkpoint to tmp/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
  cpuset_checked))
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 2204.63
Samples/second: 57.90
GPU memory occupied: 14354 MB.


## **fp16**

In [28]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=200,
    group_by_length = True,
    dataloader_num_workers =4,
    fp16=True,
    run_name="fp16",
    report_to="wandb",
    seed=123
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
   
)
result = trainer.train()
print_summary(result)

PyTorch: setting up devices
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.19.4",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingfac

Step,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
200,No log,0.068118,0.725297,0.820615,0.919348
400,No log,0.047802,0.762045,0.863024,0.92173
600,0.145100,0.042766,0.770929,0.859866,0.924675
800,0.145100,0.04152,0.756791,0.83376,0.924581


***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
  cpuset_checked))
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
Saving model checkpoint to tmp/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
  cpuset_checked))
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
  cpuset_checked))
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 857.72
Samples/second: 148.83
GPU memory occupied: 10708 MB.


## **AdaFactor**

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    group_by_length = True,
    optim="adafactor",
    dataloader_num_workers =4,
    seed=123
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
   
)
result = trainer.train()
print_summary(result)

  cpuset_checked))
***** Running training *****
  Num examples = 127656
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 998


Epoch,Training Loss,Validation Loss


## **8-bit Adam**

In [None]:
!pip install bitsandbytes-cuda112 -q

In [None]:
import bitsandbytes as bnb
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    evaluation_strategy="epoch",
    group_by_length = True,
    dataloader_num_workers =4,
    seed=123
)

decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]

optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
}
optimizer_kwargs["lr"] = training_args.learning_rate
adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
    lr=training_args.learning_rate,
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(adam_bnb_optim, None),
    compute_metrics=compute_metrics,
   
)
result = trainer.train()
print_summary(result)

In [None]:
collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=512)

In [22]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    evaluation_strategy="epoch",
    group_by_length = True,
    gradient_accumulation_steps=4,
    dataloader_num_workers =4
    #fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
   
)
result = trainer.train()
print_summary(result)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 127656
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 4
  Total optimization steps = 249


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,0.055283,0.739541,0.822198,0.922952


***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 992.74
Samples/second: 128.59


In [16]:
!pip install bitsandbytes-cuda112 -q

[K     |████████████████████████████████| 4.2 MB 17.6 MB/s 
[?25h

In [15]:
!nvidia-smi

Mon Jun 13 06:52:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0    28W /  70W |   1698MiB / 15109MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [17]:
import bitsandbytes as bnb
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    #logging_dir='./logs',
    evaluation_strategy="epoch",
    group_by_length = True,
    gradient_accumulation_steps=4,
    dataloader_num_workers =4,
    fp16=True
)

decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]

optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
}
optimizer_kwargs["lr"] = training_args.learning_rate
adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
    lr=training_args.learning_rate,
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(adam_bnb_optim, None),
    compute_metrics=compute_metrics,
   
)
result = trainer.train()
print_summary(result)

Using amp half precision backend
  cpuset_checked))
***** Running training *****
  Num examples = 127656
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 4
  Total optimization steps = 249


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,No log,0.050027,0.768426,0.874674,0.921072


***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 671.14
Samples/second: 190.21
GPU memory occupied: 10218 MB.
