<a href="https://colab.research.google.com/github/shubhamitradas/bert_optimization_strategies/blob/main/multi_label_with_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers nvidia-ml-py3 --quiet

In [2]:
import transformers
print(f"Transformers package version: {transformers.__version__}")

Transformers package version: 4.19.2


In [3]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [4]:
# Import all libraries
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

# Huggingface transformers
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification,AutoTokenizer,AutoModel
from transformers import AlbertTokenizer, AlbertModel


import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [5]:
checkPoint = 'distilbert-base-uncased' 

In [6]:
import zipfile
from google.colab import drive

drive.mount('/content/drive/')

zip_ref = zipfile.ZipFile("/content/drive/My Drive/toxic_train.csv.zip", 'r')
zip_ref.extractall("/content")
zip_ref.close()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [7]:
import pandas as pd
df_full = pd.read_csv('/content/train.csv')
df_full.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split


texts = list(df_full["comment_text"])
label_names = df_full.drop(["id", "comment_text"], axis=1).columns
labels = df_full[label_names].values

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)



In [9]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


In [9]:
from transformers import AutoConfig, AutoTokenizer, AutoModel

MAX_LENGTH = 200
BATCH_SIZE = 128
LEARNING_RATE = 1e-05

MODEL_NAME =  checkPoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 


In [11]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


In [10]:
class MultiLabelDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_len):

        self.encoded_inputs = tokenizer(texts, truncation=True, padding=True, 
            max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs_ands_labels = dict()
        inputs_ands_labels['input_ids'] = self.encoded_inputs['input_ids'][index]  
        inputs_ands_labels['attention_mask'] = self.encoded_inputs['attention_mask'][index] 
        inputs_ands_labels['token_type_ids'] = None 
        inputs_ands_labels['labels'] = self.labels[index]
        return inputs_ands_labels


train_dataset = MultiLabelDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
test_dataset = MultiLabelDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)
      
    

In [11]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [12]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 
from torch.nn.functional import binary_cross_entropy_with_logits

class FineTunedBERTClass(torch.nn.Module):
    def __init__(self):
        super(FineTunedBERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(MODEL_NAME)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)

        if labels is None:
            return output
        else:
            return binary_cross_entropy_with_logits(output, labels), output


model = FineTunedBERTClass()

model.to(device)

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias',

FineTunedBERTClass(
  (DistilBertModel): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [13]:
print_gpu_utilization()

GPU memory occupied: 1968 MB.


In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='tmp',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_first_step=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers =4,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset  
)
result = trainer.train()
print_summary(result)

Using amp half precision backend
***** Running training *****
  Num examples = 127656
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 4
  Total optimization steps = 747


Epoch,Training Loss,Validation Loss
0,0.6821,0.055281
1,0.6821,0.041615
2,0.6821,0.038014


***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128
Saving model checkpoint to tmp/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 2072.45
Samples/second: 184.79
GPU memory occupied: 10758 MB.
