<a href="https://colab.research.google.com/github/shubhamitradas/bert_optimization_strategies/blob/main/optimized_toxic_distilbert_multi_label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q transformers --quiet

In [2]:
# Import all libraries
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

# Huggingface transformers
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification,AutoTokenizer,AutoModel
from transformers import AlbertTokenizer, AlbertModel


import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score

In [3]:
checkPoint = 'distilbert-base-uncased' 

In [4]:
import zipfile
from google.colab import drive

drive.mount('/content/drive/')

zip_ref = zipfile.ZipFile("/content/drive/My Drive/toxic_train.csv.zip", 'r')
zip_ref.extractall("/content")
zip_ref.close()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [5]:
import pandas as pd
df_full = pd.read_csv('/content/train.csv')
df_full.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
df_full.drop(['id'], inplace=True, axis=1)

In [7]:
df = pd.DataFrame()
df['comment_text'] = df_full['comment_text']
df['labels'] = df_full.iloc[:, 1:].values.tolist()

In [8]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 128
VALID_BATCH_SIZE = 64
EPOCHS = 1
LEARNING_RATE = 1e-05

tokenizer = AutoTokenizer.from_pretrained(checkPoint,truncation=True, do_lower_case=True)

In [9]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            #padding = 'longest',
            #truncation = True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=False
        )

      
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        #token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
             #'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
      
    

In [10]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = MultiLabelDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (159571, 2)
TRAIN Dataset: (127657, 2)
TEST Dataset: (31914, 2)


In [11]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4,
                'pin_memory': True
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4,
                'pin_memory': True
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

  cpuset_checked))


In [12]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [13]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 
freeze_embedding = True
class FineTunedBERTClass(torch.nn.Module):
    def __init__(self):
        #super(FineTunedBERTClass, self).__init__()
        super().__init__()


        # configuring the model to output hidden states
        cfg = transformers.PretrainedConfig.get_config_dict(checkPoint)[0]
        cfg["output_hidden_states"] = True
        cfg["gradient_checkpointing"] = True
        cfg = transformers.DistilBertConfig.from_dict(cfg)
        self.DistilBertModel = transformers.DistilBertModel.from_pretrained(checkPoint, config=cfg)


        self.l1 = AutoModel.from_pretrained(checkPoint)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = FineTunedBERTClass()
#model.DistilBertModel.embeddings.requires_grad_(not freeze_embedding)
model.modules


model.to(device)

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight

FineTunedBERTClass(
  (DistilBertModel): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [14]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [15]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [16]:
def train(epoch):

    model.train()
    accum_iter = 8
    optimizer.zero_grad(set_to_none=True)
    print(len(training_loader))

    for batch_idx, data in tqdm(enumerate(training_loader)):
        ids = data['ids'].to(device, dtype = torch.long,non_blocking=True)
        mask = data['mask'].to(device, dtype = torch.long,non_blocking=True)
        #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float,non_blocking=True)


        # Forward pass with mixed precision
        with torch.cuda.amp.autocast(): # autocast as a context manager
              outputs = model(ids, mask, token_type_ids= None)
              loss = loss_fn(outputs, targets)
        # normalize loss to account for batch accumulation
              loss = loss / accum_iter 

        if batch_idx%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')


        # Backward pass without mixed precision
        # It's not recommended to use mixed precision for backward pass
        # Because we need more precise loss
        loss.backward()

        #weights update
        if ((batch_idx + 1) % accum_iter == 0) or (batch_idx + 1 == len(training_loader)):
             optimizer.step()
             # Reset the gradients to None
             optimizer.zero_grad(set_to_none=True)

In [17]:
import os
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

In [18]:
for epoch in range(EPOCHS):
    train(epoch)

998


  cpuset_checked))
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs o

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.08589591830968857


In [19]:
from google.colab import drive
drive.mount('/content/gdrive')
model_save_name = 'distilbert_optimized_toxic_classifier.pt'
path = f"/content/gdrive/My Drive/{model_save_name}" 
torch.save(model.state_dict(), path)

Mounted at /content/gdrive


In [20]:

import time
def cpu_validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    timings=np.zeros((len(testing_loader),1))

    with torch.no_grad():
        for idx, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            start_time = time.time()
            outputs = model(ids, mask, token_type_ids=None)
            end_time = time.time()
            curr_time = end_time - start_time
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            timings[idx] = curr_time
    return fin_outputs, fin_targets, timings

In [21]:
def gpu_validation():

    data = next(iter(testing_loader)) 
    print("Testing records: ",len(testing_loader))
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)

    model.eval()
    fin_targets=[]
    fin_outputs=[]
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    timings=np.zeros((len(testing_loader),1))
    #GPU-WARM-UP
    for _ in range(10):
       _ = model(ids, mask, token_type_ids=None)
    # MEASURE PERFORMANCE

    with torch.no_grad():
        for idx, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            starter.record()
            outputs = model(ids, mask, token_type_ids=None)
            ender.record()
            # WAIT FOR GPU SYNC
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)
            timings[idx] = curr_time
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets,timings

In [22]:
from sklearn import metrics


def print_metrics(model,device="cpu"):

    if device == "cpu":
       outputs, targets,timings = cpu_validation()
    else:
       outputs, targets,timings = gpu_validation()

   
    latency = np.sum(timings) / len(testing_loader)
    if device == "gpu":
        print("Device Name: ", torch.cuda.get_device_name(0))
        print("Throughtput: ", len(testing_loader)/np.sum(timings)*1000)
    else:
        print("Throughtput: ", len(testing_loader)/np.sum(timings))
             
    print("Checkpoint Model: ",checkPoint)
    print(print_size_of_model(model))
    print("Latency in ms: ",latency)


    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")


In [23]:
from google.colab import drive
drive.mount('/content/gdrive')
model_save_name = 'distilbert_optimized_toxic_classifier.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
model.load_state_dict(torch.load(path,map_location=torch.device('cuda')))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


<All keys matched successfully>

In [24]:
print_metrics(model,device="gpu")

  cpuset_checked))
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs o

Testing records:  499


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

Device Name:  Tesla T4
Throughtput:  2.8654382102239353
Checkpoint Model:  distilbert-base-uncased
Size (MB): 533.363881
None
Latency in ms:  348.9867610587386
Throughtput:  0.002865438210223935
Accuracy Score = 0.9178730337782791
F1 Score (Micro) = 0.7206681900559452
F1 Score (Macro) = 0.38071049871067175


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
model_save_name = 'distilbert_optimized_toxic_classifier.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
model.load_state_dict(torch.load(path,map_location=torch.device('cpu')))

Mounted at /content/gdrive


<All keys matched successfully>

In [None]:
print_metrics(model,device="cpu")

  cpuset_checked))
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs o

0it [00:00, ?it/s]