In [1]:
!nvidia-smi

Sat Apr 13 19:28:54 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 552.12                 Driver Version: 552.12         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   48C    P5             14W /   25W |     170MiB /   6144MiB |     36%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Importing the libraries
from datasets import load_dataset
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset

from transformers import DistilBertForSequenceClassification, AutoTokenizer, AutoConfig # ForSequenceClassification part adds the sequence classification head to the distilbert model

from tqdm import tqdm
import torch
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Device check if GPU is available for not
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [13]:
# Dataset loading

dataset = load_dataset("Arsive/toxicity_classification_jigsaw")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 25960
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 6490
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 153164
    })
})

In [5]:
# Instantiating the tokenizer and model

checkpoint = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(checkpoint,num_labels=6,problem_type="multi_label_classification") # AutoModelForSequenceClassification does the same thing as well (adding sequence classification head)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
#Dataset Info
split_lengths = [len(dataset[split])for split in dataset]
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

print(f"Split lengths: {split_lengths}") # Record size of Train , Test and Validation datasets
print(f"Features: {dataset['train'].column_names}")
print("\nComment:")

print(dataset["train"][1]["comment_text"])

print("\nLabel values")

for x in label_columns:
    value = {dataset["train"][0][x]}
    print(f"{x} --> {value}")

Split lengths: [25960, 6490, 153164]
Features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Comment:
You NDP attack queers, sorry, Cabal of Sanctimonious Hypocrites, will go to any length to keep wikipedia as your own personal revisionist paradise, won't you?

Label values
toxic --> {1}
severe_toxic --> {0}
obscene --> {1}
threat --> {0}
insult --> {1}
identity_hate --> {1}


In [7]:
# Getting the configurations of the model

config = AutoConfig.from_pretrained(checkpoint)

# Print the configuration
print(config)

DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.39.3",
  "vocab_size": 30522
}



In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 25960
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 6490
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 153164
    })
})

In [36]:
# Concatenating the label columns to get a single column with the label values
columns_to_concat = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

dataset_train = pd.DataFrame(dataset['train'])
dataset_validation = pd.DataFrame(dataset['validation'])
dataset_test = pd.DataFrame(dataset['test'])

dataset_train['label'] = dataset_train[columns_to_concat].astype('float32').apply(list, axis=1)
dataset_train = dataset_train[['comment_text','label']]

dataset_validation['label'] = dataset_validation[columns_to_concat].astype('float32').apply(list, axis=1)
dataset_validation = dataset_validation[['comment_text','label']]

dataset_test['label'] = dataset_test[columns_to_concat].astype('float32').apply(list, axis=1)
dataset_test = dataset_test[['comment_text','label']]


In [37]:
# Converting the dataset to seperate lists to be fed into Custom Dataset function

train_text = dataset_train['comment_text'].to_list()
validation_text = dataset_validation['comment_text'].to_list()

train_labels = dataset_train['label'].to_list()
validation_labels = dataset_validation['label'].to_list()

In [38]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': label
    }

In [39]:
train_dataset = CustomDataset(train_text, train_labels, tokenizer)
val_dataset = CustomDataset(validation_text, validation_labels, tokenizer)

In [40]:
train_dataset[0]

{'input_ids': tensor([  101,  1000,  6203,  4441,  1012,  2428,  1010,  2038,  3158,  4063,
         20228, 11057,  3215,  2428,  2468,  1996,  8318, 18079,  2158,  1029,
          2057, 10657,  3066,  2007,  1000,  1000,  2339, 10643,  2066, 18930,
          2271,  2818,  6935,  5223,  1996,  2702,  3094,  8163,  1006,  1998,
          2293,  2157,  3358, 22889, 16446,  2066,  8129, 16371,  4757,  2571,
          1007,  1000,  1000, 10643,  1012,  1045,  2228, 18930,  2271,  2818,
          6935,  2003,  2931,  1010,  1998,  2763,  1010,  1037, 11690,  1012,
          1000,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [41]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def multi_labels_metrics(predictions, labels, threshold=0.3):
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions))

  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs>=threshold)] = 1
  y_true = labels

  f1 = f1_score(y_true, y_pred, average = 'macro')
  roc_auc = roc_auc_score(y_true, y_pred, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
  }

  return metrics

def compute_metrics(p:EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

  result = multi_labels_metrics(predictions=preds,
                                labels=p.label_ids)

  return result

In [42]:
# Training Arguments
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir = './results',
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2
)

trainer = Trainer(model=distilbert_model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset = val_dataset,
                  compute_metrics=compute_metrics)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/16225 [07:01<?, ?it/s]


In [43]:
trainer.train()

  3%|▎         | 500/16225 [04:33<2:24:13,  1.82it/s]

{'loss': 0.2223, 'grad_norm': 0.9518491625785828, 'learning_rate': 4.8459167950693376e-05, 'epoch': 0.15}


  6%|▌         | 1000/16225 [09:05<2:19:23,  1.82it/s]

{'loss': 0.1773, 'grad_norm': 1.6613024473190308, 'learning_rate': 4.691833590138675e-05, 'epoch': 0.31}


  9%|▉         | 1501/16225 [12:16<18:54, 12.97it/s]  

{'loss': 0.1687, 'grad_norm': 0.293109267950058, 'learning_rate': 4.537750385208013e-05, 'epoch': 0.46}


 12%|█▏        | 2000/16225 [12:55<18:46, 12.63it/s]

{'loss': 0.1631, 'grad_norm': 1.6971139907836914, 'learning_rate': 4.38366718027735e-05, 'epoch': 0.62}


 15%|█▌        | 2501/16225 [13:36<18:47, 12.18it/s]

{'loss': 0.1571, 'grad_norm': 1.1853710412979126, 'learning_rate': 4.229583975346688e-05, 'epoch': 0.77}


 18%|█▊        | 3000/16225 [14:23<21:14, 10.38it/s]

{'loss': 0.1617, 'grad_norm': 2.029637336730957, 'learning_rate': 4.0755007704160245e-05, 'epoch': 0.92}


 22%|██▏       | 3501/16225 [15:12<23:11,  9.14it/s]

{'loss': 0.1433, 'grad_norm': 1.286690592765808, 'learning_rate': 3.9214175654853626e-05, 'epoch': 1.08}


 25%|██▍       | 4000/16225 [16:05<22:04,  9.23it/s]

{'loss': 0.134, 'grad_norm': 1.2204653024673462, 'learning_rate': 3.767334360554699e-05, 'epoch': 1.23}


 28%|██▊       | 4501/16225 [16:59<21:48,  8.96it/s]  

{'loss': 0.1286, 'grad_norm': 1.5439109802246094, 'learning_rate': 3.6132511556240374e-05, 'epoch': 1.39}


 31%|███       | 5000/16225 [17:53<20:28,  9.14it/s]

{'loss': 0.1264, 'grad_norm': 0.7801806926727295, 'learning_rate': 3.459167950693374e-05, 'epoch': 1.54}


 34%|███▍      | 5501/16225 [18:48<19:40,  9.09it/s]  

{'loss': 0.1319, 'grad_norm': 1.1786428689956665, 'learning_rate': 3.305084745762712e-05, 'epoch': 1.69}


 37%|███▋      | 6000/16225 [19:40<17:30,  9.73it/s]

{'loss': 0.1282, 'grad_norm': 2.165217399597168, 'learning_rate': 3.1510015408320495e-05, 'epoch': 1.85}


 40%|████      | 6502/16225 [20:33<16:21,  9.90it/s]

{'loss': 0.1269, 'grad_norm': 0.45108190178871155, 'learning_rate': 2.996918335901387e-05, 'epoch': 2.0}


 43%|████▎     | 7000/16225 [21:25<16:09,  9.52it/s]

{'loss': 0.1, 'grad_norm': 1.1562610864639282, 'learning_rate': 2.842835130970724e-05, 'epoch': 2.16}


 46%|████▌     | 7501/16225 [22:18<15:06,  9.62it/s]  

{'loss': 0.0946, 'grad_norm': 1.404702067375183, 'learning_rate': 2.6887519260400617e-05, 'epoch': 2.31}


 49%|████▉     | 8000/16225 [23:11<14:10,  9.67it/s]

{'loss': 0.0959, 'grad_norm': 3.2935972213745117, 'learning_rate': 2.5346687211093994e-05, 'epoch': 2.47}


 52%|█████▏    | 8501/16225 [24:04<14:04,  9.15it/s]

{'loss': 0.1018, 'grad_norm': 1.0218685865402222, 'learning_rate': 2.3805855161787368e-05, 'epoch': 2.62}


 55%|█████▌    | 9000/16225 [24:57<12:27,  9.66it/s]

{'loss': 0.0982, 'grad_norm': 1.241233468055725, 'learning_rate': 2.226502311248074e-05, 'epoch': 2.77}


 59%|█████▊    | 9501/16225 [25:51<11:38,  9.63it/s]

{'loss': 0.0944, 'grad_norm': 1.5369213819503784, 'learning_rate': 2.0724191063174115e-05, 'epoch': 2.93}


 62%|██████▏   | 10000/16225 [26:44<11:03,  9.38it/s]

{'loss': 0.0815, 'grad_norm': 1.8241850137710571, 'learning_rate': 1.918335901386749e-05, 'epoch': 3.08}


 65%|██████▍   | 10501/16225 [27:39<09:35,  9.95it/s]

{'loss': 0.0621, 'grad_norm': 0.8407554030418396, 'learning_rate': 1.7642526964560863e-05, 'epoch': 3.24}


 68%|██████▊   | 11000/16225 [28:32<09:10,  9.49it/s]

{'loss': 0.0625, 'grad_norm': 1.6353659629821777, 'learning_rate': 1.6101694915254237e-05, 'epoch': 3.39}


 71%|███████   | 11501/16225 [29:26<08:38,  9.11it/s]

{'loss': 0.0657, 'grad_norm': 0.7621586322784424, 'learning_rate': 1.4560862865947614e-05, 'epoch': 3.54}


 74%|███████▍  | 12000/16225 [30:19<07:38,  9.21it/s]

{'loss': 0.0625, 'grad_norm': 1.922707438468933, 'learning_rate': 1.3020030816640988e-05, 'epoch': 3.7}


 77%|███████▋  | 12501/16225 [31:13<06:44,  9.20it/s]

{'loss': 0.0658, 'grad_norm': 2.1676788330078125, 'learning_rate': 1.147919876733436e-05, 'epoch': 3.85}


 80%|████████  | 13000/16225 [32:07<05:42,  9.43it/s]

{'loss': 0.0622, 'grad_norm': 3.2714650630950928, 'learning_rate': 9.938366718027735e-06, 'epoch': 4.01}


 83%|████████▎ | 13502/16225 [33:02<04:34,  9.93it/s]

{'loss': 0.0386, 'grad_norm': 0.9753122329711914, 'learning_rate': 8.397534668721111e-06, 'epoch': 4.16}


 86%|████████▋ | 14000/16225 [33:55<03:59,  9.29it/s]

{'loss': 0.0381, 'grad_norm': 0.1244005635380745, 'learning_rate': 6.856702619414485e-06, 'epoch': 4.31}


 89%|████████▉ | 14501/16225 [34:49<03:09,  9.10it/s]

{'loss': 0.0358, 'grad_norm': 2.411036491394043, 'learning_rate': 5.315870570107859e-06, 'epoch': 4.47}


 92%|█████████▏| 15000/16225 [35:43<02:13,  9.18it/s]

{'loss': 0.0424, 'grad_norm': 1.5960100889205933, 'learning_rate': 3.775038520801233e-06, 'epoch': 4.62}


 96%|█████████▌| 15501/16225 [36:38<01:19,  9.09it/s]

{'loss': 0.0357, 'grad_norm': 7.48186731338501, 'learning_rate': 2.234206471494607e-06, 'epoch': 4.78}


 99%|█████████▊| 16000/16225 [37:31<00:23,  9.72it/s]

{'loss': 0.036, 'grad_norm': 1.7445204257965088, 'learning_rate': 6.933744221879816e-07, 'epoch': 4.93}


100%|██████████| 16225/16225 [37:57<00:00,  7.13it/s]

{'train_runtime': 2277.0361, 'train_samples_per_second': 57.004, 'train_steps_per_second': 7.125, 'train_loss': 0.10047955488019805, 'epoch': 5.0}





TrainOutput(global_step=16225, training_loss=0.10047955488019805, metrics={'train_runtime': 2277.0361, 'train_samples_per_second': 57.004, 'train_steps_per_second': 7.125, 'train_loss': 0.10047955488019805, 'epoch': 5.0})