In [1]:
!nvidia-smi

Sun Apr 14 00:07:43 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 552.12                 Driver Version: 552.12         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   58C    P5             17W /   30W |     898MiB /   6144MiB |      4%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Importing the libraries
from datasets import load_dataset
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset

from transformers import DistilBertForSequenceClassification, AutoTokenizer, AutoConfig # ForSequenceClassification part adds the sequence classification head to the distilbert model

from tqdm import tqdm
import torch
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Device check if GPU is available for not
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
# Dataset loading

dataset = load_dataset("Arsive/toxicity_classification_jigsaw")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 25960
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 6490
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 153164
    })
})

In [5]:
# Instantiating the tokenizer and model

checkpoint = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(checkpoint,num_labels=6,problem_type="multi_label_classification") # AutoModelForSequenceClassification does the same thing as well (adding sequence classification head)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
#Dataset Info
split_lengths = [len(dataset[split])for split in dataset]
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

print(f"Split lengths: {split_lengths}") # Record size of Train , Test and Validation datasets
print(f"Features: {dataset['train'].column_names}")
print("\nComment:")

print(dataset["train"][1]["comment_text"])

print("\nLabel values")

for x in label_columns:
    value = {dataset["train"][0][x]}
    print(f"{x} --> {value}")

Split lengths: [25960, 6490, 153164]
Features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Comment:
You NDP attack queers, sorry, Cabal of Sanctimonious Hypocrites, will go to any length to keep wikipedia as your own personal revisionist paradise, won't you?

Label values
toxic --> {1}
severe_toxic --> {0}
obscene --> {1}
threat --> {0}
insult --> {1}
identity_hate --> {1}


In [7]:
# Getting the configurations of the model

config = AutoConfig.from_pretrained(checkpoint)

# Print the configuration
print(config)

DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.39.3",
  "vocab_size": 30522
}



In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 25960
    })
    validation: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 6490
    })
    test: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 153164
    })
})

In [9]:
# Concatenating the label columns to get a single column with the label values
columns_to_concat = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

dataset_train = pd.DataFrame(dataset['train'])
dataset_validation = pd.DataFrame(dataset['validation'])
dataset_test = pd.DataFrame(dataset['test'])

dataset_train['label'] = dataset_train[columns_to_concat].astype('float32').apply(list, axis=1)
dataset_train = dataset_train[['comment_text','label']]

dataset_validation['label'] = dataset_validation[columns_to_concat].astype('float32').apply(list, axis=1)
dataset_validation = dataset_validation[['comment_text','label']]

dataset_test['label'] = dataset_test[columns_to_concat].astype('float32').apply(list, axis=1)
dataset_test = dataset_test[['comment_text','label']]


In [10]:
# Converting the dataset to seperate lists to be fed into Custom Dataset function

train_text = dataset_train['comment_text'].to_list()
validation_text = dataset_validation['comment_text'].to_list()

train_labels = dataset_train['label'].to_list()
validation_labels = dataset_validation['label'].to_list()

In [11]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': label
    }

In [12]:
train_dataset = CustomDataset(train_text, train_labels, tokenizer)
val_dataset = CustomDataset(validation_text, validation_labels, tokenizer)

In [13]:
train_dataset[0]

{'input_ids': tensor([  101,  1000,  6203,  4441,  1012,  2428,  1010,  2038,  3158,  4063,
         20228, 11057,  3215,  2428,  2468,  1996,  8318, 18079,  2158,  1029,
          2057, 10657,  3066,  2007,  1000,  1000,  2339, 10643,  2066, 18930,
          2271,  2818,  6935,  5223,  1996,  2702,  3094,  8163,  1006,  1998,
          2293,  2157,  3358, 22889, 16446,  2066,  8129, 16371,  4757,  2571,
          1007,  1000,  1000, 10643,  1012,  1045,  2228, 18930,  2271,  2818,
          6935,  2003,  2931,  1010,  1998,  2763,  1010,  1037, 11690,  1012,
          1000,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [14]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def multi_labels_metrics(predictions, labels, threshold=0.3):
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions))

  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs>=threshold)] = 1
  y_true = labels

  f1 = f1_score(y_true, y_pred, average = 'macro')
  roc_auc = roc_auc_score(y_true, y_pred, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
  }

  return metrics

def compute_metrics(p:EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

  result = multi_labels_metrics(predictions=preds,
                                labels=p.label_ids)

  return result

In [15]:
# Training Arguments
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir = './results',
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2
)

trainer = Trainer(model=distilbert_model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset = val_dataset,
                  compute_metrics=compute_metrics)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
trainer.train()

  3%|▎         | 501/16225 [03:13<21:25, 12.23it/s]  

{'loss': 0.2252, 'grad_norm': 1.2079098224639893, 'learning_rate': 4.8459167950693376e-05, 'epoch': 0.15}


  6%|▌         | 1000/16225 [03:56<21:01, 12.07it/s]

{'loss': 0.1774, 'grad_norm': 1.0677144527435303, 'learning_rate': 4.691833590138675e-05, 'epoch': 0.31}


  9%|▉         | 1501/16225 [04:44<24:41,  9.94it/s]

{'loss': 0.1692, 'grad_norm': 0.26222068071365356, 'learning_rate': 4.537750385208013e-05, 'epoch': 0.46}


 12%|█▏        | 2000/16225 [05:37<26:38,  8.90it/s]

{'loss': 0.1615, 'grad_norm': 1.2673817873001099, 'learning_rate': 4.38366718027735e-05, 'epoch': 0.62}


 15%|█▌        | 2501/16225 [06:36<26:24,  8.66it/s]  

{'loss': 0.1581, 'grad_norm': 1.3051376342773438, 'learning_rate': 4.229583975346688e-05, 'epoch': 0.77}


 18%|█▊        | 3000/16225 [07:34<25:12,  8.75it/s]

{'loss': 0.1618, 'grad_norm': 1.6808058023452759, 'learning_rate': 4.0755007704160245e-05, 'epoch': 0.92}


 22%|██▏       | 3501/16225 [08:33<24:41,  8.59it/s]  

{'loss': 0.1444, 'grad_norm': 2.0300498008728027, 'learning_rate': 3.9214175654853626e-05, 'epoch': 1.08}


 25%|██▍       | 4000/16225 [09:30<24:12,  8.41it/s]

{'loss': 0.1341, 'grad_norm': 1.322433590888977, 'learning_rate': 3.767334360554699e-05, 'epoch': 1.23}


 28%|██▊       | 4501/16225 [10:28<24:01,  8.13it/s]  

{'loss': 0.1306, 'grad_norm': 1.3916475772857666, 'learning_rate': 3.6132511556240374e-05, 'epoch': 1.39}


 31%|███       | 5000/16225 [11:25<19:48,  9.44it/s]

{'loss': 0.1249, 'grad_norm': 0.625728964805603, 'learning_rate': 3.459167950693374e-05, 'epoch': 1.54}


 34%|███▍      | 5501/16225 [12:26<19:53,  8.99it/s]  

{'loss': 0.1338, 'grad_norm': 1.491707444190979, 'learning_rate': 3.305084745762712e-05, 'epoch': 1.69}


 37%|███▋      | 6000/16225 [13:22<18:54,  9.02it/s]

{'loss': 0.1298, 'grad_norm': 3.0884759426116943, 'learning_rate': 3.1510015408320495e-05, 'epoch': 1.85}


 40%|████      | 6501/16225 [14:18<17:30,  9.26it/s]  

{'loss': 0.1293, 'grad_norm': 0.40606817603111267, 'learning_rate': 2.996918335901387e-05, 'epoch': 2.0}


 43%|████▎     | 7000/16225 [16:08<12:18, 12.49it/s]  

{'loss': 0.1001, 'grad_norm': 1.530716896057129, 'learning_rate': 2.842835130970724e-05, 'epoch': 2.16}


 46%|████▌     | 7502/16225 [16:55<15:36,  9.31it/s]

{'loss': 0.096, 'grad_norm': 1.716856837272644, 'learning_rate': 2.6887519260400617e-05, 'epoch': 2.31}


 49%|████▉     | 8000/16225 [17:50<15:21,  8.92it/s]

{'loss': 0.0985, 'grad_norm': 2.0505523681640625, 'learning_rate': 2.5346687211093994e-05, 'epoch': 2.47}


 52%|█████▏    | 8501/16225 [18:50<15:58,  8.06it/s]

{'loss': 0.1018, 'grad_norm': 1.7485054731369019, 'learning_rate': 2.3805855161787368e-05, 'epoch': 2.62}


 55%|█████▌    | 9000/16225 [19:50<15:19,  7.86it/s]

{'loss': 0.099, 'grad_norm': 0.9836134314537048, 'learning_rate': 2.226502311248074e-05, 'epoch': 2.77}


 59%|█████▊    | 9501/16225 [20:51<13:28,  8.31it/s]

{'loss': 0.0985, 'grad_norm': 1.7329519987106323, 'learning_rate': 2.0724191063174115e-05, 'epoch': 2.93}


 62%|██████▏   | 10000/16225 [21:51<11:18,  9.18it/s]

{'loss': 0.0794, 'grad_norm': 1.087462067604065, 'learning_rate': 1.918335901386749e-05, 'epoch': 3.08}


 65%|██████▍   | 10501/16225 [22:54<09:32, 10.00it/s]

{'loss': 0.0645, 'grad_norm': 0.6319929957389832, 'learning_rate': 1.7642526964560863e-05, 'epoch': 3.24}


 68%|██████▊   | 11000/16225 [23:56<11:43,  7.43it/s]

{'loss': 0.0636, 'grad_norm': 0.9072849750518799, 'learning_rate': 1.6101694915254237e-05, 'epoch': 3.39}


 71%|███████   | 11501/16225 [24:56<08:58,  8.77it/s]

{'loss': 0.0645, 'grad_norm': 0.15807214379310608, 'learning_rate': 1.4560862865947614e-05, 'epoch': 3.54}


 74%|███████▍  | 12000/16225 [25:52<07:56,  8.87it/s]

{'loss': 0.0626, 'grad_norm': 2.2109804153442383, 'learning_rate': 1.3020030816640988e-05, 'epoch': 3.7}


 77%|███████▋  | 12501/16225 [26:50<07:35,  8.18it/s]

{'loss': 0.0676, 'grad_norm': 2.0240581035614014, 'learning_rate': 1.147919876733436e-05, 'epoch': 3.85}


 80%|████████  | 13000/16225 [27:45<05:42,  9.42it/s]

{'loss': 0.0656, 'grad_norm': 2.7640774250030518, 'learning_rate': 9.938366718027735e-06, 'epoch': 4.01}


 83%|████████▎ | 13501/16225 [28:41<05:17,  8.57it/s]

{'loss': 0.039, 'grad_norm': 0.29039010405540466, 'learning_rate': 8.397534668721111e-06, 'epoch': 4.16}


 86%|████████▋ | 14000/16225 [29:34<03:56,  9.40it/s]

{'loss': 0.0392, 'grad_norm': 0.22375214099884033, 'learning_rate': 6.856702619414485e-06, 'epoch': 4.31}


 89%|████████▉ | 14501/16225 [30:30<03:29,  8.21it/s]

{'loss': 0.0372, 'grad_norm': 4.048159599304199, 'learning_rate': 5.315870570107859e-06, 'epoch': 4.47}


 92%|█████████▏| 15000/16225 [31:31<02:28,  8.25it/s]

{'loss': 0.0428, 'grad_norm': 1.2015440464019775, 'learning_rate': 3.775038520801233e-06, 'epoch': 4.62}


 96%|█████████▌| 15501/16225 [32:39<01:44,  6.91it/s]

{'loss': 0.0374, 'grad_norm': 5.134284496307373, 'learning_rate': 2.234206471494607e-06, 'epoch': 4.78}


 99%|█████████▊| 16000/16225 [33:44<00:31,  7.07it/s]

{'loss': 0.0378, 'grad_norm': 3.480083703994751, 'learning_rate': 6.933744221879816e-07, 'epoch': 4.93}


100%|██████████| 16225/16225 [34:14<00:00,  7.90it/s]

{'train_runtime': 2054.5005, 'train_samples_per_second': 63.178, 'train_steps_per_second': 7.897, 'train_loss': 0.10146519818181066, 'epoch': 5.0}





TrainOutput(global_step=16225, training_loss=0.10146519818181066, metrics={'train_runtime': 2054.5005, 'train_samples_per_second': 63.178, 'train_steps_per_second': 7.897, 'train_loss': 0.10146519818181066, 'epoch': 5.0})

In [17]:
# Evaluating the model
trainer.evaluate()

100%|██████████| 812/812 [00:33<00:00, 24.36it/s]


{'eval_loss': 0.06604410707950592,
 'eval_roc_auc': 0.9544942243759232,
 'eval_hamming_loss': 0.020005136106831023,
 'eval_f1': 0.8849084514820489,
 'eval_runtime': 33.3751,
 'eval_samples_per_second': 194.456,
 'eval_steps_per_second': 24.33,
 'epoch': 5.0}

In [18]:
# Saving the model
trainer.save_model("distilbert_full_finetuning_toxic_comments")

In [24]:
# Using model for some testing data

test_string = dataset_test['comment_text'][10]

encoding = tokenizer(test_string,return_tensors='pt')
encoding.to(trainer.model.device)

outputs = trainer.model(**encoding) # Returning the logits as output

In [25]:
test_string

'I think its crap that the link to roggenbier is to this article. Somebody that knows how to do things should change it.'

In [26]:
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(outputs.logits[0].cpu()) # This part needs to be present in CPU
preds = np.zeros(probs.shape)
preds[np.where(probs>=0.3)] = 1 # 0.3 is the threshold value choosen during evaluation metrics
preds


array([1., 0., 1., 0., 0., 0.])

In [32]:
prediction = pd.DataFrame(preds,columns_to_concat,columns=['Prediction'])
prediction

Unnamed: 0,Prediction
toxic,1.0
severe_toxic,0.0
obscene,1.0
threat,0.0
insult,0.0
identity_hate,0.0
