In [41]:
import torch
torch.cuda.is_available()
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import evaluate
import accelerate
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


In [42]:
# Load the pre-processed training split directly from its Arrow file.
# Forward slashes are used so the relative path resolves on Linux/macOS as
# well as on Windows (a backslash is only a path separator on Windows).
train_dataset = Dataset.from_file("processed_dataset/train/data-00000-of-00001.arrow")
# Drop 'cyberbullying': it is not one of the per-category label columns
# listed in label_map that the rest of the notebook trains on.
train_dataset = train_dataset.remove_columns('cyberbullying')
train_dataset[0]

{'id': '0000997932d777bf',
 'comment_text': "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [43]:
# Load the pre-processed test split directly from its Arrow file.
# Forward slashes are used so the relative path resolves on Linux/macOS as
# well as on Windows (a backslash is only a path separator on Windows).
test_dataset = Dataset.from_file("processed_dataset/test/data-00000-of-00001.arrow")
# Drop 'cyberbullying': it is not one of the per-category label columns
# listed in label_map that the rest of the notebook evaluates on.
test_dataset = test_dataset.remove_columns('cyberbullying')
test_dataset[0]

{'id': '0001ea8717f6de06',
 'comment_text': 'Thank you for understanding. I think very highly of you and would not revert without discussion.',
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [44]:
# Label column name -> position in the multi-hot label vector.
# The insertion order of this dict defines the order of the label vector.
label_map = {
    'toxic': 0,
    'severe_toxic': 1,
    'obscene': 2,
    'threat': 3,
    'insult': 4,
    'identity_hate': 5,
}

In [45]:
# Sanity check: show the distinct values stored in the 'toxic' column.
toxic_values = set(train_dataset['toxic'])
print(toxic_values)

{0, 1}


In [46]:
# Creating a labels column for multi-label classification
# (instead of multi-class classification).
def create_multi_label(example):
    """Build the multi-hot ``labels`` vector for a single example.

    Args:
        example: one dataset row, indexable by every key of ``label_map``.

    Returns:
        A dict with a single key ``"labels"`` holding one ``np.float32`` per
        label column, in the iteration order of ``label_map``.
    """
    multi_hot = []
    for column in label_map:
        multi_hot.append(np.float32(example[column]))
    return {"labels": multi_hot}

# Add the combined 'labels' column to each split, then drop the individual
# per-category columns it was built from.
label_columns = list(label_map)

train_dataset = train_dataset.map(create_multi_label)
train_dataset = train_dataset.remove_columns(label_columns)

test_dataset = test_dataset.map(create_multi_label)
test_dataset = test_dataset.remove_columns(label_columns)

train_dataset[0]

Map: 100%|██████████| 159571/159571 [00:11<00:00, 13657.37 examples/s]
Map: 100%|██████████| 63978/63978 [00:04<00:00, 14468.09 examples/s]


{'id': '0000997932d777bf',
 'comment_text': "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [47]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune; any other BERT variant can be substituted here.
model_name = "bert-base-uncased"
# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the ``comment_text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: a batch mapping that contains a ``"comment_text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["comment_text"], **tokenizer_options)

def cast_labels_to_float32(example):
    """Convert every entry of ``example["labels"]`` to ``np.float32``.

    The example is updated in place and the same object is returned.
    """
    as_float32 = list(map(np.float32, example["labels"]))
    example["labels"] = as_float32
    return example
# Tokenize the training split in batches, then drop the raw columns so only
# 'labels' and the tokenizer outputs remain.
# No extra label cast is applied here: create_multi_label already stores the
# labels as np.float32 values.
tokenized_train_datasets = (
    train_dataset
    .map(preprocess_function, batched=True)
    .remove_columns(["id", "comment_text"])
)

print(tokenized_train_datasets)


Map: 100%|██████████| 159571/159571 [00:13<00:00, 11599.80 examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 159571
})





In [48]:
# Peek at the label vector of the first tokenized training example.
tokenized_train_datasets[0]["labels"]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

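<h3>BCEWithLogitsLoss()</h3><p>This loss function combines a Sigmoid layer and BCELoss in a single class, instead of applying a plain Sigmoid followed by a separate BCELoss. It treats each label as an independent binary decision, which is what multi-label classification requires.</p>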

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np

# 1. Load BERT with a sequence-classification head sized to one output per label
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    hidden_dropout_prob=0.1,
)
# Keep the embedding matrix the same size as the tokenizer vocabulary.
# No tokens are added to the tokenizer in the cells shown, so this is
# expected to be a no-op here.
model.resize_token_embeddings(len(tokenizer))
# Record on the model config that this is a multi-label task.
problem_type = "multi_label_classification"
model.config.problem_type = problem_type

# 2. Define training arguments
metric_name = 'f1'

args = TrainingArguments(
    output_dir=f"./snapshots/{model_name}-finetuned",
    # No evaluation during training since there's no validation dataset.
    evaluation_strategy="no",
    # Checkpoint once per epoch, keeping at most the 3 latest checkpoints.
    save_strategy="epoch",
    save_total_limit=3,
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    push_to_hub=False,
)

# 3. Load the metric used by compute_metrics and by the final evaluation
metric = evaluate.load(metric_name)

def compute_metrics(eval_pred, threshold=0.5):
    """Compute the micro-averaged F1 score for multi-label predictions.

    Each label is an independent yes/no decision, so the logits are passed
    through a sigmoid and thresholded per label. Taking an argmax would
    collapse every example to a single class, which is only valid for
    multi-class problems.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` holds raw
            logits of shape (n_examples, n_labels), or a tuple whose first
            element is that array. ``labels`` is the multi-hot reference
            matrix of the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        The dict returned by ``metric.compute`` (for the F1 metric,
        ``{"f1": value}``).
    """
    logits, labels = eval_pred
    # Some models return several outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Work in float64 so half-precision logits do not lose range in exp().
    logits = np.asarray(logits, dtype=np.float64)
    with np.errstate(over="ignore"):
        probabilities = 1.0 / (1.0 + np.exp(-logits))
    predictions = (probabilities >= threshold).astype(np.int64)
    references = np.asarray(labels).astype(np.int64)
    # Binary F1 over the flattened (example, label) pairs pools TP/FP/FN
    # across all labels, which is exactly micro-averaged multi-label F1.
    # (average="micro" on flattened 0/1 data would instead also count the
    # negative class and degenerate to plain accuracy.)
    return metric.compute(
        predictions=predictions.ravel().tolist(),
        references=references.ravel().tolist(),
        average="binary",
    )

# 4. Create a Trainer instance
# No eval_dataset is passed: there is no validation split, so only the
# training data is wired in.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 5. Train the model and keep the returned training summary
train_log = trainer.train()

# Save the fine-tuned model
trainer.save_model("./models/BERT_Multi-Label_classification")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Maximum sequence length the tokenizer reports for this model; used as the
# truncation/padding length when tokenizing the input text.
max_length = tokenizer.model_max_length
max_length

512

In [None]:
# Finally, we evaluate the fine-tuned model from earlier on the test set.
# "multi_label_classification" is a model problem_type, not a pipeline task,
# so the pipeline is built with the "text-classification" task instead.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# top_k=None returns the score of every label for every text (the default is
# only the single best label, which cannot represent a multi-label result).
# function_to_apply="sigmoid" scores each label independently.
results = classifier(
    test_dataset['comment_text'],
    top_k=None,
    function_to_apply="sigmoid",
    max_length=max_length,
    padding="max_length",
    truncation=True,
)

# Turn the per-label scores into a multi-hot prediction matrix whose columns
# follow the model's label ids (the same order as label_map).
threshold = 0.5
label2id = model.config.label2id
predictions = np.zeros((len(results), len(label_map)), dtype=np.int64)
for row_idx, label_scores in enumerate(results):
    for item in label_scores:
        predictions[row_idx, label2id[item["label"]]] = int(item["score"] >= threshold)

# One row per test comment, one 0/1 column per label.
dfResults = pd.DataFrame(predictions, columns=list(label_map))

references = np.asarray(test_dataset['labels']).astype(np.int64)

# Binary F1 over the flattened (example, label) pairs pools TP/FP/FN across
# all labels, i.e. it is the micro-averaged multi-label F1 score.
f1 = metric.compute(
    predictions=predictions.ravel().tolist(),
    references=references.ravel().tolist(),
    average='binary',
)
print(f1)