Installing the required Libraries

In [1]:
!pip install --upgrade transformers
!pip install datasets scikit-learn torch -q



Importing the required libraries

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset

Load the datasets from Hugging Face

In [4]:
# Fetch the training split of the Civil Comments dataset.
dataset = load_dataset("civil_comments", split="train")

Here I keep only the first 3,000 examples to reduce model training time and memory usage.

In [5]:
# Keep rows 0..2999 of the training split as the working subset.
small_dataset = dataset.select(range(0, 3000))

Converting into Dataframe to visualize the data

In [6]:
# Materialise the 3000-row subset as a pandas DataFrame for inspection.
df = small_dataset.to_pandas()

In [7]:
# Keep only the rows whose toxicity score is at least 0.5 (the same threshold
# that add_label uses to mark a comment as toxic).
df = df.loc[df["toxicity"] >= 0.5]

In [8]:
# Display the filtered frame (rows with toxicity >= 0.5).
df

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.000000,0.0,0.872340,0.021277,0.000000
5,ur a sh*tty comment.,0.666667,0.047619,0.638095,0.0,0.333333,0.000000,0.009524
13,It's ridiculous that these guys are being call...,0.600000,0.000000,0.100000,0.1,0.600000,0.000000,0.000000
14,This story gets more ridiculous by the hour! A...,0.500000,0.000000,0.000000,0.0,0.300000,0.000000,0.000000
19,"Angry trolls, misogynists and Racists"", oh my....",0.500000,0.000000,0.000000,0.0,0.500000,0.100000,0.000000
...,...,...,...,...,...,...,...,...
2964,"In his letter, Cook also makes the entirely co...",0.600000,0.000000,0.200000,0.0,0.600000,0.000000,0.000000
2974,Every government is prone to tyranny unless th...,0.700000,0.000000,0.000000,0.2,0.200000,0.700000,0.000000
2989,"""40-20\n12 hours ago\nLabeling others with the...",0.600000,0.000000,0.000000,0.0,0.500000,0.500000,0.000000
2991,"""This law subverts our ability to be watchful....",0.500000,0.000000,0.100000,0.2,0.400000,0.200000,0.000000


Test-Train Splitting

In [9]:
# Hold out 20% of the subset for testing. The split shuffles the rows, so a fixed
# seed is passed to get the same train/test partition on every run.
dataset = small_dataset.train_test_split(test_size=0.2, seed=42)

Add binary Label Column

In [10]:
def add_label(row):
    """Attach a binary ``label`` to a single example.

    The example is marked 1 (toxic) when its ``toxicity`` score is at least
    0.5 and 0 otherwise. The row is modified in place and also returned.
    """
    is_toxic = row["toxicity"] >= 0.5
    row["label"] = int(is_toxic)
    return row

# Run add_label over every example of both the train and the test split.
dataset = dataset.map(function=add_label)

Map: 100%|██████████| 2400/2400 [00:00<00:00, 7520.22 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 15951.06 examples/s]


In [11]:
# Convert both labelled splits to pandas DataFrames for inspection.
df_train, df_test = (
    dataset[split_name].to_pandas() for split_name in ("train", "test")
)

In [12]:
# Training rows that were labelled toxic (label == 1).
df_train_1 = df_train.loc[df_train["label"].eq(1)]

In [13]:
# Display the toxic training examples.
df_train_1

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit,label
10,"I am not concerned about the ""privacy, persona...",0.600000,0.000000,0.000000,0.200000,0.500000,0.300000,0.0,1
22,Took this as an opportunity to check back in o...,0.700000,0.100000,0.000000,0.000000,0.700000,0.000000,0.0,1
42,"Tim: ""...randomizing who gets what comment on ...",0.500000,0.000000,0.000000,0.000000,0.500000,0.300000,0.0,1
46,The Liars Anonymous meeting today will be at 4...,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,1
68,"Well shit, they drafted a guide. We should al...",0.818182,0.045455,0.772727,0.015152,0.287879,0.000000,0.0,1
...,...,...,...,...,...,...,...,...,...
2364,Yet call out all Muslims for the acts of a few...,0.912500,0.050000,0.237500,0.112500,0.887500,0.612500,0.0,1
2366,If I recall correctly did not LTD just announc...,0.500000,0.000000,0.300000,0.000000,0.500000,0.000000,0.0,1
2373,"Considering the real facts of this issue, I be...",0.700000,0.000000,0.300000,0.100000,0.600000,0.000000,0.0,1
2386,"They are all equally odious , not for the view...",0.750000,0.016667,0.133333,0.000000,0.716667,0.116667,0.0,1


Tokenize the text
The DistilBERT tokenizer is used to tokenize the inputs. With truncation=True and max_length=128, sequences longer than 128 tokens are truncated; with padding=True, shorter sequences are padded with the pad token (id 0) to the length of the longest sequence in their batch, so every sequence in a batch has the same length of at most 128 tokens.

In [14]:
# Load the tokenizer that matches the "distilbert-base-uncased" checkpoint
# fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(row):
    """Tokenize the ``text`` field of an example or batch of examples.

    Sequences are truncated to at most 128 tokens and padded; the result is
    whatever the module-level ``tokenizer`` returns for those settings.
    """
    texts = row["text"]
    encoding_options = {"truncation": True, "padding": True, "max_length": 128}
    return tokenizer(texts, **encoding_options)

# Tokenize both splits; batched=True hands tokenize_fn many rows per call.
tokenized_dataset = dataset.map(function=tokenize_fn, batched=True)

Map: 100%|██████████| 2400/2400 [00:00<00:00, 18859.14 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 19421.82 examples/s]


Loading the pretrained "distilbert-base-uncased" model

In [15]:
# Load DistilBERT with a sequence-classification head. num_labels=2 matches the
# binary 0/1 label column created by add_label; the head weights are newly
# initialised and only become useful after fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define the training arguments

In [16]:
from transformers import TrainingArguments

In [17]:
# Fine-tuning hyper-parameters; every option not listed here keeps its
# TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
)

Define the evaluation metrics

In [18]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        pred: Evaluation result exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision/recall/F1 are computed for the positive class
        (``average='binary'``).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # precision_recall_fscore_support returns FOUR values
    # (precision, recall, fscore, support); the support entry is not needed
    # here, but it must still be unpacked or the assignment raises ValueError.
    # zero_division=0 reports 0 (without a warning) when a split contains no
    # predicted or no true positive examples.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

Train the model

In [19]:
# Wire the model, hyper-parameters, tokenized splits and metric function together.
# NOTE(review): compute_metrics is only invoked during evaluation; the cells shown
# here neither enable an evaluation strategy nor call trainer.evaluate().
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # The `tokenizer=` keyword of Trainer is deprecated in favour of
    # `processing_class=`; the install cell upgrades transformers to the latest
    # release, so the current keyword is used here.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.1709


TrainOutput(global_step=600, training_loss=0.161343518892924, metrics={'train_runtime': 59.8595, 'train_samples_per_second': 80.188, 'train_steps_per_second': 10.023, 'total_flos': 158960878387200.0, 'train_loss': 0.161343518892924, 'epoch': 2.0})

Evaluation

In [20]:
import torch

# The prompt checks below run on the CPU, so place the fine-tuned model there.
device = torch.device("cpu")
model = model.to(device)

def classify_prompt(prompt):
    """Classify a single user prompt as ``"safe"`` or ``"unsafe"``.

    Args:
        prompt: The text to classify (one prompt per call).

    Returns:
        ``"unsafe"`` when the model's highest-scoring class is 1 (toxic),
        otherwise ``"safe"``.
    """
    # Training leaves the model in train mode, where dropout makes predictions
    # random; switch to eval mode so the same prompt always gets the same label.
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    # Keep the input tensors on the same device as the model's weights.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # No gradients are needed for inference; skipping them saves time and memory.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = outputs.logits.argmax(dim=-1).item()
    return "unsafe" if prediction == 1 else "safe"


# A few abusive and benign sample prompts for a quick sanity check.
prompts = [
    "You're a loser person!",
    "I need help with my order.",
    "You're trash and no one likes you.",
    "Can you assist me with my laptop?",
]

# Print each prompt next to the label the classifier assigns to it.
for sample in prompts:
    verdict = classify_prompt(sample)
    print(f"{sample} --> {verdict}")

You're a loser person! --> unsafe
I need help with my order. --> safe
You're trash and no one likes you. --> unsafe
Can you assist me with my laptop? --> safe
