Installing the required libraries

In [1]:
!pip install --upgrade transformers
!pip install datasets scikit-learn torch -q



Importing the required libraries

In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset

Load the datasets from Hugging Face

In [4]:
dataset = load_dataset("civil_comments",split="train")

Here I keep only the first 3,000 examples to reduce the model training time and memory usage.

In [5]:
small_dataset = dataset.select(range(3000))

Converting into Dataframe to visualize the data

In [6]:
df = small_dataset.to_pandas()

In [7]:
df=df[df["toxicity"]>=.5]

In [8]:
df

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.000000,0.0,0.872340,0.021277,0.000000
5,ur a sh*tty comment.,0.666667,0.047619,0.638095,0.0,0.333333,0.000000,0.009524
13,It's ridiculous that these guys are being call...,0.600000,0.000000,0.100000,0.1,0.600000,0.000000,0.000000
14,This story gets more ridiculous by the hour! A...,0.500000,0.000000,0.000000,0.0,0.300000,0.000000,0.000000
19,"Angry trolls, misogynists and Racists"", oh my....",0.500000,0.000000,0.000000,0.0,0.500000,0.100000,0.000000
...,...,...,...,...,...,...,...,...
2964,"In his letter, Cook also makes the entirely co...",0.600000,0.000000,0.200000,0.0,0.600000,0.000000,0.000000
2974,Every government is prone to tyranny unless th...,0.700000,0.000000,0.000000,0.2,0.200000,0.700000,0.000000
2989,"""40-20\n12 hours ago\nLabeling others with the...",0.600000,0.000000,0.000000,0.0,0.500000,0.500000,0.000000
2991,"""This law subverts our ability to be watchful....",0.500000,0.000000,0.100000,0.2,0.400000,0.200000,0.000000


Test-Train Splitting

In [9]:
# Fix the shuffle seed so the 80/20 split (and every metric computed on it)
# is the same on each run of the notebook.
dataset = small_dataset.train_test_split(test_size=0.2, seed=42)

Add binary Label Column

In [10]:
def add_label(row):
    """Attach a binary ``label`` field to one example, in place.

    The label is 1 when ``row["toxicity"]`` is at least 0.5 and 0 otherwise.
    The same (mutated) ``row`` mapping is returned.
    """
    is_toxic = row["toxicity"] >= 0.5
    row["label"] = int(is_toxic)
    return row

dataset = dataset.map(add_label)

Map: 100%|██████████| 2400/2400 [00:00<00:00, 5970.88 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 13009.90 examples/s]


In [11]:
df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()

In [12]:
df_train_1=df_train[df_train["label"]==1]

In [13]:
df_train_1

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit,label
13,Silat you are a moron,0.903226,0.064516,0.145161,0.000000,0.887097,0.000000,0.0,1
62,Um. Bernie supporters attack. Thom Hartmann a...,0.500000,0.000000,0.000000,0.000000,0.500000,0.000000,0.0,1
96,Some people are so dumb that they will vote fo...,0.611940,0.000000,0.089552,0.000000,0.611940,0.000000,0.0,1
107,"""Anybody who can kiss that many asses, that qu...",0.600000,0.000000,0.500000,0.000000,0.500000,0.100000,0.5,1
117,"On its face, Glueck's claim that AG Rosenblum ...",0.500000,0.000000,0.000000,0.000000,0.500000,0.000000,0.0,1
...,...,...,...,...,...,...,...,...,...
2288,It's ridiculous that these guys are being call...,0.600000,0.000000,0.100000,0.100000,0.600000,0.000000,0.0,1
2291,This couple is just milking this....the city s...,0.800000,0.000000,0.500000,0.000000,0.300000,0.000000,0.3,1
2298,Where do you get the Sour Grapes to say this i...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.0,1
2318,Dear world:\nJust a reminder that none of thes...,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,1


Tokenize the text
The DistilBERT tokenizer is used to tokenize the inputs. `truncation=True` together with `max_length=128` cuts longer inputs down to at most 128 tokens, and `padding=True` pads shorter inputs with extra 0s up to the longest sequence in each batch, so that all sequences in a batch have the same length.

In [14]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(row):
    """Tokenize the ``text`` field of ``row`` with the module-level tokenizer.

    Inputs are truncated to at most 128 tokens and padded (``padding=True``).
    Returns whatever the tokenizer call returns.
    """
    texts = row["text"]
    return tokenizer(
        texts,
        max_length=128,
        padding=True,
        truncation=True,
    )

tokenized_dataset = dataset.map(tokenize_fn, batched=True)

Map: 100%|██████████| 2400/2400 [00:00<00:00, 12543.90 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 15338.56 examples/s]


Loading the pretrained "distilbert-base-uncased" model

In [15]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define the training arguments

In [16]:
from transformers import TrainingArguments

In [17]:
# Hyper-parameters for fine-tuning; checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    # optimisation schedule
    num_train_epochs=2,
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

Define the evaluation metrics

In [18]:
def compute_metrics(pred):
    """Compute binary-classification metrics for an evaluation run.

    Args:
        pred: Prediction object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores); the arg-max over the last
            axis of ``predictions`` is taken as the predicted class.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are computed with ``average='binary'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # precision_recall_fscore_support returns FOUR values
    # (precision, recall, f-score, support); unpacking into three names raises
    # ValueError, so the unused support is discarded explicitly.
    # zero_division=0 reports 0 (without a warning) when a metric is undefined,
    # e.g. when the model predicts no positive examples at all.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

Train the model

In [None]:
# Pad every batch to its longest sequence at collation time. The collator is
# passed explicitly instead of relying on Trainer's `tokenizer=` argument,
# which recent transformers releases deprecate in favour of
# `processing_class=`; `data_collator=` is accepted by old and new versions.
# NOTE: without the tokenizer attached, Trainer checkpoints contain only the
# model; call tokenizer.save_pretrained(...) separately if it is needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Evaluation

In [None]:
import torch

# Force CPU for inference
device = torch.device("cpu")
model.to(device)

# Function to classify a user prompt as "safe" or "unsafe"
def classify_prompt(prompt):
    """Classify a single user prompt as ``"safe"`` or ``"unsafe"``.

    Args:
        prompt: The text to classify.

    Returns:
        ``"unsafe"`` when the model's highest-scoring class is 1,
        otherwise ``"safe"``.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    # Keep the input tensors on the same device as the model.
    inputs = inputs.to(model.device)
    # Dropout must be off for inference; eval() guarantees that regardless of
    # the mode training left the model in, making predictions deterministic.
    model.eval()
    # No gradients are needed for prediction, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = outputs.logits.argmax(dim=-1).item()
    return "unsafe" if prediction == 1 else "safe"


# A few hand-written prompts to spot-check the classifier.
prompts = [
    "You're a loser person!",
    "I need help with my order.",
    "You're trash and no one likes you.",
    "Can you assist me with my laptop?",
]

# Print each prompt next to the label the classifier assigns to it.
for p in prompts:
    print(p, "-->", classify_prompt(p))