In [1]:
!pip install transformers datasets torch accelerate peft


Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-1

In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("/kaggle/input/cyber-bullying-new/Approach to Social Media Cyberbullying and Harassment Detection Using Advanced Machine Learning.csv")

# Show dataset info
print("Columns in dataset:", df.columns.tolist())
print("\nFirst 5 rows:")
print(df.head())

# If there is a column with messages/text, display a few random samples
if 'text' in df.columns:
    print("\nSample messages:")
    print(df['text'].sample(5).to_list())
elif 'message' in df.columns:
    print("\nSample messages:")
    print(df['message'].sample(5).to_list())
else:
    print("\nNo 'text' or 'message' column found. Check actual column names above.")


Columns in dataset: ['Text', 'Label', 'Types']

First 5 rows:
                                                Text           Label  \
0  Ten outside soon doctor shake everyone treatme...    Not-Bullying   
1  my life has come to a standstill and at this p...    Not-Bullying   
2         girl this nigga make me sick to my stomach        Bullying   
3                                   I wanna fuck you        Bullying   
4  Oh hey, you should be ashamed of your disgusti...  Not - Bullying   

       Types  
0        NaN  
1        NaN  
2  Ethnicity  
3     Sexual  
4        NaN  

No 'text' or 'message' column found. Check actual column names above.


In [3]:
from sklearn.model_selection import train_test_split
# Keep only the two useful columns
df = df[['Text', 'Label']]

# Clean and encode labels
df['Label'] = df['Label'].str.strip().str.lower()   # remove spaces + lowercase
df['label_id'] = df['Label'].map({'bullying': 1, 'not-bullying': 0})

# Drop rows with missing values
df = df.dropna(subset=['Text', 'label_id'])
df['label_id'] = df['label_id'].astype(int)

# Print dataset info
print("Unique labels:", df['Label'].unique())
print("Label distribution:\n", df['label_id'].value_counts())

# Split dataset (80/10/10)
train_df, temp_df = train_test_split(
    df, 
    test_size=0.2, 
    stratify=df['label_id'], 
    random_state=42
)
dev_df, test_df = train_test_split(
    temp_df, 
    test_size=0.5, 
    stratify=temp_df['label_id'], 
    random_state=42
)

print(f"\nTrain: {len(train_df)}, Dev: {len(dev_df)}, Test: {len(test_df)}")

Unique labels: ['not-bullying' 'bullying']
Label distribution:
 label_id
1    4826
0    3004
Name: count, dtype: int64

Train: 6264, Dev: 783, Test: 783


In [3]:
# ==========================
# Model Development - DeBERTa V3 Small (Fast Debug Version)
# ==========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, logging

# ------------------------
# 1. Load and preprocess
# ------------------------
df = pd.read_csv("/kaggle/input/cyber-bullying-new/Approach to Social Media Cyberbullying and Harassment Detection Using Advanced Machine Learning.csv")

# Keep only needed columns
df = df[['Text', 'Label']]
df['Label'] = df['Label'].str.strip().str.lower()

# Map to numeric labels
df['label'] = df['Label'].map({'bullying': 1, 'not-bullying': 0})
df = df.dropna(subset=['Text', 'label'])
df['label'] = df['label'].astype(int)

# Split dataset (80/10/10 stratified)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f"Train: {len(train_df)}, Dev: {len(dev_df)}, Test: {len(test_df)}")

# Convert to Hugging Face Dataset
train_ds = Dataset.from_pandas(train_df)
dev_ds = Dataset.from_pandas(dev_df)
test_ds = Dataset.from_pandas(test_df)

# ------------------------
# 2. Tokenization
# ------------------------
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["Text"], padding="max_length", truncation=True, max_length=128)  # reduced max_length for speed

train_ds = train_ds.map(tokenize, batched=True)
dev_ds = dev_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
dev_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# ------------------------
# 3. Model
# ------------------------
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# ------------------------
# 4. Metrics (scikit-learn)
# ------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# ------------------------
# 5. Training
# ------------------------
logging.set_verbosity_info()  # ensure logs are shown

training_args = TrainingArguments(
    output_dir="./deberta_results",
    do_eval=True,
    per_device_train_batch_size=32,   # faster with bigger batch
    per_device_eval_batch_size=32,
    num_train_epochs=50,               # just 1 epoch for quick run
    learning_rate=5e-5,               # slightly higher LR for faster learning
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,                 # frequent logging
    save_total_limit=1,
    report_to="none",                 # no external trackers (wandb etc.)
    disable_tqdm=False                # enables progress bar
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# ------------------------
# 6. Evaluation
# ------------------------
metrics = trainer.evaluate(test_ds)
print("Test metrics:", metrics)


2025-09-16 01:40:19.097881: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757986819.325452      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757986819.393538      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Train: 6264, Dev: 783, Test: 783


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/6264 [00:00<?, ? examples/s]

Map:   0%|          | 0/783 [00:00<?, ? examples/s]

Map:   0%|          | 0/783 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
PyTorch: setting up devices
  trainer = Trainer(
Safetensors PR exists
The following columns in the Training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: Label, __index_level_0__, Text. If Label, __index_level_0__, Text are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6,264
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9,800
  Numb

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Step,Training Loss
10,0.6788
20,0.5763
30,0.4198
40,0.3646
50,0.3459
60,0.2476
70,0.3585
80,0.3183
90,0.2701
100,0.2627


Saving model checkpoint to ./deberta_results/checkpoint-500
Configuration saved in ./deberta_results/checkpoint-500/config.json
Model weights saved in ./deberta_results/checkpoint-500/model.safetensors
tokenizer config file saved in ./deberta_results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./deberta_results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./deberta_results/checkpoint-1000
Configuration saved in ./deberta_results/checkpoint-1000/config.json
Model weights saved in ./deberta_results/checkpoint-1000/model.safetensors
tokenizer config file saved in ./deberta_results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./deberta_results/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [deberta_results/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./deberta_results/checkpoint-1500
Configuration saved in ./deberta_results/checkpoint-1500/config.json
Model weights saved in ./debe

Test metrics: {'eval_loss': 0.5906355381011963, 'eval_accuracy': 0.9361430395913155, 'eval_precision': 0.9366492703470523, 'eval_recall': 0.9361430395913155, 'eval_f1': 0.9362945898231654, 'eval_runtime': 1.9365, 'eval_samples_per_second': 404.331, 'eval_steps_per_second': 12.91, 'epoch': 50.0}


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Path to your trained checkpoint
model_path = "./deberta_results/checkpoint-9800"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Put model in evaluation mode
model.eval()

def predict(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    
    # Forward pass
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        pred = torch.argmax(probs, dim=-1).item()

    return pred, probs.tolist()

# Example usage
while True:
    user_input = input("✍️ Enter your response (type 'x' to close): ")
    if user_input.lower() == 'x':
        print("🔒 Closed.")
        break
    
    label, probabilities = predict(user_input)
    
    # Assuming label mapping: 0 = Not Bullying, 1 = Bullying
    if label == 1:
        print(f"🚨 Prediction: Bullying (confidence: {probabilities[0][label]:.2f})")
    else:
        print(f"✅ Prediction: Not Bullying (confidence: {probabilities[0][label]:.2f})")


loading file spm.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file ./deberta_results/checkpoint-9800/config.json
Model config DebertaV2Config {
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,


✍️ Enter your response (type 'x' to close):  My phone is not working. Can you fix it?


✅ Prediction: Not Bullying (confidence: 0.99)


✍️ Enter your response (type 'x' to close):  I have my photo in my diary. Do you wanna see it?


🚨 Prediction: Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  What's the time?


✅ Prediction: Not Bullying (confidence: 0.91)


✍️ Enter your response (type 'x' to close):  I'd like to fly in the air.


✅ Prediction: Not Bullying (confidence: 0.97)


✍️ Enter your response (type 'x' to close):  This place is really beautiful. This is a old school. I am a regular visitor.


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  This place is really beautiful. This is a brothel. I am a regular visitor.


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  This is a beautiful hotel. 


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  This is a beautiful hotel. Do you wanna come with me?


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  This is a beautiful hotel. Do you wanna come with me? I wanna book a room for us and spend a couple of days with you.


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  Do you wanna fun in the bedroom with me?


🚨 Prediction: Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  You have a nice car. When did you buy it?


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  You have a nice hair style? Where did you make it?


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  You have a nice hair style. It attracts me.


🚨 Prediction: Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  You  have nice cloths. May I know how do you look without them?


✅ Prediction: Not Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  You have nice clothes. May I know how do you look without them?


🚨 Prediction: Bullying (confidence: 0.96)


✍️ Enter your response (type 'x' to close):  You look amazing, but I don't like your face.


🚨 Prediction: Bullying (confidence: 0.90)


✍️ Enter your response (type 'x' to close):  Do you wanna drink coffee ?


🚨 Prediction: Bullying (confidence: 1.00)


✍️ Enter your response (type 'x' to close):  X


🔒 Closed.
