# A Single-Label NLP-Based Digital Forensic Framework for Bangla Social Media Content Analysis

**Authors:** Ashif Rabbani, Md. Sakib Muhtadee, Jannatul Ferdous

**Environment:** Google Colab

**Data Source:** Kaggle, Github

## 1) Multi Labeled Bengali Toxic Comments

IEEE: https://doi.org/10.1109/ECCE57851.2023.10101588

arXiv: https://arxiv.org/abs/2304.04087

ReserchGate: https://www.researchgate.net/publication/369924719_Interpretable_Multi_Labeled_Bengali_Toxic_Comments_Classification_using_Deep_Learning



In [7]:
import kagglehub
import os

# User-specified directory
#download_dir = "/home/sakib/Documents/MSC Project/contents"  # change as you like
#os.environ['KAGGLEHUB_CACHE'] = download_dir

# Download latest version to the specified path
path = kagglehub.dataset_download("tanveerbelaliut/multi-labeled-bengali-toxic-comments")
print("Dataset downloaded to:", path)


Using Colab cache for faster access to the 'multi-labeled-bengali-toxic-comments' dataset.
Dataset downloaded to: /kaggle/input/multi-labeled-bengali-toxic-comments


In [5]:
!pwd
import pandas as pd

multilabel_df = pd.read_csv("/kaggle/input/multi-labeled-bengali-toxic-comments/versions/1/Multi_labeled_toxic_comments.csv")
# list of label columns
label_cols = ['vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']
# create neutral column
multilabel_df['neutral'] = (multilabel_df[label_cols].sum(axis=1) == 0).astype(int)
multilabel_df = multilabel_df.drop(multilabel_df[multilabel_df['troll'] == 1].index)
multilabel_df = multilabel_df.drop(columns=['troll'])
multilabel_df.head(5)

/content


FileNotFoundError: [Errno 2] No such file or directory: './contents/datasets/tanveerbelaliut/multi-labeled-bengali-toxic-comments/versions/1/Multi_labeled_toxic_comments.csv'

In [None]:
import kagglehub
import os

# User-specified directory
#download_dir = "/home/sakib/Documents/MSC Project/contents"  # change as you like
#os.environ['KAGGLEHUB_CACHE'] = download_dir

# Download latest version to the specified path
path = kagglehub.dataset_download("tanveerbelaliut/multi-labeled-bengali-toxic-comments")
print("Dataset downloaded to:", path)


Using Colab cache for faster access to the 'multi-labeled-bengali-toxic-comments' dataset.
Dataset downloaded to: /kaggle/input/multi-labeled-bengali-toxic-comments


In [None]:
multilabel_df = multilabel_df.rename(columns={
    "vulgar": "toxic",
    "hate": "hate_speech",
    "religious": "harassment",
    "threat": "violence",
    "Insult": "cyberbullying"
})


label_cols = [
    "toxic",
    "hate_speech",
    "harassment",
    "violence",
    "cyberbullying",
    "neutral"
]

multilabel_df["label"] = multilabel_df[label_cols].idxmax(axis=1)

# Optional: drop old columns
multilabel_df= multilabel_df.drop(columns=label_cols)
multilabel_df.head(5)

Unnamed: 0,text,label
0,‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶®‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶π‡¶ï ‡¶∏‡¶æ‡¶π‡ßá‡¶¨‡ßá‡¶∞ ‡¶ï‡ßç‡¶∑‡¶§‡¶ø ‡¶π‡¶≤‡ßá ‡¶ú‡¶æ‡¶§‡¶ø‡¶∞ ‡¶∏‡ßç‡¶¨‡¶æ‡¶∞...,violence
1,"‡¶Ü‡¶Æ‡¶ø ‡¶¨‡¶≤‡¶≤‡¶æ‡¶Æ, ‚Äò‡¶¶‡ßá‡¶®‚Äô",neutral
2,‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶§‡¶æ‡¶®‡¶ú‡¶ø‡¶® ‡¶§‡¶ø‡¶∂‡¶æ ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶¨‡¶æ‡¶≤‡ßã ‡¶≤‡¶æ‡¶ó‡¶æ‡¶∞ ‡¶è‡¶ï‡¶ú‡¶®‡¶ï‡¶æ‡¶§‡¶æ‡¶∞ ...,neutral
3,‡¶§‡¶æ‡¶∞ ‡¶â‡¶™‡¶∞ ‡ß® ‡¶ú‡¶® ‡¶Æ‡ßá‡ßü‡¶∞,neutral
4,‡¶™‡¶≤‡¶æ‡¶∂‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶è‡¶§‡ßã ‡¶≠‡¶æ‡¶≤‡ßã ‡¶π‡¶¨‡ßá ‡¶ï‡¶≤‡ßç‡¶™‡¶®‡¶æ‡¶ì ‡¶ï‡¶∞‡¶ø ‡¶®‡¶æ‡¶á ‡¶§‡ßå‡¶π‡¶ø‡¶¶‡ßá...,neutral


In [None]:
!pip install transformers datasets torch scikit-learn accelerate
import torch
import pandas as pd
from datasets import Dataset
from sklearn.metrics import f1_score, precision_score, recall_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [None]:
NUM_LABELS = len(multilabel_df["label"].iloc[0])
dataset = Dataset.from_pandas(multilabel_df)
dataset = dataset.rename_column("label", "labels")

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

MODEL_NAME = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert = AutoModel.from_pretrained(MODEL_NAME)  # ‚úÖ works in Python 3.11

class MultiLabelBERT(nn.Module):
    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:,0]  # CLS token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

NUM_LABELS = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLabelBERT(bert, NUM_LABELS).to(device)



ValueError: Could not find BertModel neither in <module 'transformers.models.bert' from '/home/sakib/anaconda3/envs/torch_env/lib/python3.11/site-packages/transformers/models/bert/__init__.py'> nor in <module 'transformers' from '/home/sakib/anaconda3/envs/torch_env/lib/python3.11/site-packages/transformers/__init__.py'>!

In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

dataset = dataset.map(tokenize, batched=True)

In [None]:
dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_ds = dataset["train"]
val_ds = dataset["test"]

train_ds.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"]
)

val_ds.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"]
)


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int()

    return {
        "micro_f1": f1_score(labels, preds, average="micro"),
        "macro_f1": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="micro"),
        "recall": recall_score(labels, preds, average="micro")
    }


In [None]:
training_args = TrainingArguments(
    output_dir="./bangla_bert_multilabel",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # reduce if GPU < 8GB
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    fp16=torch.cuda.is_available(),  # üî• mixed precision on GPU
    report_to="none"
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
def predict(text, threshold=0.5):
    model.eval()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.sigmoid(outputs.logits)[0]
    return probs.cpu().numpy()

predict("‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶¨‡¶æ‡¶ú‡ßá ‡¶ï‡¶•‡¶æ ‡¶¨‡¶≤‡¶õ‡ßã")
