# A Single-Label NLP-Based Digital Forensic Framework for Bangla Social Media Content Analysis

**Authors:** Ashif Rabbani, Md. Sakib Muhtadee, Jannatul Ferdous

**Environment:** Google Colab

**Data Sources:** Kaggle, GitHub

## 1) Multi Labeled Bengali Toxic Comments

IEEE: https://doi.org/10.1109/ECCE57851.2023.10101588

arXiv: https://arxiv.org/abs/2304.04087

ResearchGate: https://www.researchgate.net/publication/369924719_Interpretable_Multi_Labeled_Bengali_Toxic_Comments_Classification_using_Deep_Learning



In [1]:
import kagglehub
import os

# To keep the download somewhere other than kagglehub's default cache, set the
# KAGGLEHUB_CACHE environment variable (via os.environ) before the call below.

# Kaggle handle of the dataset; the latest published version is fetched.
DATASET_HANDLE = "tanveerbelaliut/multi-labeled-bengali-toxic-comments"

# dataset_download returns the local directory that holds the extracted files.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Dataset downloaded to:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/tanveerbelaliut/multi-labeled-bengali-toxic-comments?dataset_version_number=1...


100%|██████████| 904k/904k [00:00<00:00, 65.6MB/s]

Extracting files...
Dataset downloaded to: /root/.cache/kagglehub/datasets/tanveerbelaliut/multi-labeled-bengali-toxic-comments/versions/1





In [2]:
!pwd
import pandas as pd

# Read the raw annotation table from the directory the dataset was downloaded to.
raw_df = pd.read_csv(f"{path}/Multi_labeled_toxic_comments.csv")

# Annotation columns shipped with the dataset (presumably 0/1 flags -- verify
# against the CSV).
label_cols = ['vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']

# A row counts as neutral when its annotation columns sum to zero.
no_label_set = raw_df[label_cols].sum(axis=1).eq(0)

multilabel_df = (
    raw_df
    .assign(neutral=no_label_set.astype(int))
    .loc[lambda frame: frame['troll'] != 1]  # discard every troll-tagged row
    .drop(columns=['troll'])                 # then the column itself
)
multilabel_df.head(5)

/content


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/multi-labeled-bengali-toxic-comments/Multi_labeled_toxic_comments.csv'

In [None]:
# Dataset annotation column -> label name used in the rest of this notebook.
LABEL_RENAMES = {
    "vulgar": "toxic",
    "hate": "hate_speech",
    "religious": "harassment",
    "threat": "violence",
    "Insult": "cyberbullying",
}

# Order matters: idxmax returns the FIRST column holding the row maximum, so a
# comment tagged with several categories is assigned the earliest one listed here.
label_cols = [
    "toxic",
    "hate_speech",
    "harassment",
    "violence",
    "cyberbullying",
    "neutral",
]

# Columns this cell needs on its first run (before renaming).
_source_cols = list(LABEL_RENAMES) + ["neutral"]
_missing_cols = [col for col in _source_cols if col not in multilabel_df.columns]

if not _missing_cols:
    _renamed_df = multilabel_df.rename(columns=LABEL_RENAMES)
    # Collapse the per-category flags into one string label per row, then drop
    # the flag columns so only the single-label view remains.
    multilabel_df = (
        _renamed_df
        .assign(label=_renamed_df[label_cols].idxmax(axis=1))
        .drop(columns=label_cols)
    )
elif "label" not in multilabel_df.columns:
    # Neither the raw flag columns nor the collapsed label exist: fail loudly.
    raise KeyError(f"multilabel_df is missing expected columns: {_missing_cols}")
# Otherwise the frame was already collapsed by an earlier run of this cell, so
# re-running is a no-op instead of a KeyError.

multilabel_df.head(5)

In [None]:
!pip install torch torchvision torchaudio transformers datasets scikit-learn

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd


In [None]:
# Use the GPU when CUDA is available; otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)




In [None]:
# Fit the encoder on the string labels, then store the integer ids beside them.
label_encoder = LabelEncoder().fit(multilabel_df["label"])
multilabel_df["label_id"] = label_encoder.transform(multilabel_df["label"])

# Number of distinct classes the classifier head must output.
NUM_LABELS = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)


In [None]:
class BanglaDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of input texts; each item is passed through ``str``.
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            and expected to return a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialize as lists so __getitem__ always indexes by POSITION. A
        # pandas Series with a non-contiguous index (e.g. after dropping rows)
        # would otherwise be looked up by label and raise KeyError or return
        # the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (leading dimension
            of the tokenizer output removed) and ``labels`` (0-d long tensor).
        """
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        # return_tensors='pt' is assumed to yield a leading batch dimension of
        # size 1; squeeze it so the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': label
        }


In [None]:
# Hold out 10% of the rows for validation. The split is stratified on the
# class id and uses a fixed seed so it is repeatable.
train_df, val_df = train_test_split(
    multilabel_df,
    test_size=0.1,
    stratify=multilabel_df["label_id"],
    random_state=42,
)


In [None]:
# Checkpoint shared by the tokenizer and the classifier; a smaller distilled
# Bangla BERT checkpoint can be substituted here.
MODEL_NAME = "sagorsarker/bangla-bert-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the sequence-classification model with NUM_LABELS outputs and place it
# on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
).to(device)


In [None]:
# Wrap each split's texts and class ids in a BanglaDataset (default max_len).
train_dataset, val_dataset = (
    BanglaDataset(split["text"].tolist(), split["label_id"].tolist(), tokenizer)
    for split in (train_df, val_df)
)

# Batch size 8 was chosen as "GTX 1070 safe"; only training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)



In [None]:
# AdamW over every model parameter with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Cross-entropy over the logits: each example carries exactly one class id.
criterion = nn.CrossEntropyLoss()



In [None]:
# Number of full passes over the training set.
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()  # switch the model to training mode for this epoch
    batch_losses = []

    for batch in train_loader:
        # Move the three tensors of the batch onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses over the epoch.
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")


In [None]:
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and score it.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        dataloader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        dict with ``accuracy``, ``micro_f1`` and ``macro_f1``. For single-label
        multiclass predictions micro-F1 equals accuracy, so ``macro_f1``
        (unweighted mean of per-class F1) is the value that reflects how the
        rarer classes are doing.

    Note:
        Uses the notebook-level ``device`` and leaves the model in eval mode.
    """
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit per example.
            predictions = torch.argmax(outputs.logits, dim=-1)

            # Collect plain Python ints so the metric inputs are simple lists.
            preds.extend(predictions.cpu().tolist())
            true_labels.extend(batch['labels'].cpu().tolist())

    acc = accuracy_score(true_labels, preds)
    micro_f1 = f1_score(true_labels, preds, average='micro')
    macro_f1 = f1_score(true_labels, preds, average='macro')
    return {"accuracy": acc, "micro_f1": micro_f1, "macro_f1": macro_f1}

# Score the trained model on the validation loader and show the metrics.
metrics = evaluate(model=model, dataloader=val_loader)
print(metrics)
