In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertForSequenceClassification, get_scheduler
from tqdm import tqdm
import time
import joblib

# Check if the current `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
# is available, and if not, use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cpu device


In [None]:
# Load and drop missing values
df = pd.read_excel("train.xlsx")
df = df.dropna(subset=["comment_text", "cyberbullying"])

# Ensure types
df["comment_text"] = df["comment_text"].astype(str)
df["cyberbullying"] = df["cyberbullying"].astype(int)

# Extra safety: remove non-stringy things
def is_valid_text(t):
    return isinstance(t, str) and len(t.strip()) > 0

df = df[df["comment_text"].apply(is_valid_text)]

# Final clean list conversion
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["comment_text"].tolist(),
    df["cyberbullying"].tolist(),
    test_size=0.3, # 30% test size
    random_state=42
)

# Split train into train and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,  # 10% of the original train set goes to validation
    random_state=42
)


In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize with attention masks and return tensors
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")


In [4]:
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

In [5]:
class CyberbullyingDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

train_dataset = CyberbullyingDataset(train_encodings, train_labels)
val_dataset = CyberbullyingDataset(val_encodings, val_labels)
test_dataset = CyberbullyingDataset(test_encodings, test_labels)


In [6]:
teacher_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2  # Binary classification
)
teacher_model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [8]:
optimizer = AdamW(teacher_model.parameters(), lr=2e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

loss_fn = torch.nn.CrossEntropyLoss()



In [9]:
for epoch in range(num_epochs):
    start_time = time.time()

    teacher_model.train()
    total_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)

    # Validation loop
    teacher_model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    end_time = time.time()
    epoch_time = end_time - start_time

    print(f"\nEpoch {epoch+1} training loss: {avg_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Epoch {epoch+1} took {epoch_time:.2f} seconds\n")

Epoch 1/3: 100%|██████████| 6279/6279 [14:14<00:00,  7.35it/s, loss=0.0182]  



Epoch 1 training loss: 0.0958
Validation Accuracy: 0.9694
Epoch 1 took 880.56 seconds



Epoch 2/3: 100%|██████████| 6279/6279 [14:12<00:00,  7.36it/s, loss=0.000743]



Epoch 2 training loss: 0.0517
Validation Accuracy: 0.9678
Epoch 2 took 879.03 seconds



Epoch 3/3: 100%|██████████| 6279/6279 [14:11<00:00,  7.38it/s, loss=0.00163] 



Epoch 3 training loss: 0.0214
Validation Accuracy: 0.9689
Epoch 3 took 877.55 seconds



In [10]:
student_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
student_model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [16]:
def distill_student(
    teacher_model, student_model,
    train_loader, val_loader,
    optimizer, temperature=2.0, alpha=0.5, epochs=5
):
    loss_ce = nn.CrossEntropyLoss()  # For hard labels

    for epoch in range(epochs):
        student_model.train()
        teacher_model.eval()
        total_loss = 0
        start_time = time.time()

        loop = tqdm(train_loader, desc=f"Distill Epoch {epoch+1}/{epochs}")

        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Teacher predictions
            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
                teacher_logits = teacher_outputs.logits / temperature

            # Student predictions
            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
            student_logits = student_outputs.logits / temperature

            # Distillation loss (KL + CE)
            soft_loss = F.kl_div(
                input=F.log_softmax(student_logits, dim=-1),
                target=F.softmax(teacher_logits, dim=-1),
                reduction="batchmean"
            ) * (temperature ** 2)

            hard_loss = loss_ce(student_outputs.logits, labels)
            loss = alpha * soft_loss + (1 - alpha) * hard_loss
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_postfix(loss=loss.item())

        avg_train_loss = total_loss / len(train_loader)
        end_time = time.time()
        print(f"\nEpoch {epoch+1} distillation loss: {avg_train_loss:.4f}")
        print(f"Epoch time: {end_time - start_time:.2f}s")

        # Validation
        student_model.eval()
        val_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                val_loss += loss_ce(logits, labels).item()

                predictions = torch.argmax(logits, dim=-1)
                correct += (predictions == labels).sum().item()
                total += labels.size(0)

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct / total
        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}\n")

    return student_model




In [None]:
student_optimizer = AdamW(student_model.parameters(), lr=2e-5)

binary_classification_kd = distill_student(
    teacher_model=teacher_model,
    student_model=student_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=student_optimizer,
    temperature=2.0,
    alpha=0.5,
    epochs=3
)


Distill Epoch 1/3: 100%|██████████| 6279/6279 [11:29<00:00,  9.11it/s, loss=0.00206] 



Epoch 1 distillation loss: 0.0307
Epoch time: 689.34s
Validation Loss: 0.1390, Accuracy: 0.9667



Distill Epoch 2/3: 100%|██████████| 6279/6279 [11:44<00:00,  8.91it/s, loss=9.7e-5]  



Epoch 2 distillation loss: 0.0233
Epoch time: 704.33s
Validation Loss: 0.1407, Accuracy: 0.9677



Distill Epoch 3/3: 100%|██████████| 6279/6279 [11:36<00:00,  9.02it/s, loss=0.0176]  



Epoch 3 distillation loss: 0.0208
Epoch time: 696.38s
Validation Loss: 0.1428, Accuracy: 0.9654



In [4]:
torch.save(binary_classification_kd.state_dict(), 'models/binary_classification_kd.pth')

NameError: name 'binary_classification_kd' is not defined

In [10]:
from transformers import DistilBertForSequenceClassification

student_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
student_model.load_state_dict(torch.load('models/binary_classification_kd.pth'))
student_model.to(device)

  mj = torch.load('models/binary_classification_kd.joblib',map_location=torch.device('cpu'))


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
from transformers import DistilBertTokenizer
import torch

# Load model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
model.load_state_dict(torch.load("models/binary_classification_kd.pth"))
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to GPU if available

text = ["You're such a loser!"]
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

print("Prediction:", predictions.item())



Prediction: 1


In [11]:
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType

In [12]:
# Export the trained student model to ONNX format
onnx_model_path = "models/binary_classification_kd.onnx"

dummy_input = {
    "input_ids": torch.randint(0, 1000, (1, 128)).to(device),
    "attention_mask": torch.ones((1, 128)).to(device)
}

torch.onnx.export(
    student_model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
    onnx_model_path,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch_size"}, "attention_mask": {0: "batch_size"}},
    opset_version=14
)

print(f"ONNX model exported to {onnx_model_path}")

NameError: name 'student_model' is not defined

In [None]:
# Quantize the ONNX model for optimized inference
quantized_model_path = "models/binary_classification_kd_quantized.onnx"

quantize_dynamic(
    model_input=onnx_model_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QINT8
)

print(f"Quantized ONNX model saved to {quantized_model_path}")

In [None]:
# Load and test the quantized ONNX model
ort_session = ort.InferenceSession(quantized_model_path)

text = ["You're such a loser!"]
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="np")

onnx_inputs = {
    "input_ids": inputs["input_ids"].numpy(),
    "attention_mask": inputs["attention_mask"].numpy()
}

onnx_outputs = ort_session.run(None, onnx_inputs)
predictions = torch.argmax(torch.tensor(onnx_outputs[0]), dim=1)

print("ONNX Prediction:", predictions.item())