In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertForSequenceClassification, DistilBertModel, DistilBertTokenizer, AdamW, get_scheduler
from tqdm import tqdm
import time

# Check if the current `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
# is available, and if not, use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


In [3]:
# Load and drop missing values
df = pd.read_excel("train.xlsx")
df = df.dropna(subset=["comment_text", "cyberbullying"])

# Ensure types
df["comment_text"] = df["comment_text"].astype(str)
df["cyberbullying"] = df["cyberbullying"].astype(int)

# Extra safety: remove non-stringy things
def is_valid_text(t):
    return isinstance(t, str) and len(t.strip()) > 0

df = df[df["comment_text"].apply(is_valid_text)]

# Final clean list conversion
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["comment_text"].tolist(),
    df["cyberbullying"].tolist(),
    test_size=0.3,
    random_state=42
)

# Split train into train and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,  # 10% of the original train set goes to validation
    random_state=42
)


In [11]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize with attention masks and return tensors
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")


In [12]:
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

In [14]:
class CyberbullyingDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

train_dataset = CyberbullyingDataset(train_encodings, train_labels)
val_dataset = CyberbullyingDataset(val_encodings, val_labels)
test_dataset = CyberbullyingDataset(test_encodings, test_labels)


In [15]:
teacher_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2  # Binary classification
)
teacher_model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [18]:
optimizer = AdamW(teacher_model.parameters(), lr=2e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

loss_fn = torch.nn.CrossEntropyLoss()



In [19]:
for epoch in range(num_epochs):
    start_time = time.time()

    teacher_model.train()
    total_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)

    # Validation loop
    teacher_model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    end_time = time.time()
    epoch_time = end_time - start_time

    print(f"\nEpoch {epoch+1} training loss: {avg_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Epoch {epoch+1} took {epoch_time:.2f} seconds\n")

Epoch 1/3: 100%|██████████| 6279/6279 [14:46<00:00,  7.08it/s, loss=0.0273]  



Epoch 1 training loss: 0.0957
Validation Accuracy: 0.9705
Epoch 1 took 913.09 seconds



Epoch 2/3: 100%|██████████| 6279/6279 [14:33<00:00,  7.19it/s, loss=0.000222]



Epoch 2 training loss: 0.0526
Validation Accuracy: 0.9693
Epoch 2 took 899.95 seconds



Epoch 3/3: 100%|██████████| 6279/6279 [14:36<00:00,  7.16it/s, loss=8.55e-5] 



Epoch 3 training loss: 0.0209
Validation Accuracy: 0.9678
Epoch 3 took 903.10 seconds



In [20]:
student_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
student_model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [21]:
def distill_student(
    teacher_model, student_model,
    train_loader, val_loader,
    optimizer, temperature=2.0, alpha=0.5, epochs=3
):
    loss_ce = nn.CrossEntropyLoss()  # For hard labels

    for epoch in range(epochs):
        student_model.train()
        teacher_model.eval()
        total_loss = 0
        start_time = time.time()

        loop = tqdm(train_loader, desc=f"Distill Epoch {epoch+1}/{epochs}")

        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Teacher predictions
            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
                teacher_logits = teacher_outputs.logits / temperature

            # Student predictions
            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
            student_logits = student_outputs.logits / temperature

            # Distillation loss (KL + CE)
            soft_loss = F.kl_div(
                input=F.log_softmax(student_logits, dim=-1),
                target=F.softmax(teacher_logits, dim=-1),
                reduction="batchmean"
            ) * (temperature ** 2)

            hard_loss = loss_ce(student_outputs.logits, labels)
            loss = alpha * soft_loss + (1 - alpha) * hard_loss
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_postfix(loss=loss.item())

        avg_train_loss = total_loss / len(train_loader)
        end_time = time.time()
        print(f"\nEpoch {epoch+1} distillation loss: {avg_train_loss:.4f}")
        print(f"Epoch time: {end_time - start_time:.2f}s")

        # Validation
        student_model.eval()
        val_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                val_loss += loss_ce(logits, labels).item()

                predictions = torch.argmax(logits, dim=-1)
                correct += (predictions == labels).sum().item()
                total += labels.size(0)

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct / total
        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}\n")



In [None]:

student_optimizer = AdamW(student_model.parameters(), lr=2e-5)

distill_student(
    teacher_model=teacher_model,
    student_model=student_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=student_optimizer,
    temperature=2.0,
    alpha=0.5,
    epochs=3
)


Distill Epoch 1/3: 100%|██████████| 6279/6279 [12:02<00:00,  8.69it/s, loss=0.00601] 



Epoch 1 distillation loss: 0.1704
Epoch time: 722.66s
Validation Loss: 0.0973, Accuracy: 0.9672



Distill Epoch 2/3: 100%|██████████| 6279/6279 [12:04<00:00,  8.67it/s, loss=0.114]   



Epoch 2 distillation loss: 0.0897
Epoch time: 724.30s
Validation Loss: 0.0998, Accuracy: 0.9702



Distill Epoch 3/3: 100%|██████████| 6279/6279 [11:50<00:00,  8.84it/s, loss=0.00365] 



Epoch 3 distillation loss: 0.0443
Epoch time: 710.27s
Validation Loss: 0.1300, Accuracy: 0.9686

