##Imports


In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertForSequenceClassification, DistilBertConfig
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

##Load Data

In [2]:
data = load_dataset("shawhin/phishing-site-classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
print(data)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})


##Load Teacher


In [4]:
# use Nvidia GPU
device = torch.device('cuda')

# Load teacher model and tokenizer
model_path = "shawhin/bert-phishing-classifier_teacher"
tokenizer = AutoTokenizer.from_pretrained(model_path)
teacher_model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

##Load Student

In [5]:
# Load student model
my_config = DistilBertConfig(n_heads=8, n_layers=4)
student_model = DistilBertForSequenceClassification \
    .from_pretrained("distilbert-base-uncased", config=my_config) \
    .to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##Tokenize Text

In [6]:
# define text preprocessing
def preprocess_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True)

# tokenize all datasets
tokenized_data = data.map(preprocess_function, batched=True)
tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

## Evaluation Function

In [7]:
# Function to evaluate model performance
def evaluate_model(model, dataloader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    # Disable gradient calculations during evaluation
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forwrd pass to get logits
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Get predictions
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate evaluation metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

    return accuracy, precision, recall, f1

##Custom Loss Function

In [8]:
# Function to compute distillation and hard-label loss
def distillation_loss(student_logits, teacher_logits, true_labels, temperature, alpha):
    # Compute soft targets from teacher logits
    soft_targets = nn.functional.softmax(teacher_logits / temperature, dim=1)
    student_soft = nn.functional.log_softmax(student_logits / temperature, dim=1)

    # KL Divergence loss for distillation
    distill_loss = nn.functional.kl_div(student_soft,
                                        soft_targets,
                                        reduction='batchmean') * (temperature ** 2)

    # Cross-entropy loss for hard labels
    hard_loss = nn.CrossEntropyLoss()(student_logits, true_labels)

    # Combine losses
    loss = alpha * distill_loss + (1.0 - alpha) * hard_loss

    return loss

##Hyperparameters

In [9]:
# hyperparameters
batch_size = 32
lr = 1e-4 #5e-5
num_epochs = 5
temperature = 2.0
alpha = 0.5

# define optimizer
optimizer = optim.Adam(student_model.parameters(), lr=lr)

# create training data loader
dataloader = DataLoader(tokenized_data['train'], batch_size=batch_size)
# create testing data loader
test_dataloader = DataLoader(tokenized_data['test'], batch_size=batch_size)

##Train Model

In [10]:
# put student model in train mode
student_model.train()

# train model
for epoch in range(num_epochs):
    for batch in dataloader:
        # Prepare inputs
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Disable gradient calculation for teacher model
        with torch.no_grad():
            teacher_outputs = teacher_model(input_ids=input_ids,
                                            attention_mask=attention_mask)
            teacher_logits = teacher_outputs.logits

        # Forward pass through the student model
        student_outputs = student_model(input_ids=input_ids,
                                        attention_mask=attention_mask)
        student_logits = student_outputs.logits

        # Compute the distillation loss
        loss = distillation_loss(student_logits, teacher_logits, labels,
                                 temperature, alpha)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")

    # Evaluate the teacher model
    teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = \
        evaluate_model(teacher_model, test_dataloader, device)

    print(f"Teacher (test) - Accuracy: {teacher_accuracy:.4f},\n"
          f"               Precision: {teacher_precision:.4f},\n"
          f"               Recall: {teacher_recall:.4f},\n"
          f"               F1 Score: {teacher_f1:.4f}")

    # Evaluate the student model
    student_accuracy, student_precision, student_recall, student_f1 = \
        evaluate_model(student_model, test_dataloader, device)

    print(f"Student (test) - Accuracy: {student_accuracy:.4f},\n"
          f"               Precision: {student_precision:.4f},\n"
          f"               Recall: {student_recall:.4f},\n"
          f"               F1 Score: {student_f1:.4f}")

    print("\n")

    # put student model back into train mode
    student_model.train()

Epoch 1 completed with loss: 0.15253493189811707
Teacher (test) - Accuracy: 0.8644,
               Precision: 0.8925,
               Recall: 0.8341,
               F1 Score: 0.8623
Student (test) - Accuracy: 0.9000,
               Precision: 0.8802,
               Recall: 0.9301,
               F1 Score: 0.9045


Epoch 2 completed with loss: 0.08104458451271057
Teacher (test) - Accuracy: 0.8644,
               Precision: 0.8925,
               Recall: 0.8341,
               F1 Score: 0.8623
Student (test) - Accuracy: 0.8978,
               Precision: 0.9463,
               Recall: 0.8472,
               F1 Score: 0.8940


Epoch 3 completed with loss: 0.05891669541597366
Teacher (test) - Accuracy: 0.8644,
               Precision: 0.8925,
               Recall: 0.8341,
               F1 Score: 0.8623
Student (test) - Accuracy: 0.9022,
               Precision: 0.8807,
               Recall: 0.9345,
               F1 Score: 0.9068


Epoch 4 completed with loss: 0.064979188144207
Teacher 

##Validation Set Evaluation

In [12]:
# create testing data loader
validation_dataloader = DataLoader(tokenized_data['validation'], batch_size=8)

# Evaluate the teacher model
teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = \
    evaluate_model(teacher_model, validation_dataloader, device)
print(f"Teacher (validation) - Accuracy: {teacher_accuracy:.4f},\n"
      f"                     Precision: {teacher_precision:.4f},\n"
      f"                     Recall: {teacher_recall:.4f},\n"
      f"                     F1 Score: {teacher_f1:.4f}")

# Evaluate the student model
student_accuracy, student_precision, student_recall, student_f1 = \
    evaluate_model(student_model, validation_dataloader, device)
print(f"Student (validation) - Accuracy: {student_accuracy:.4f},\n"
      f"                     Precision: {student_precision:.4f},\n"
      f"                     Recall: {student_recall:.4f},\n"
      f"                     F1 Score: {student_f1:.4f}")

Teacher (validation) - Accuracy: 0.8933,
                     Precision: 0.9155,
                     Recall: 0.8667,
                     F1 Score: 0.8904
Student (validation) - Accuracy: 0.9311,
                     Precision: 0.9755,
                     Recall: 0.8844,
                     F1 Score: 0.9277


##Load in as 4-bit

In [13]:
from transformers import BitsAndBytesConfig

# load model in model as 4-bit
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                          bnb_4bit_compute_dtype=torch.bfloat16)

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)


model_id = "shawhin/bert-phishing-classifier_student"
model_nf4 = AutoModelForSequenceClassification.from_pretrained(model_id,
                                                               device_map=device,
                                                               quantization_config=nf4_config)

model.safetensors:   0%|          | 0.00/211M [00:00<?, ?B/s]

##Evaluate Quantized Model

In [14]:
# Evaluate the student model
quantized_accuracy, quantized_precision, quantized_recall, quantized_f1 = \
    evaluate_model(model_nf4, validation_dataloader, device)

print("Post-quantization Performance")
print(f"Accuracy: {quantized_accuracy:.4f},\n"
      f"Precision: {quantized_precision:.4f},\n"
      f"Recall: {quantized_recall:.4f},\n"
      f"F1 Score: {quantized_f1:.4f}")

Post-quantization Performance
Accuracy: 0.9356,
Precision: 0.9757,
Recall: 0.8933,
F1 Score: 0.9327
