In [1]:
! pip install datasets transformers huggingface_hub

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [2]:
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertForSequenceClassification, DistilBertConfig

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
data = load_dataset("agraj07/phishing_dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/515 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/130k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/28.3k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/600 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/600 [00:00<?, ? examples/s]

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2800
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 600
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 600
    })
})

In [5]:
device  = torch.device('cuda')

In [7]:
torch.cuda.is_available()

True

In [8]:
model_path = 'agraj07/bert-phising-classifier-teacher'

tokenizer = AutoTokenizer.from_pretrained(model_path)
teacher_model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [9]:
my_config = DistilBertConfig(n_heads=8, n_layers=4)

student_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                                    config=my_config).to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def preprocess(examples):
  return tokenizer(examples['text'], padding = 'max_length', truncation = True)

tokenized_data = data.map(preprocess, batched=True)
tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [12]:
def evaluate_model(model, dataloader, device):
  model.eval()
  all_preds = []
  all_labels = []

  #Disable gradient calculations
  with torch.no_grad():
    for batch in dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      #forward pass to get logits
      outputs = model(input_ids, attention_mask=attention_mask)
      logits = outputs.logits

      #get predictions
      preds = torch.argmax(logits, dim=1).cpu().numpy()
      all_preds.extend(preds)
      all_labels.extend(labels.cpu().numpy())

  accuracy = accuracy_score(all_labels, all_preds)
  precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

  return accuracy, precision, recall, f1

In [14]:
def distillation_loss(student_logits, teacher_logits, true_labels, temperature, alpha):
  #compute soft targets from teacher logits
  soft_targets = nn.functional.softmax(teacher_logits/temperature, dim=1)
  student_soft = nn.functional.log_softmax(student_logits/temperature, dim=1)

  #KL Divergence loss for distillation
  distill_loss = nn.functional.kl_div(student_soft, soft_targets, reduction='batchmean') * (temperature **2)

  #Cross entropy loss for hard labels
  hard_loss = nn.CrossEntropyLoss()(student_logits, true_labels)

  #Combine Loss
  loss = alpha * distill_loss + (1.0 - alpha) * hard_loss
  return loss

In [15]:
batch_size = 32
lr = 1e-4
num_epochs = 5
temperature = 2.0
alpha = 0.5

optimizer = optim.Adam(student_model.parameters(), lr=lr)

#create training dataloader
dataloader = DataLoader(tokenized_data['train'], batch_size=batch_size)
test_dataloader = DataLoader(tokenized_data['test'], batch_size=batch_size)

In [18]:
student_model.train()

for epoch in range(num_epochs):
  for batch in dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
      teacher_outputs = teacher_model(input_ids, attention_mask = attention_mask)
      teacher_logits = teacher_outputs.logits

    student_outputs = student_model(input_ids, attention_mask=attention_mask)
    student_logits = student_outputs.logits

    #compute Distillation loss
    loss = distillation_loss(student_logits, teacher_logits, labels, temperature, alpha)

    #Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print(f"Epoch {epoch+1} complated with loss {loss.item()}")

  #Evaluate teacher model
  teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = evaluate_model(teacher_model, test_dataloader, device)
  print(f"Teacher (test) Accuracy: {teacher_accuracy:.4f}, Precision : {teacher_precision:.4f}, Recall : {teacher_recall:.4f}, F1 : {teacher_f1:.4f}")

  #Evaluate student model
  student_accuracy, student_precision, student_recall, student_f1 = evaluate_model(student_model, test_dataloader, device)
  print(f"Student (test) Accuracy: {student_accuracy:.4f}, Precision : {student_precision:.4f}, Recall : {student_recall:.4f}, F1 : {student_f1:.4f}")
  print("\n")

  student_model.train()

Epoch 1 complated with loss 0.11816595494747162
Teacher (test) Accuracy: 0.8717, Precision : 0.8878, Recall : 0.8557, F1 : 0.8715
Student (test) Accuracy: 0.8733, Precision : 0.8189, Recall : 0.9639, F1 : 0.8855


Epoch 2 complated with loss 0.08992654085159302
Teacher (test) Accuracy: 0.8717, Precision : 0.8878, Recall : 0.8557, F1 : 0.8715
Student (test) Accuracy: 0.9050, Precision : 0.9493, Recall : 0.8590, F1 : 0.9019


Epoch 3 complated with loss 0.07638518512248993
Teacher (test) Accuracy: 0.8717, Precision : 0.8878, Recall : 0.8557, F1 : 0.8715
Student (test) Accuracy: 0.9200, Precision : 0.9241, Recall : 0.9180, F1 : 0.9211


Epoch 4 complated with loss 0.09797876328229904
Teacher (test) Accuracy: 0.8717, Precision : 0.8878, Recall : 0.8557, F1 : 0.8715
Student (test) Accuracy: 0.9133, Precision : 0.8776, Recall : 0.9639, F1 : 0.9187


Epoch 5 complated with loss 0.07620573043823242
Teacher (test) Accuracy: 0.8717, Precision : 0.8878, Recall : 0.8557, F1 : 0.8715
Student (test)

In [19]:
# Evaluate model on validation data

validation_dataloader = DataLoader(tokenized_data['validation'], batch_size=8)

teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = evaluate_model(teacher_model, validation_dataloader, device)
print(f"Teacher - (validation) Accuracy: {teacher_accuracy:.4f}, Precision : {teacher_precision:.4f}, Recall : {teacher_recall:.4f}, F1 : {teacher_f1:.4f}")

student_accuracy, student_precision, student_recall, student_f1 = evaluate_model(student_model, validation_dataloader, device)
print(f"Student - (validation) Accuracy: {student_accuracy:.4f}, Precision : {student_precision:.4f}, Recall : {student_recall:.4f}, F1 : {student_f1:.4f}")

Teacher - (validation) Accuracy: 0.8967, Precision : 0.9196, Recall : 0.8709, F1 : 0.8946
Student - (validation) Accuracy: 0.9283, Precision : 0.9512, Recall : 0.9040, F1 : 0.9270


In [20]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
student_model.push_to_hub("agraj07/bert-phising-classifier-student")

model.safetensors:   0%|          | 0.00/211M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/agraj07/bert-phising-classifier-student/commit/ef84241a5352d11b60c5e1c9e73f39097493b5d4', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='ef84241a5352d11b60c5e1c9e73f39097493b5d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/agraj07/bert-phising-classifier-student', endpoint='https://huggingface.co', repo_type='model', repo_id='agraj07/bert-phising-classifier-student'), pr_revision=None, pr_num=None)