Fine-tuning BERT for text classification: classifying phishing URLs


In [None]:
!pip install datasets
!pip install evaluate
!pip install transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
#lib
import numpy as np

import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [None]:
#load dataset
#hub identifier of the phishing-site classification dataset
dataset_id = "shawhin/phishing-site-classification"
dataset = load_dataset(dataset_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/98.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/450 [00:00<?, ? examples/s]

In [None]:
#display the loaded splits with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})

In [None]:
#load bert pretrained model
model_name = "google-bert/bert-base-uncased"
#fix the RNG seed before the model is built: the classification head is
#randomly initialised (it is not part of the checkpoint), so without a seed
#every fresh run would start from different head weights
set_seed(42)
#load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
#load model with binary classification head
id2label = {0: "Safe", 1: "Not Safe"}
#derive the inverse mapping from id2label so the two dicts cannot drift apart
label2id = {label: idx for idx, label in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#freeze every base-model parameter except the pooler, in a single pass.
#only model.base_model is visited here, so parameters that live outside it
#keep whatever requires_grad value they already had
for name, param in model.base_model.named_parameters():
  param.requires_grad = "pooler" in name

In [None]:
#preprocess data
def preprocess_data(examples):
  """Tokenize the ``text`` field of a batch of examples.

  Args:
    examples: mapping with a ``'text'`` entry holding the raw inputs.

  Returns:
    Whatever the module-level ``tokenizer`` returns for those inputs when
    called with ``truncation=True``.
  """
  texts = examples['text']
  encoded = tokenizer(texts, truncation=True)
  return encoded

#tokenize every split; batched=True hands preprocess_data a batch at a time
tokenized_dataset = dataset.map(
    preprocess_data,
    batched=True,
)

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [None]:
#datacollator
#collator that pads the examples of a batch using the model's tokenizer
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
#metrics
#load both evaluation metrics in one go (order matches the unpacking targets)
metric_names = ("accuracy", "roc_auc")
accuracy, auc_score = map(evaluate.load, metric_names)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
  """Compute accuracy and ROC AUC from raw classifier logits.

  Args:
    eval_pred: pair ``(predictions, labels)``; ``predictions`` holds the
      per-class logits indexed as ``[example, class]`` and ``labels`` the
      reference class ids.

  Returns:
    dict with the keys ``"Accuracy"`` and ``"AUC"``, each rounded to three
    decimals.
  """
  #get predictions
  predictions, labels = eval_pred
  predictions = np.asarray(predictions)

  #apply softmax to get probabilities. subtracting the row-wise max does not
  #change the result but keeps np.exp from overflowing to inf on large
  #logits (inf / inf would turn the probabilities into nan)
  shifted = predictions - predictions.max(axis=-1, keepdims=True)
  exp_scores = np.exp(shifted)
  probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)
  #use probabilities of the positive class for ROC AUC
  positive_class_probs = probabilities[:, 1]
  #compute auc
  auc = np.round(auc_score.compute(prediction_scores=positive_class_probs,
                                    references=labels)['roc_auc'],3)

  #predict most probable class
  predicted_classes = np.argmax(predictions, axis=1)
  #compute accuracy
  acc = np.round(accuracy.compute(predictions=predicted_classes,
                                    references=labels)['accuracy'],3)

  return {"Accuracy": acc, "AUC": auc}

In [None]:
#hyperparameters
lr, batch_size, num_epochs = 2e-4, 8, 10

#log, evaluate and checkpoint on the same per-epoch schedule
per_epoch_schedule = {
    "logging_strategy": "epoch",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
}

#NOTE(review): no `report_to` is given, and the training run below stops at
#an interactive W&B API-key prompt -- consider setting it explicitly if
#unattended runs are wanted
training_args = TrainingArguments(
    output_dir="bert-phishing-classifier_teacher",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    **per_epoch_schedule,
)

In [None]:
#trainer class
#NOTE(review): the "test" split drives the per-epoch evaluation here while
#the "validation" split is kept for the final score -- the usual roles are
#swapped; confirm this is intended before changing either side
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #`tokenizer=` is deprecated in Trainer.__init__ and emits a
    #FutureWarning; `processing_class` is its replacement
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.5049,0.382729,0.816,0.913
2,0.4106,0.338188,0.836,0.932
3,0.3558,0.313868,0.858,0.939
4,0.3564,0.358055,0.842,0.945
5,0.3504,0.335082,0.867,0.948
6,0.3479,0.290868,0.869,0.95
7,0.3344,0.289739,0.878,0.95
8,0.3115,0.289415,0.869,0.95
9,0.3129,0.285038,0.871,0.951
10,0.3138,0.289961,0.867,0.951


TrainOutput(global_step=2630, training_loss=0.3598565772459081, metrics={'train_runtime': 544.3862, 'train_samples_per_second': 38.576, 'train_steps_per_second': 4.831, 'total_flos': 706603239165360.0, 'train_loss': 0.3598565772459081, 'epoch': 10.0})

In [None]:
# run the trained model on the validation split
predictions = trainer.predict(tokenized_dataset["validation"])

# pull the raw logits and the reference labels out of the prediction output
logits, labels = predictions.predictions, predictions.label_ids

# score them with the same compute_metrics function used during training
metrics = compute_metrics((logits, labels))
print(metrics)

{'Accuracy': 0.889, 'AUC': 0.945}
