# Fine-Tune ModernBERT
<img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/0*g22xJSM7v6BvrD9F.png" style="display: block; margin-left: auto; margin-right: auto;" width="200"/>

[Publication](https://arxiv.org/pdf/2412.13663)

## What is ModernBERT?

- Latest encoder-only model family
  - Trained with 2T tokens
  - 8192 sequence length
- Better, faster, smarter
  - Major Pareto improvement


## What are the BERT family models?
- Encoder only
- Representing text
- Classifying text
- Meaningful embeddings

## Why should you care?
- Specialized semantic representation for security terms
- Classification of malicious vs benign

In [None]:
# Install PyTorch (pinned versions) & other libraries
%pip install "torch==2.5.0" "torchvision==0.20.0"
%pip install "setuptools<71.0.0" scikit-learn
%pip install python-dotenv

# Install Hugging Face libraries
%pip install  --upgrade \
  "datasets==3.1.0" \
  "accelerate==1.2.1" \
  "hf-transfer==0.1.8"

# ModernBERT is not yet available in an official release, so we need to install it from github
# (transformers is pinned to a specific commit below).
# NOTE(review): this rationale may be out of date — confirm whether a tagged
# transformers release now ships ModernBERT and pin that instead.
%pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade

In [None]:
import os

from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.iterable_dataset import IterableDataset

In [None]:
# Read the Hugging Face access token stored under the name 'HUGGING_FACE_KEY'
# through google.colab's userdata helper (this cell only works inside Colab).
from google.colab import userdata
hugging_face_key = userdata.get('HUGGING_FACE_KEY')

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token read earlier;
# add_to_git_credential=True additionally hands the token to git's credential store.
login(token=hugging_face_key, add_to_git_credential=True)

In [None]:
# Fail fast when no CUDA GPU is visible.
# The probe goes through PyTorch (the framework this notebook trains with)
# rather than TensorFlow: importing TensorFlow only for this check initialises
# a second GPU runtime that competes with PyTorch for device memory.
import torch

if not torch.cuda.is_available():
  raise SystemError('GPU device not found')
# Human-readable name of the first visible GPU, e.g. the card model
device_name = torch.cuda.get_device_name(0)
print('Found GPU at: {}'.format(device_name))

## Datasets

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Hugging Face dataset ids that can be selected for fine-tuning
list_dataset_ids = [
    'ealvaradob/phishing-dataset',
    'Anvilogic/URL-Guardian-Dataset',
    'tegridydev/open-malsec',
]

# Create the dropdown widget
dropdown = widgets.Dropdown(
    options=list_dataset_ids,
    description='Select a dataset:',
    disabled=False,
    # Let the label use its natural width; the default fixed width would
    # truncate a description of this length.
    style={'description_width': 'initial'},
)

# Display the dropdown
display(dropdown)

In [None]:
# Capture the dataset id currently selected in the dropdown widget.
# Re-run this cell after changing the selection, otherwise the old value is kept.
dataset_id = dropdown.value

In [None]:
# Load the "train" split of the selected raw dataset.
# NOTE(security): trust_remote_code=True allows the dataset repository to run
# its own loading code on this machine — only use it with dataset ids you trust.
train_dataset = load_dataset(dataset_id, split="train", trust_remote_code=True)

# Hold out 10% of the rows for evaluation. The seed is fixed so the split is
# the same on every run: the tokenized data is saved to disk and reloaded, and
# later cells index split_dataset["test"] directly, so an unseeded shuffle
# would make those views disagree between sessions.
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
split_dataset["train"][0]

In [None]:
# Print a preview of the training rows (at most 1000).
# The upper bound is capped at the split size so that datasets with fewer
# than 1000 training rows do not trigger an out-of-range error in select().
preview_rows = min(1000, split_dataset["train"].num_rows)
for item in split_dataset["train"].select(range(preview_rows)):
  print(item)

In [None]:
# Shrink both splits to their first half
import datasets

reduced_dataset = datasets.DatasetDict()
for split_name in ('train', 'test'):
    full_split = split_dataset[split_name]
    half_size = full_split.num_rows // 2
    reduced_dataset[split_name] = full_split.select(range(half_size))

## Tokenize
<img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/1*mkvzSPMiX5FZcuQjFe2B6w.png" style="display: block; margin-left: auto; margin-right: auto;" width="500"/>

(image from [The Art of Tokenization: Breaking Down Text for AI](https://medium.com/data-science/the-art-of-tokenization-breaking-down-text-for-ai-43c7bccaed25))

In [None]:
# Model id to load the tokenizer
# (the same id is passed to AutoModelForSequenceClassification.from_pretrained further down)
model_id = "answerdotai/ModernBERT-base"

In [None]:
%%time
from transformers import AutoTokenizer

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)


# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256, return_tensors="pt")


# Tokenize dataset
# Note: here we can use reduced_dataset OR split_dataset
if "label" in reduced_dataset["train"].features.keys():
    reduced_dataset = reduced_dataset.rename_column("label", "labels")  # to match Trainer
tokenized_dataset = reduced_dataset.map(tokenize, batched=True, remove_columns=["text"])

tokenized_dataset["train"].features.keys()

In [None]:
import datasets

# Save the tokenized dataset to disk
# (written under the local path 'tokenized_dataset_reduced')
tokenized_dataset.save_to_disk('tokenized_dataset_reduced')

In [None]:
# Reload the tokenized dataset from disk. The path must match the one passed
# to save_to_disk() in the cell above ('tokenized_dataset_reduced').
tokenized_dataset = datasets.load_from_disk('tokenized_dataset_reduced')

## Process labels

In [None]:
# Inspect which columns the tokenized dataset contains
tokenized_dataset.column_names

In [None]:
%%time
from transformers import AutoModelForSequenceClassification

# Prepare model labels - useful for inference
# Distinct values found in the "labels" column of the training split
labels = tokenized_dataset["train"].unique("labels")
# Number of distinct label values; passed as num_labels when the model is loaded
num_labels = len(labels)

In [None]:
%%time
label2id, id2label = dict(), dict()
# for i, label in enumerate(labels):
#     label2id[label] = str(i)
#     id2label[str(i)] = label

label2id['benign'] = 0
label2id['malicious'] = 1
id2label[0] = 'benign'
id2label[1] = 'malicious'

# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

## Evaluate

In [None]:
import numpy as np
from sklearn.metrics import f1_score


# Metric helper method
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per example (the class is taken with an
            argmax over axis 1); ``labels`` holds the true class ids.

    Returns:
        ``{"f1": <float>}`` — the support-weighted F1 score as a Python float.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    # Do not forward the true-label array as f1_score's `labels` argument:
    # that parameter is the *set of classes* to average over, and passing the
    # full y_true array (with duplicates) distorts the weighted average.
    # `pos_label` is omitted because it only applies to average="binary".
    score = f1_score(labels, predicted_ids, average="weighted")
    return {"f1": float(score)}

## Fine Tuning

Fine-tuning means adjusting the weights of a pre-trained model on a new dataset for better performance and specialization in a specific task.

<img src="https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F52844fcc-6094-4fdb-ba8c-52737ab9c821_1640x402.gif" style="display: block; margin-left: auto; margin-right: auto;" width="800"/>

(image from [Daily Dose of Data Science](https://blog.dailydoseofds.com/p/full-model-fine-tuning-vs-lora-vs))

In [None]:
%%time
# TODO: figure out why loss nan and no improvement in F1
import torch
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments

# Define training args
training_args = TrainingArguments(
    output_dir="./ModernBERT-domain-classifier",
    per_device_train_batch_size=8,  # Reduced to 8
    per_device_eval_batch_size=4,  # Reduced to 4
    learning_rate=5e-5,
    num_train_epochs=3,
    fp16=True,
    optim="adamw_torch_fused",
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_token=hugging_face_key,
    gradient_checkpointing=True,
    gradient_accumulation_steps=1, # Reduced to 1
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

torch.cuda.empty_cache()
trainer.train()

## Test

In [None]:
# Save the model
# The model and the tokenizer are written to the same local directory
# ("./fine_tuned_model").
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [None]:
# Show the raw text of one held-out example (row 42 of the test split)
split_dataset["test"][42]['text']

In [None]:
%%time
from transformers import pipeline

# Build a text-classification pipeline around the in-memory `model` and
# `tokenizer` objects (no repository id is passed here); device=0 selects the
# first GPU.
classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0,
)

# Same held-out example that was displayed in the previous cell
sample = split_dataset["test"][42]['text']

# Get the model's configuration
config = classifier.model.config

# Check if the key 0 exists in id2label, if not, add it with a default label
# NOTE(review): id2label[0] is set when the model is created above, so this
# guard looks like a leftover safety net — confirm whether it is still needed.
if 0 not in config.id2label:
    # Choose a suitable default label
    default_label = "unknown"
    config.id2label[0] = default_label
    config.label2id[default_label] = 0

classifier(sample)
# Output has the form [{'label': <name from id2label>, 'score': <float>}],
# e.g. [{'label': 'malicious', 'score': 0.98}] (illustrative values)