# ByT5 Model

In [26]:
# imports
from transformers import TrainingArguments, ByT5Tokenizer
from sklearn.metrics import roc_curve, auc
from utils import ByT5_model, ByT5_utils
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import os

In [27]:
# Read key/value pairs from a .env file into the process environment so the
# lookups below can see them.
load_dotenv()

# CONSTANTS
SAVE = True       # gates both the "saving model" and the "load model" cells
VERSION = "v3"    # suffix embedded in the saved model / tokenizer names
MAX_LENGTH = 100  # forwarded as max_length to the training and single-URL helpers

# Output locations are taken from the environment; each is None when unset.
RESULTS_PATH = os.environ.get("RESULTS_PATH")
MODEL_PATH = os.environ.get("MODEL_PATH")

In [None]:
# load data
# Replace the files below with our own data. Every CSV is expected to have
# two columns:
#   url   - the whole URL
#   label - 0 for non-phishing, 1 for phishing
train_data = pd.read_csv("datasets/train.csv")
val_data = pd.read_csv("datasets/validation.csv")
bench_data = pd.read_csv("datasets/benchmark.csv")

# Report how many rows each split contains.
for _split_name, _split_frame in (
    ("training", train_data),
    ("validation", val_data),
    ("benchmark", bench_data),
):
    print(f"{len(_split_frame)} {_split_name} samples")

In [None]:
# Class balance of each split (label 0 = benign / non-phishing, 1 = phishing).
# Series.get(label, 0) is used instead of indexing with [label] so that a
# split containing only one class reports 0 rather than raising KeyError.

# training data
train_count = train_data["label"].value_counts()
val_count = val_data["label"].value_counts()
phish_count = train_count.get(1, 0) + val_count.get(1, 0)
benign_count = train_count.get(0, 0) + val_count.get(0, 0)
print(f"Training and validation data value count:\nphishing: {phish_count}\nbenign: {benign_count}")

# benchmark data (phish_count / benign_count are re-bound to the benchmark totals)
bench_count = bench_data["label"].value_counts()
phish_count = bench_count.get(1, 0)
benign_count = bench_count.get(0, 0)
print(f"Benchmark data value count:\nphishing: {phish_count}\nbenign: {benign_count}")

In [23]:
# initialise model and tokenizer from the same pretrained checkpoint
_PRETRAINED_NAME = 'google/byt5-small'

# Two output labels: 0 (non-phishing) and 1 (phishing).
model = ByT5_model.ByT5ForClassification(model_name=_PRETRAINED_NAME, num_labels=2)
tokenizer = ByT5Tokenizer.from_pretrained(_PRETRAINED_NAME)

In [24]:
# hyperparameters
BATCH_SIZE = 128

# Where checkpoints are written and how many are kept.
# NOTE(review): save_steps is documented as applying to save_strategy="steps";
# with "epoch" it is presumably ignored - confirm before relying on it.
_checkpoint_kwargs = {
    "output_dir": RESULTS_PATH,
    "save_safetensors": False,
    "save_strategy": "epoch",
    "save_steps": 1,
    "save_total_limit": 3,
}

# Evaluate every epoch and reload the checkpoint with the lowest eval_loss
# once training finishes (lower is better, hence greater_is_better=False).
# NOTE(review): eval_steps is documented as applying to eval_strategy="steps";
# with "epoch" it is presumably ignored - confirm.
_evaluation_kwargs = {
    "eval_strategy": "epoch",
    "eval_steps": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
}

# Optimisation settings; the same batch size is used for training and eval.
_optimisation_kwargs = {
    "learning_rate": 0.005,
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    "num_train_epochs": 100,
    "weight_decay": 0.01,
}

training_args = TrainingArguments(
    **_checkpoint_kwargs,
    **_evaluation_kwargs,
    **_optimisation_kwargs,
)

In [None]:
# training loop
# Hands the model, tokenizer, both data splits and the TrainingArguments to
# the project helper and keeps the object it returns, which the evaluation
# cell passes on to evaluate_ByT5.
# NOTE(review): `patience` presumably configures early stopping inside
# train_ByT5 - confirm against the ByT5_utils module.
trainer = ByT5_utils.train_ByT5(
    model=model,
    tokenizer=tokenizer,
    train_data=train_data,
    val_data=val_data,
    training_args=training_args,
    patience=3,
    max_length=MAX_LENGTH,
    )

In [None]:
# evaluate trained model
# Passes the object returned by train_ByT5 to the project's evaluation helper.
# NOTE(review): which split and metrics are reported is decided inside
# evaluate_ByT5 and is not visible from this notebook.
ByT5_utils.evaluate_ByT5(trainer)

In [27]:
# saving model
# Writes the model weights (state dict) and the tokenizer files under
# MODEL_PATH, both tagged with VERSION. Skipped entirely when SAVE is False.
if SAVE:
    # An unset MODEL_PATH would silently turn the target into "None/...";
    # fail early with a message that names the actual problem.
    if not MODEL_PATH:
        raise ValueError(
            "MODEL_PATH is not set; define it in the environment or the .env file"
        )
    # torch.save does not create missing parent directories.
    os.makedirs(MODEL_PATH, exist_ok=True)

    torch.save(
        model.state_dict(),
        f"{MODEL_PATH}/byt5-model-{VERSION}.pth"
    )
    tokenizer.save_pretrained(f"{MODEL_PATH}/byt5-tokenizer-{VERSION}")

In [30]:
# load model
# Rebuilds the classifier and restores the weights and tokenizer written by
# the "saving model" cell. Only runs when SAVE is True, i.e. when that cell
# actually produced the files.
if SAVE:
    # Use the same constructor arguments as the training cell so the
    # architecture is guaranteed to match the saved state dict, independent
    # of whatever defaults ByT5ForClassification declares.
    model = ByT5_model.ByT5ForClassification(
        model_name='google/byt5-small',
        num_labels=2,
    )
    # map_location="cpu" lets a checkpoint saved from a GPU run load on a
    # CPU-only machine; load_state_dict copies the values into the model's
    # existing parameters, so the model's own device is unaffected.
    _state_dict = torch.load(
        f"{MODEL_PATH}/byt5-model-{VERSION}.pth",
        map_location="cpu",
        weights_only=True,
    )
    model.load_state_dict(_state_dict)
    # The reloaded model is only used for prediction below; switch off
    # training-only behaviour such as dropout.
    # NOTE(review): assumes ByT5ForClassification is a torch.nn.Module (it
    # exposes state_dict/load_state_dict) - confirm.
    model.eval()
    tokenizer = ByT5Tokenizer.from_pretrained(f"{MODEL_PATH}/byt5-tokenizer-{VERSION}")

In [None]:
# predict single url
# Quick check of the classifier on the benchmark row labelled 0.
_sample_url = bench_data["url"][0]
ByT5_utils.predict_single_url(_sample_url, model, tokenizer, max_length=MAX_LENGTH)

In [None]:
# predict dataframe
# Run the classifier over the whole benchmark set; the helper's three return
# values are unpacked into the names used by the metric cells below.
# NOTE(review): unlike the single-URL cell, no max_length is passed here, so
# the helper's own default applies - confirm it matches MAX_LENGTH.
_benchmark_predictions = ByT5_utils.predict_dataframe(bench_data, model, tokenizer)
y_true, predicted_classes, y_proba = _benchmark_predictions

In [None]:
# accuracy at each decision threshold returned by the helper
accuracies = ByT5_utils.calculate_accuracy_at_thresholds(y_true, y_proba)
for _cutoff, _score in accuracies:
    print(f"Threshold: {_cutoff:.2f}, Accuracy: {_score:.2f}")

In [None]:
# statistics
# The helper is given NumPy copies of the benchmark labels and scores.
_labels_array = np.array(y_true)
_scores_array = np.array(y_proba)
ByT5_utils.fpr_comparison(_labels_array, _scores_array)

In [None]:
# ROC analysis of the benchmark predictions
fpr, tpr, thresholds = roc_curve(y_true, y_proba)
roc_auc = auc(fpr, tpr)

# Draw the model's curve together with the diagonal chance line.
_fig, _ax = plt.subplots()
_ax.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:0.2f})")
_ax.plot([0, 1], [0, 1], 'k--', label='No Skill')
_ax.set_xlim([0.0, 1.0])
_ax.set_ylim([0.0, 1.0])
_ax.set_xlabel('False Positive Rate')
_ax.set_ylabel('True Positive Rate')
_ax.set_title('ROC Curve')
_ax.legend()
plt.show()