## Part 2:

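## Now we are going to tokenize the URLs and fine-tune a pretrained GPT-2 (Generative Pre-trained Transformer) model to classify them (the base weights are loaded from the `gpt2` checkpoint rather than trained from scratch).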

In [None]:
import json
import os
import pickle
import re
from urllib.parse import urlparse

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from torch.utils.data import DataLoader, Dataset
from transformers import DataCollatorWithPadding, GPT2ForSequenceClassification, GPT2Tokenizer, Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

Also, thanks to the great documentation here: https://huggingface.co/docs/transformers/model_doc/gpt2 

Note: we chose to truncate each URL to `max_length=1024` tokens; an alternative for longer URLs would be a sliding-window approach.

In [None]:
# Load the labeled URLs. The file's first line is a header row, so let the
# parser consume it (header=0) and replace it with our own column names instead
# of reading it as data and slicing it off afterwards. Reading the header as a
# data row can leave a column with mixed types on large files, and makes `df` a
# slice of another frame, which can trigger pandas' SettingWithCopyWarning when
# a column is assigned later.
# dtype=str keeps every value a string, matching what the header-as-data read
# produced. Note the resulting index now starts at 0 rather than 1.
df = pd.read_csv(
    './datasets/conglom-labeled.csv',
    header=0,
    names=['URL', 'Classification'],
    dtype=str,
)

In [None]:
# Replace the class names in 'Classification' with integer ids.
# `label_encoder` is kept so the learned classes stay available afterwards.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Classification'])
df['Classification'] = encoded_labels

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


def encode_url(url):
    """Return the token ids for one URL, truncated to at most 1024 tokens."""
    return tokenizer.encode(url, add_special_tokens=True, max_length=1024, truncation=True)


# One list of token ids per row, kept as a Series aligned with `df`.
tokens = df['URL'].apply(encode_url)

In [None]:
# Cache the tokenized URLs on disk so tokenization does not have to be repeated.
tokens_pickle_path = './models-checkpoints/tokenizedURLs.pkl'
with open(tokens_pickle_path, mode='wb') as pickle_out:
    pickle.dump(tokens, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the tokenized URLs from the pickle cache (this always runs; it lets a
# restarted kernel skip the tokenization cell).
# NOTE(review): pickle.load can execute arbitrary code -- only load a pickle
# file that you created yourself.
with open('./models-checkpoints/tokenizedURLs.pkl', 'rb') as f:
    tokens = pickle.load(f)

tokens_list = tokens.tolist()
labels_list = df['Classification'].tolist()

# The dataset pairs tokens and labels by position, so a stale pickle (written
# from a different version of the CSV) would silently misalign them. Fail early.
if len(tokens_list) != len(labels_list):
    raise ValueError(
        f"Tokenized URL count ({len(tokens_list)}) does not match label count "
        f"({len(labels_list)}); regenerate the tokenized pickle from the current data."
    )

class URLDataset(Dataset):
    """Map-style dataset pairing pre-tokenized URLs with their integer labels.

    Args:
        encodings: indexable collection where ``encodings[i]`` holds the token
            ids of sample ``i``.
        labels: indexable collection where ``labels[i]`` is the integer class
            of sample ``i``; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is taken from the labels only.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``{'input_ids': LongTensor, 'labels': LongTensor}`` for sample ``idx``."""
        token_ids = torch.tensor(self.encodings[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return {'input_ids': token_ids, 'labels': target}

# Training dataset: token-id lists paired with their encoded labels.
dataset = URLDataset(encodings=tokens_list, labels=labels_list)

In [None]:
# Load the pretrained GPT-2 weights with a sequence-classification head on top.
model = GPT2ForSequenceClassification.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the EOS token for padding, and tell the model which id is padding so
# the collator and the model agree on it.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id #pad token

training_args = TrainingArguments(              # training args
    output_dir='./models-checkpoints',          # output directory
    num_train_epochs=3,                         # total number of training epochs
    per_device_train_batch_size=8,              # batch size per device during training
    warmup_steps=500,                           # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                          # strength of weight decay
    logging_dir='./logs',                       # directory for storing logs
)

# Pads each batch to its longest sequence (dataset items have variable length).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Resume from the newest checkpoint in output_dir when one exists; otherwise
# start a fresh run. Passing resume_from_checkpoint=True unconditionally raises
# a ValueError on the very first run, when no checkpoint has been saved yet.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)
# note on a M3 mbpro it took approx 14 hours w/ GPU pinned
# TODO: performance and adjust checkpointing so it doesn't take up so much space

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/275085 [00:00<?, ?it/s]

{'loss': 0.0698, 'grad_norm': 0.008444858714938164, 'learning_rate': 5e-05, 'epoch': 0.01}
{'loss': 0.0036, 'grad_norm': 5.6408843994140625, 'learning_rate': 4.9908953511663056e-05, 'epoch': 0.01}
{'loss': 0.0063, 'grad_norm': 1.3369055523071438e-05, 'learning_rate': 4.981790702332611e-05, 'epoch': 0.02}
{'loss': 0.0026, 'grad_norm': 6.454928552557249e-06, 'learning_rate': 4.9726860534989164e-05, 'epoch': 0.02}
{'loss': 0.0035, 'grad_norm': 4.169051317148842e-05, 'learning_rate': 4.9635814046652225e-05, 'epoch': 0.03}
{'loss': 0.0104, 'grad_norm': 5.472015982377343e-05, 'learning_rate': 4.954476755831528e-05, 'epoch': 0.03}
{'loss': 0.0056, 'grad_norm': 0.001269070664420724, 'learning_rate': 4.945372106997833e-05, 'epoch': 0.04}
{'loss': 0.0045, 'grad_norm': 1.091237209038809e-05, 'learning_rate': 4.936267458164139e-05, 'epoch': 0.04}
{'loss': 0.0088, 'grad_norm': 0.0031151340808719397, 'learning_rate': 4.927162809330445e-05, 'epoch': 0.05}
{'loss': 0.0055, 'grad_norm': 0.0075260316953

TrainOutput(global_step=275085, training_loss=0.0015560091136957574, metrics={'train_runtime': 51322.9101, 'train_samples_per_second': 42.879, 'train_steps_per_second': 5.36, 'train_loss': 0.0015560091136957574, 'epoch': 3.0})

Follow-up on performance (how long did training take?): progress was 15% at 2 hours 51 minutes, 33% at 5 hours 16 minutes, and 54% at 8 hours.

In total, training took 14 hours 15 minutes 22 seconds.

In [None]:
# Directory of the saved checkpoint to use for inference; adjust as needed.
model_path = "./models-checkpoints/checkpoint-275000"
model = GPT2ForSequenceClassification.from_pretrained(model_path)

In [None]:
# Same tokenizer setup as in training: EOS doubles as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Example URL to classify.
text = "https://kagi.com/search?q=openphish"
inputs = tokenizer(
    text,
    max_length=1024,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Turn the logits into per-class probabilities (each row sums to 1).
predictions = F.softmax(outputs.logits, dim=-1)

In [None]:
# Show the softmax probabilities computed in the previous cell:
# one row per input text, one column per class.
print(predictions)

tensor([[0.5957, 0.4043]])


1. Evaluate model performance on a phishing dataset from openphish.com (accessed 6 April 2024).

In [None]:
# Tokenizer configured as in training: EOS is reused as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Single-column file; every line (including the first) is read as a URL.
df = pd.read_csv('./datasets/open-phish-test.csv', names=['URL'])
urls = df['URL'].to_list()

class URLDataset(Dataset):
    """Tokenize raw URL strings on the fly for inference (no labels).

    NOTE: this redefines the name ``URLDataset`` that the training cell earlier
    in the notebook uses for its pre-tokenized dataset; once this cell has run,
    the name refers to this class.

    Args:
        urls: indexable collection of URL strings.
        tokenizer: tokenizer called on each URL; it must accept the keyword
            arguments used in ``__getitem__`` and return ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension of 1.
        max_length: URLs longer than this many tokens are truncated.
        padding: padding strategy forwarded to the tokenizer. The default
            ``'max_length'`` pads every item to ``max_length`` tokens. Pass
            ``False`` to return unpadded items and let a padding collator
            (e.g. ``DataCollatorWithPadding``) pad each batch to its longest
            member, which avoids running the model over ``max_length``
            positions for every short URL.
    """

    def __init__(self, urls, tokenizer, max_length=1024, padding='max_length'):
        self.urls = urls
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __len__(self):
        return len(self.urls)

    def __getitem__(self, idx):
        """Return ``{'input_ids', 'attention_mask'}`` as 1-D tensors for URL ``idx``."""
        url = self.urls[idx]
        inputs = self.tokenizer(
            url,
            add_special_tokens=True,
            max_length=self.max_length,
            padding=self.padding,
            truncation=True,
            return_tensors="pt"
        )
        # The tokenizer returns shape (1, seq_len); drop the batch dimension so
        # the DataLoader/collator can build the batch itself.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

# Leave items unpadded here: the DataCollatorWithPadding used by the DataLoader
# pads each batch to its longest URL (as in training) instead of padding every
# URL to 1024 tokens.
test_dataset = URLDataset(urls, tokenizer, padding=False) #create the test dataset

In [None]:
# Collate dataset items into padded "pt" tensor batches of 8 URLs each.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
model_path = './models-checkpoints/checkpoint-275000' #load our pre-trained model
model = GPT2ForSequenceClassification.from_pretrained(model_path)
model.eval() #set the model to eval

# Collect the predicted class of every test URL instead of printing one tensor
# per batch, so the result can be summarised (and reused) afterwards.
batch_predictions = []
with torch.no_grad():  # inference only: no gradient tracking needed
    for batch in test_dataloader: #iterate the dataloader
        outputs = model(**batch)
        # argmax over the raw logits: softmax is monotonic, so applying it
        # first would not change which class wins.
        predictions = torch.argmax(outputs.logits, dim=-1)
        batch_predictions.append(predictions)

if batch_predictions:
    all_predictions = torch.cat(batch_predictions)
else:
    # Empty test set: keep the summary below well-defined.
    all_predictions = torch.empty(0, dtype=torch.long)

# Per-class tally of the predictions; class ids are those the model was trained with.
class_counts = torch.bincount(all_predictions, minlength=model.config.num_labels)
total_urls = all_predictions.numel()
print(f"Evaluated {total_urls} URLs")
for class_id, count in enumerate(class_counts.tolist()):
    share = count / total_urls if total_urls else 0.0
    print(f"  predicted class {class_id}: {count} ({share:.2%})")

tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1,