## Part 5:

## Now time to test the gpt2 trained model

In [1]:
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from torch.utils.data import DataLoader, Dataset
from transformers import DataCollatorWithPadding, GPT2ForSequenceClassification, GPT2Tokenizer, Trainer, TrainingArguments
from urllib.parse import urlparse

In [2]:
df = pd.read_csv('./datasets/conglom-labeled.csv', names=['URL', 'Classification'])
df = df.iloc[1:]
df.head()
df['Classification'].value_counts()

Classification
benign        428103
defacement     96457
phishing       94111
ads            42489
tracking       39177
malware        33221
Name: count, dtype: int64

In [3]:
label_encoder = LabelEncoder()
df['Classification'] = label_encoder.fit_transform(df['Classification'])

In [4]:
df['Classification'].value_counts()

Classification
1    428103
2     96457
4     94111
0     42489
5     39177
3     33221
Name: count, dtype: int64

In [5]:
# load tokenizedURLs from the pickle
with open('./models-checkpoints/tokenizedURLs.pkl', 'rb') as f:
    tokens = pickle.load(f)

tokens_list = tokens.tolist()
labels_list = df['Classification'].tolist()

class URLDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.long)}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

dataset = URLDataset(tokens_list, labels_list)

In [6]:
model_path = './models-checkpoints/checkpoint-275000'  # adjust for path of model checkpoint you want to use
model = GPT2ForSequenceClassification.from_pretrained(model_path, num_labels=6, ignore_mismatched_sizes=True)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ./models-checkpoints/checkpoint-275000 and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
test_dataset = URLDataset(tokens_list, labels_list)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)  # Adjust batch_size as needed

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # Or use your specific tokenizer
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=data_collator)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.eval()  # Ensure model is in evaluation mode

# Initialize lists to store true labels and predictions
true_labels = []
predictions = []

with torch.no_grad():  # No need to track gradients for evaluation
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, labels=labels)
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        
        # Move preds and labels to CPU for further operations
        predictions.extend(preds.detach().cpu().numpy())
        true_labels.extend(labels.detach().cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
