# **BERT with SimpleClassifier**

BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model that generates contextualized embeddings for text. Unlike traditional techniques such as TF-IDF or Bag-of-Words, BERT captures the meaning of each word based on its surrounding context, which makes it highly effective for tasks such as phishing detection.


In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from torch import nn, optim
from sklearn.metrics import classification_report

# Choose the compute device: CUDA when torch reports it available, else CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fetch the pretrained "bert-base-uncased" tokenizer and encoder
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Place the encoder's parameters on the chosen device
model.to(device)

# Text to embedding function
def text_to_embedding(text):
    """Embed text with the module-level BERT model using masked mean pooling.

    The input is tokenized (padded to the longest item and truncated by the
    tokenizer), run through ``model`` without gradient tracking, and the
    final-layer token vectors are averaged over real tokens only.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array on the CPU with one embedding row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        token_states = outputs.last_hidden_state
        # Weight each position by the attention mask so [PAD] positions added
        # by padding=True do not dilute the average when a batch is passed.
        # For a single text the mask is all ones, i.e. a plain mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        # Guard against division by zero should a row contain no real tokens.
        token_counts = mask.sum(dim=1).clamp(min=1)
        sentence_embedding = summed / token_counts
    return sentence_embedding.cpu().numpy()

# Fetch the "texts" configuration of the phishing dataset
dataset = load_dataset("ealvaradob/phishing-dataset", "texts", trust_remote_code=True)

# Embed every training record's text, one record at a time; squeeze() drops
# any size-1 dimension from the array returned by text_to_embedding
embedded_data = [
    text_to_embedding(record['text']).squeeze()
    for record in dataset['train']
]

# Stack the per-text embeddings into the feature tensor; labels become the targets
X = torch.tensor(np.array(embedded_data), dtype=torch.float32)
y = torch.tensor(dataset['train']['label'], dtype=torch.long)

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define a simple classifier model
class SimpleClassifier(nn.Module):
    """Single fully connected layer mapping feature vectors to per-class scores.

    Args:
        input_dim: Size of each input feature vector.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Return the linear layer's output for ``x`` (no activation applied)."""
        scores = self.fc(x)
        return scores

# Build the classifier: input width comes from the training features,
# output width from the number of distinct label values in y
input_dim = X_train.size(1)
num_classes = y.unique().numel()
classifier = SimpleClassifier(input_dim, num_classes)
classifier.to(device)

# Cross-entropy objective, optimized with Adam at a 1e-3 learning rate
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=1e-3)

# Mini-batch training of the classifier
num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    classifier.train()
    total_loss = 0

    # Draw a fresh random ordering of the training rows for this epoch
    permutation = torch.randperm(X_train.size(0))

    for start in range(0, X_train.size(0), batch_size):
        indices = permutation[start:start + batch_size]
        batch_x = X_train[indices].to(device)
        batch_y = y_train[indices].to(device)

        # One optimization step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        loss = criterion(classifier(batch_x), batch_y)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss; the printed value is this sum
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss:.4f}")

# Score the held-out set batch by batch with gradient tracking disabled
classifier.eval()
y_pred = []
with torch.no_grad():
    for start in range(0, X_test.size(0), batch_size):
        batch_x = X_test[start:start + batch_size].to(device)
        # Predicted class = index of the highest score in each row
        predicted = classifier(batch_x).argmax(dim=1)
        y_pred.extend(predicted.cpu().tolist())

# Report per-class precision / recall / F1 against the true test labels
y_pred = torch.tensor(y_pred)
print(classification_report(y_test, y_pred, target_names=["Legitimate", "Phishing"]))


```
Epoch 1/10, Loss: 136.1604
Epoch 2/10, Loss: 85.4688
Epoch 3/10, Loss: 76.2344
Epoch 4/10, Loss: 71.6812
Epoch 5/10, Loss: 67.7911
Epoch 6/10, Loss: 64.8238
Epoch 7/10, Loss: 62.8730
Epoch 8/10, Loss: 60.6847
Epoch 9/10, Loss: 59.0898
Epoch 10/10, Loss: 57.4522
              precision    recall  f1-score   support

  Legitimate       0.98      0.97      0.97      2493
    Phishing       0.95      0.96      0.96      1535

    accuracy                           0.97      4028
   macro avg       0.96      0.97      0.96      4028
weighted avg       0.97      0.97      0.97      4028

```

