In [1]:
!pip install tiktoken



In [2]:
import itertools
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tiktoken
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Model, GPT2Config


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import kagglehub

# Download the latest version of the dataset; `path` is the local directory
# kagglehub stored it in (printed below).
path = kagglehub.dataset_download("ethancratchley/email-phishing-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Shivansh Kalra\.cache\kagglehub\datasets\ethancratchley\email-phishing-dataset\versions\1


In [4]:
# Prefer the copy kagglehub downloaded in the previous cell (`path`) so the
# notebook is not tied to one machine; fall back to the original local copy if
# the file is not found there.
# NOTE(review): assumes the CSV inside the Kaggle dataset has the same file
# name as the local copy -- confirm.
_local_csv = "C:\\Development\\LLMProj\\NewProj\\email_phishing_data.csv"
_downloaded_csv = os.path.join(path, "email_phishing_data.csv")
data_file_path = _downloaded_csv if os.path.exists(_downloaded_csv) else _local_csv

In [5]:
import pandas as pd
# Load the raw feature table (comma-separated, which is read_csv's default).
df = pd.read_csv(data_file_path)
df

Unnamed: 0,num_words,num_unique_words,num_stopwords,num_links,num_unique_domains,num_email_addresses,num_spelling_errors,num_urgent_keywords,label
0,140,94,52,0,0,0,0,0,0
1,5,5,1,0,0,0,0,0,0
2,34,32,15,0,0,0,0,0,0
3,6,6,2,0,0,0,0,0,0
4,9,9,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
524841,782,327,301,2,2,2,52,1,0
524842,36,30,11,0,0,0,4,0,1
524843,61,46,11,0,0,0,3,0,0
524844,213,136,89,0,0,0,18,0,0


In [6]:
# Class balance check: the output below shows label 0 vastly outnumbers label 1.
print(df["label"].value_counts())

label
0    517897
1      6949
Name: count, dtype: int64


In [7]:
# List the column names: eight numeric feature columns plus the `label` target.
df.columns

Index(['num_words', 'num_unique_words', 'num_stopwords', 'num_links',
       'num_unique_domains', 'num_email_addresses', 'num_spelling_errors',
       'num_urgent_keywords', 'label'],
      dtype='object')

In [8]:
def create_balanced_dataset(df):
    """
    Downsample the label-0 rows so both classes have the same number of rows.

    Args:
        df (pd.DataFrame): Frame with a binary ``label`` column (1 = phishing).

    Returns:
        pd.DataFrame: The sampled label-0 rows followed by every label-1 row.
    """
    phishing_rows = df[df["label"] == 1]
    legit_rows = df[df["label"] == 0]

    # Fixed random_state keeps the sampled subset identical between runs.
    sampled_legit = legit_rows.sample(phishing_rows.shape[0], random_state=123)

    return pd.concat([sampled_legit, phishing_rows])


# Downsample the majority class, then confirm both labels have equal counts.
balanced_df = create_balanced_dataset(df)
print(balanced_df["label"].value_counts())


label
0    6949
1    6949
Name: count, dtype: int64


In [9]:
def random_split(df, train_frac, validation_frac):
    """
    Shuffle ``df`` reproducibly and cut it into train / validation / test parts.

    Args:
        df (pd.DataFrame): Frame to split.
        train_frac (float): Fraction of rows assigned to the training split.
        validation_frac (float): Fraction of rows assigned to the validation split.

    Returns:
        tuple: (train_df, validation_df, test_df); the test split receives
        whatever rows remain after the first two.
    """
    # Fixed seed => same shuffle every run; the index is renumbered 0..n-1.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    n_rows = len(shuffled)
    first_cut = int(n_rows * train_frac)
    second_cut = first_cut + int(n_rows * validation_frac)

    return (
        shuffled.iloc[:first_cut],
        shuffled.iloc[first_cut:second_cut],
        shuffled.iloc[second_cut:],
    )

# 70% of the balanced rows for training, 10% for validation, the rest for testing.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# Write each split to disk without the index column.
for split_frame, csv_name in (
    (train_df, "train.csv"),
    (validation_df, "validation.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(csv_name, index=None)

In [10]:
# Show the columns of the full frame again; the balancing and splitting cells
# above build new frames and leave `df` itself unchanged.
df.columns

Index(['num_words', 'num_unique_words', 'num_stopwords', 'num_links',
       'num_unique_domains', 'num_email_addresses', 'num_spelling_errors',
       'num_urgent_keywords', 'label'],
      dtype='object')

In [11]:
import torch
# Seed PyTorch first, then NumPy, with the same value so runs are repeatable.
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(193)

In [12]:
import torch
import pandas as pd
from torch.utils.data import Dataset
import numpy as np

class PhishingDataset(Dataset):
    """
    Dataset that renders each row of numeric email features as a sentence and
    tokenizes it to a fixed length.

    Each item is ``(input_ids, attention_mask, label)`` where both ``input_ids``
    and ``attention_mask`` have length ``max_length``; the mask is 1 on real
    tokens and 0 on padding.
    """

    def __init__(self, csv_file, tokenizer, max_length=128):
        """
        Initialize the PhishingDataset.

        Args:
            csv_file (str): Path to the CSV file containing features and labels.
            tokenizer: Object with an ``encode(text)`` method returning token ids.
                If it has a non-None ``pad_token_id`` that id is used for padding,
                otherwise 0 is used.
            max_length (int): Fixed token length of every returned sample.
        """
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Feature columns in the dataset
        self.feature_cols = [
            'num_words', 'num_unique_words', 'num_stopwords', 'num_links',
            'num_unique_domains', 'num_email_addresses', 'num_spelling_errors',
            'num_urgent_keywords'
        ]

        # Missing columns are created as all-zero (best effort, as before) but
        # are now reported so that a malformed CSV does not go unnoticed.
        for col in self.feature_cols + ['label']:
            if col not in self.data.columns:
                print(f"Warning: column '{col}' not found in {csv_file}; filling with 0")
                self.data[col] = 0
            # Fill NaN values with 0
            self.data[col] = self.data[col].fillna(0)

        print(f"Loaded {len(self.data)} samples from {csv_file}")

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def _create_text_representation(self, row):
        """
        Create a textual representation of the features for the transformer model.

        Args:
            row: A pandas Series containing the features

        Returns:
            str: A textual representation of the features
        """
        text = (
            f"Email characteristics: "
            f"Contains {row['num_words']} words with {row['num_unique_words']} unique words. "
            f"Has {row['num_stopwords']} stopwords. "
            f"Includes {row['num_links']} links to {row['num_unique_domains']} different domains. "
            f"Contains {row['num_email_addresses']} email addresses. "
            f"Has {row['num_spelling_errors']} spelling errors. "
            f"Contains {row['num_urgent_keywords']} urgent keywords."
        )
        return text

    def _encode(self, text, idx):
        """
        Tokenize ``text`` and return a non-empty list of token ids.

        Any tokenizer failure or unusable result falls back to ``[0]`` (with a
        printed warning) so that one bad row does not abort a whole epoch.
        """
        try:
            encoding = self.tokenizer.encode(text)
        except Exception as e:
            print(f"Error encoding text at index {idx}: {e}")
            print(f"Text was: {text}")
            return [0]

        if encoding is None:
            print(f"Warning: Tokenizer returned None encoding for index {idx}, text: {text}")
            return [0]

        if not isinstance(encoding, list):
            # Tuples, arrays and other iterables are accepted; anything else is not.
            try:
                encoding = list(encoding)
            except TypeError:
                print(f"Warning: Tokenizer returned non-sequence encoding: {type(encoding)} for index {idx}")
                return [0]

        return encoding if encoding else [0]

    def __getitem__(self, idx):
        """
        Get tokenized text and label for a sample at index idx.

        Returns:
            tuple: (input_ids, attention_mask, label_tensor). ``attention_mask``
            is 1 for real tokens and 0 for padding positions.
        """
        sample = self.data.iloc[idx]

        text = self._create_text_representation(sample)

        # Ensure we always have a valid text string
        if not isinstance(text, str) or not text:
            text = "Email characteristics: Empty or invalid email."
            print(f"Warning: Invalid text at index {idx}. Using default text.")

        # Truncate to max_length; shorter sequences are right-padded below.
        encoding = self._encode(text, idx)[:self.max_length]
        num_real = len(encoding)
        num_pad = self.max_length - num_real

        pad_token = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token is None:
            pad_token = 0

        input_ids = torch.tensor(encoding + [pad_token] * num_pad, dtype=torch.long)

        # Mark padding with 0 so the model can ignore it.
        attention_mask = torch.zeros(self.max_length, dtype=torch.long)
        attention_mask[:num_real] = 1

        label = torch.tensor(int(sample['label']), dtype=torch.long)

        return input_ids, attention_mask, label

In [13]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """
    Calculate classification accuracy over a dataloader.

    Args:
        data_loader: Iterable yielding (input_ids, attention_mask, labels) batches
        model: Model to evaluate; called as ``model(input_ids, attention_mask)``
        device: Device to run on
        num_batches: Maximum number of batches to evaluate (None = all)

    Returns:
        float: Accuracy (0-1); 0 when no samples were evaluated

    Note:
        The model is switched to eval mode and left in it.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            if num_batches is not None and i >= num_batches:
                break

            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask)
            # Accept raw logits as well as output objects exposing `.logits`.
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs
            predicted = logits.argmax(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total if total > 0 else 0

def train_classifier(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq=1, eval_iter=None):
    """
    Train a classifier model, reporting validation accuracy every ``eval_freq`` epochs.

    NOTE: a later cell in this notebook defines another ``train_classifier``;
    when the notebook is run top to bottom that later definition replaces this one.

    Args:
        model: The model to train
        train_loader: DataLoader yielding (input_ids, attention_mask, labels)
        val_loader: DataLoader for validation data
        optimizer: Optimizer for training
        device: Device to train on ('cuda' or 'cpu')
        num_epochs: Number of epochs to train
        eval_freq: How often to evaluate on validation set (in epochs)
        eval_iter: Number of validation batches to evaluate on (None = all)

    Returns:
        tuple: (trained model, per-batch training losses, validation accuracies)
    """
    train_losses = []
    val_accuracies = []

    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        # Stays NaN if the loader yields no batches, so the report below cannot
        # fail on an unbound variable.
        last_loss = float('nan')

        for input_ids, attention_mask, labels in train_loader:
            optimizer.zero_grad()  # Reset gradients from the previous batch

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs

            loss = F.cross_entropy(logits, labels)

            loss.backward()
            optimizer.step()

            last_loss = loss.item()
            train_losses.append(last_loss)

        # Evaluation phase
        if (epoch + 1) % eval_freq == 0:
            # islice caps the number of validation batches; None means "all".
            limited_val = itertools.islice(val_loader, eval_iter)
            accuracy = calc_accuracy_loader(limited_val, model, device)
            val_accuracies.append(accuracy)
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {last_loss:.4f}, Validation Accuracy: {accuracy:.4f}")
        else:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {last_loss:.4f}")

    return model, train_losses, val_accuracies

In [14]:
def calc_loss_batch(input_data, labels, model, device):
    """
    Compute the cross-entropy loss of ``model`` on one batch.

    Args:
        input_data: Either a tuple ``(input_ids, attention_mask)`` or a single
            input tensor
        labels: Target class indices
        model: Model to run; for tuple input it is called with the keyword
            arguments ``input_ids`` and ``attention_mask``
        device: Device the tensors are moved to before the forward pass

    Returns:
        torch.Tensor: Loss value
    """
    if not isinstance(input_data, tuple):
        # Plain tensor input: a single positional argument.
        features = input_data.to(device)
        targets = labels.to(device)
        model_out = model(features)
    else:
        token_ids, mask = input_data
        token_ids = token_ids.to(device)
        mask = mask.to(device)
        targets = labels.to(device)
        model_out = model(input_ids=token_ids, attention_mask=mask)

    # Output objects carrying `.logits` and bare logit tensors are both accepted.
    scores = getattr(model_out, 'logits', model_out)

    return F.cross_entropy(scores, targets)

In [15]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """
    Measure the average loss on the training and validation loaders.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Args:
        model: Model to evaluate
        train_loader: Loader for the training data
        val_loader: Loader for the validation data
        device: Device to run on
        eval_iter: Number of batches to use from each loader

    Returns:
        tuple: (train_loss, val_loss)
    """
    model.eval()
    with torch.no_grad():
        # Training loader first, then validation -- same order as the result tuple.
        losses = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return losses[0], losses[1]

In [16]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """
    Calculate average loss over a dataloader.

    Args:
        data_loader: Iterable yielding (input_ids, attention_mask, labels) batches
        model: Model to evaluate
        device: Device to run on
        num_batches: Maximum number of batches to use (None = all)

    Returns:
        float: Average loss as a plain Python float, or ``inf`` if no batch was
        evaluated

    Note:
        The model is switched to eval mode and left in it.
    """
    model.eval()
    total_loss = 0.0
    count = 0

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            if num_batches is not None and i >= num_batches:
                break

            # Pass the attention mask along with the ids so evaluation uses the
            # same inputs as the training step.
            loss = calc_loss_batch((batch[0], batch[1]), batch[2], model, device)

            # .item() yields a Python float, so the result can be plotted and
            # formatted regardless of the device.
            total_loss += loss.item()
            count += 1

    return total_loss / count if count > 0 else float('inf')

In [17]:
def train_classifier(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter,
                     max_batches_per_epoch=None):
    """
    Fine-tune ``model`` and track loss and accuracy during training.

    Args:
        model: The model to train
        train_loader: DataLoader yielding (input_ids, attention_mask, labels)
        val_loader: DataLoader for validation data
        optimizer: Optimizer for training
        device: Device to train on
        num_epochs: Number of epochs to train
        eval_freq: Evaluate the losses every ``eval_freq`` optimizer steps
        eval_iter: Number of batches used for each loss/accuracy estimate
            (None = all)
        max_batches_per_epoch: Optional cap on the training batches processed
            in each epoch (None = the whole loader)

    Returns:
        tuple: (train_losses, val_losses, train_accs, val_accs, examples_seen).
        Losses are recorded at every evaluation step, accuracies once per epoch.
    """
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        # islice(..., None) iterates the whole loader, so the cap is optional.
        for input_ids, attention_mask, labels in itertools.islice(train_loader, max_batches_per_epoch):
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch((input_ids, attention_mask), labels, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients

            examples_seen += input_ids.shape[0]  # Track examples
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                # Store plain floats so the history can be plotted even when the
                # losses come back as (possibly CUDA) tensors.
                train_loss, val_loss = float(train_loss), float(val_loss)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Accuracy after each epoch, limited to `eval_iter` batches by slicing
        # the loaders themselves.
        train_accuracy = calc_accuracy_loader(itertools.islice(train_loader, eval_iter), model, device)
        val_accuracy = calc_accuracy_loader(itertools.islice(val_loader, eval_iter), model, device)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [18]:
#ADDED
import torch
import torch.nn as nn

class FeatureBasedClassifier(nn.Module):
    """MLP that maps a vector of numeric email features to class logits."""

    def __init__(self, input_size=8, hidden_sizes=[32, 16], num_classes=2):
        """
        Build the network: one Linear -> ReLU -> Dropout(0.2) group per hidden
        size, followed by a final Linear layer producing the logits.

        Args:
            input_size (int): Number of input features
            hidden_sizes (list): List of hidden layer sizes
            num_classes (int): Number of output classes
        """
        super().__init__()

        # Consecutive pairs of this list are the (in, out) widths of the hidden layers.
        widths = [input_size, *hidden_sizes]

        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)])

        # Output layer
        stack.append(nn.Linear(widths[-1], num_classes))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """
        Run the input through the layer stack.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, input_size]

        Returns:
            torch.Tensor: Output logits of shape [batch_size, num_classes]
        """
        logits = self.model(x)
        return logits

In [19]:
import torch
import torch.nn as nn
from transformers import AutoModel
class SmolLMClassifier(nn.Module):
    """
    Pretrained transformer backbone with a small MLP classification head.

    NOTE: a later cell in this notebook defines another ``SmolLMClassifier``;
    when the notebook is run top to bottom that later definition replaces this one.
    """

    def __init__(self, pretrained_model_name="HuggingFaceTB/SmolLM2-135M", num_classes=2):
        """
        Initialize a SmolLM based classifier.

        Args:
            pretrained_model_name (str): Name of the pretrained model to load
            num_classes (int): Number of output classes
        """
        super().__init__()

        # Load the pretrained model
        self.transformer = AutoModel.from_pretrained(pretrained_model_name)

        # Configure the classifier head
        hidden_size = self.transformer.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size // 2, num_classes)
        )

    def forward(self, input_ids, attention_mask=None):
        """
        Forward pass through the network.

        Args:
            input_ids (torch.Tensor): Token IDs of shape [batch_size, seq_len]
            attention_mask (torch.Tensor): Optional mask of shape
                [batch_size, seq_len] with 1 on real tokens and 0 on padding

        Returns:
            torch.Tensor: Output logits of shape [batch_size, num_classes]
        """
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        last_hidden_state = transformer_outputs.last_hidden_state

        if attention_mask is None:
            # Without a mask every position is treated as real: use the last one.
            sequence_output = last_hidden_state[:, -1, :]
        else:
            # Classify from the last *real* token of each sequence. With right
            # padding, position -1 is a pad token; with an all-ones mask this
            # reduces to position -1, as before.
            positions = torch.arange(last_hidden_state.size(1), device=last_hidden_state.device)
            mask = attention_mask.to(last_hidden_state.device).long()
            last_real = (mask * positions).argmax(dim=1)
            batch_index = torch.arange(last_hidden_state.size(0), device=last_hidden_state.device)
            sequence_output = last_hidden_state[batch_index, last_real]

        # Pass through the classifier head
        logits = self.classifier(sequence_output)

        return logits

In [20]:
def plot_values(epochs, examples_seen, train_values, val_values, label="loss"):
    """
    Draw training vs. validation curves in two side-by-side panels, save the
    figure as ``phishing_<label>_plot.png`` and show it.

    Args:
        epochs: Tensor of x values measured in epochs (left panel)
        examples_seen: Tensor of x values measured in examples seen (right panel)
        train_values: Y values for training
        val_values: Y values for validation
        label: Name of the plotted quantity (loss or accuracy)
    """
    y_title = label.capitalize()

    # (subplot position, x tensor, x-axis caption) for the two panels.
    panels = (
        (1, epochs, 'Epochs'),
        (2, examples_seen, 'Examples Seen'),
    )

    plt.figure(figsize=(12, 5))

    for position, x_tensor, x_caption in panels:
        plt.subplot(1, 2, position)
        x_axis = x_tensor.numpy()
        plt.plot(x_axis, train_values, label=f'Training {label}')
        plt.plot(x_axis, val_values, label=f'Validation {label}')
        plt.xlabel(x_caption)
        plt.ylabel(y_title)
        plt.title(f'{y_title} vs. {x_caption}')
        plt.legend()

    plt.tight_layout()
    plt.savefig(f'phishing_{label}_plot.png')
    plt.show()



In [21]:
from transformers import AutoModel, AutoConfig

class SmolLMClassifier(torch.nn.Module):
    """
    Pretrained transformer backbone with a single linear classification layer.

    The sequence is summarised by the hidden state of its last real
    (non-padding) token.
    """

    def __init__(self, pretrained_model_name="HuggingFaceTB/SmolLM2-135M", num_classes=2):
        """
        Load the pretrained backbone and create the linear head.

        Args:
            pretrained_model_name (str): Name of the pretrained model to load
            num_classes (int): Number of output classes
        """
        super(SmolLMClassifier, self).__init__()
        config = AutoConfig.from_pretrained(pretrained_model_name)
        self.transformer = AutoModel.from_pretrained(pretrained_model_name, config=config)

        # Some configs name the embedding width `n_embd` instead of `hidden_size`.
        hidden_size = config.hidden_size if hasattr(config, 'hidden_size') else config.n_embd
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """
        Compute class logits for a batch of token sequences.

        Args:
            input_ids (torch.Tensor): Token IDs of shape [batch_size, seq_len]
            attention_mask (torch.Tensor): Optional mask of shape
                [batch_size, seq_len] with 1 on real tokens and 0 on padding

        Returns:
            torch.Tensor: Output logits of shape [batch_size, num_classes]
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        # A causal decoder has no [CLS] token: position 0 attends only to itself,
        # so its state does not depend on the rest of the sequence. Pool the
        # last real token instead.
        if attention_mask is None:
            pooled = hidden[:, -1, :]
        else:
            positions = torch.arange(hidden.size(1), device=hidden.device)
            mask = attention_mask.to(hidden.device).long()
            last_real = (mask * positions).argmax(dim=1)
            batch_index = torch.arange(hidden.size(0), device=hidden.device)
            pooled = hidden[batch_index, last_real]

        logits = self.classifier(pooled)
        return logits


In [22]:
def classify_email(features_dict, model, tokenizer, device, max_length=128):
    """
    Classify email based on its features.

    Args:
        features_dict (dict): Dictionary of feature values; missing keys count as 0
        model: Trained model, called as ``model(input_ids, attention_mask)``
        tokenizer: Object with an ``encode(text)`` method; a non-None
            ``pad_token_id`` is used for padding, otherwise 0
        device: Device to run on
        max_length: Fixed sequence length the input is truncated/padded to

    Returns:
        int: Predicted class (0 = ham, 1 = phishing)
    """
    model.eval()

    # Create text representation from features
    text = (
        f"Email characteristics: "
        f"Contains {features_dict.get('num_words', 0)} words with {features_dict.get('num_unique_words', 0)} unique words. "
        f"Has {features_dict.get('num_stopwords', 0)} stopwords. "
        f"Includes {features_dict.get('num_links', 0)} links to {features_dict.get('num_unique_domains', 0)} different domains. "
        f"Contains {features_dict.get('num_email_addresses', 0)} email addresses. "
        f"Has {features_dict.get('num_spelling_errors', 0)} spelling errors. "
        f"Contains {features_dict.get('num_urgent_keywords', 0)} urgent keywords."
    )

    # Encode and truncate; shorter sequences are right-padded below.
    encoding = list(tokenizer.encode(text))[:max_length]
    num_real = len(encoding)
    num_pad = max_length - num_real

    # `pad_token_id` can exist but be None; padding with None would make the
    # tensor conversion fail, so fall back to 0 in that case too.
    pad_token = getattr(tokenizer, 'pad_token_id', None)
    if pad_token is None:
        pad_token = 0

    input_ids = torch.tensor(encoding + [pad_token] * num_pad, dtype=torch.long).unsqueeze(0).to(device)

    # 1 on real tokens, 0 on padding.
    attention_mask = torch.zeros_like(input_ids)
    attention_mask[:, :num_real] = 1

    # Get prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits if hasattr(outputs, 'logits') else outputs
        predicted = logits.argmax(dim=1)

    return predicted.item()

In [None]:
import torch
from torch.utils.data import DataLoader
import time
import tiktoken

import torch.nn.functional as F




def main():
    """
    End-to-end run: load the three CSV splits, fine-tune the SmolLM2 classifier,
    plot the training curves, report final accuracies, save the weights and
    classify two hand-written feature examples.
    """
    # Determine device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # The tokenizer must match the pretrained backbone loaded below.
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")

    # Load the datasets
    print("Loading datasets...")
    train_dataset = PhishingDataset(csv_file="train.csv", tokenizer=tokenizer)
    val_dataset = PhishingDataset(csv_file="validation.csv", tokenizer=tokenizer)
    test_dataset = PhishingDataset(csv_file="test.csv", tokenizer=tokenizer)

    # Create data loaders
    batch_size = 8
    # Load data in the main process. Worker processes must re-import the dataset
    # class, which is not possible for a class defined inside a notebook on
    # platforms that spawn workers (e.g. Windows).
    num_workers = 0

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    print(f"{len(train_loader)} training batches")
    print(f"{len(val_loader)} validation batches")
    print(f"{len(test_loader)} test batches")

    # Initialize the model (SmolLM2 backbone + classification head)
    model = SmolLMClassifier(pretrained_model_name="HuggingFaceTB/SmolLM2-135M", num_classes=2)

    model.to(device)

    # Freeze everything, then unfreeze the classifier head, the final
    # transformer block and the final normalisation layer.
    for param in model.parameters():
        param.requires_grad = False

    for param in model.classifier.parameters():
        param.requires_grad = True

    # GPT-2 style backbones expose `h` / `ln_f`; Llama style backbones (such as
    # SmolLM2) expose `layers` / `norm`. Support both naming schemes.
    backbone = model.transformer
    for blocks_attr in ('h', 'layers'):
        blocks = getattr(backbone, blocks_attr, None)
        if blocks is not None and len(blocks) > 0:
            for param in blocks[-1].parameters():
                param.requires_grad = True
    for norm_attr in ('ln_f', 'norm'):
        final_norm = getattr(backbone, norm_attr, None)
        if final_norm is not None:
            for param in final_norm.parameters():
                param.requires_grad = True

    # Initial evaluation
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)
        test_loss = calc_loss_loader(test_loader, model, device, num_batches=5)

    print(f"Initial Training loss: {train_loss:.3f}")
    print(f"Initial Validation loss: {val_loss:.3f}")
    print(f"Initial Test loss: {test_loss:.3f}")

    # Train the model
    start_time = time.time()

    # Only hand the optimizer the parameters that are actually trainable.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=5e-5, weight_decay=0.1)

    num_epochs = 1
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=1, eval_iter=1, max_batches_per_epoch=10
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    # Plot training progress. Plain floats keep matplotlib from having to
    # convert (possibly CUDA) tensors.
    train_losses = [float(v) for v in train_losses]
    val_losses = [float(v) for v in val_losses]

    # Losses are recorded per evaluation step and accuracies per epoch, so each
    # curve needs x-axes matching its own number of points.
    loss_epochs = torch.linspace(0, num_epochs, len(train_losses))
    loss_examples = torch.linspace(0, examples_seen, len(train_losses))
    plot_values(loss_epochs, loss_examples, train_losses, val_losses)

    acc_epochs = torch.linspace(0, num_epochs, len(train_accs))
    acc_examples = torch.linspace(0, examples_seen, len(train_accs))
    plot_values(acc_epochs, acc_examples, train_accs, val_accs, label="accuracy")

    # Final evaluation
    train_accuracy = calc_accuracy_loader(train_loader, model, device)
    val_accuracy = calc_accuracy_loader(val_loader, model, device)
    test_accuracy = calc_accuracy_loader(test_loader, model, device)

    print(f"Final Training accuracy: {train_accuracy*100:.2f}%")
    print(f"Final Validation accuracy: {val_accuracy*100:.2f}%")
    print(f"Final Test accuracy: {test_accuracy*100:.2f}%")

    # Save the model
    torch.save(model.state_dict(), "phishing_transformer_classifier.pth")
    print("Model saved to phishing_transformer_classifier.pth")

    # Example predictions
    phishing_features = {
        'num_words': 150,
        'num_unique_words': 90,
        'num_stopwords': 40,
        'num_links': 3,
        'num_unique_domains': 2,
        'num_email_addresses': 1,
        'num_spelling_errors': 5,
        'num_urgent_keywords': 4
    }

    prediction = classify_email(phishing_features, model, tokenizer, device, max_length=train_dataset.max_length)
    print(f"Phishing sample prediction (expected 1): {prediction}")

    ham_features = {
        'num_words': 120,
        'num_unique_words': 100,
        'num_stopwords': 30,
        'num_links': 0,
        'num_unique_domains': 0,
        'num_email_addresses': 0,
        'num_spelling_errors': 0,
        'num_urgent_keywords': 0
    }

    prediction = classify_email(ham_features, model, tokenizer, device, max_length=train_dataset.max_length)
    print(f"Ham sample prediction (expected 0): {prediction}")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Using device: cuda




Loading datasets...
Loaded 9728 samples from train.csv
Loaded 1389 samples from validation.csv
Loaded 2781 samples from test.csv
1216 training batches
174 validation batches
348 test batches


