In [1]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch


In [37]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"

print("Device: " + device)
model_location = "./model"

# FlashAttention-2 needs a CUDA device and half-precision weights, so it is
# requested (together with bfloat16) only on the GPU path. On CPU the model is
# loaded in float32 and transformers chooses its default attention backend;
# passing "flash_attention_2" there would make from_pretrained fail.
if use_cuda:
    load_kwargs = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
    }
else:
    load_kwargs = {"torch_dtype": torch.float32}

model = DistilBertForSequenceClassification.from_pretrained(
    model_location,
    **load_kwargs,
).to(device)

tokenizer = DistilBertTokenizer.from_pretrained(model_location)


Device: cuda


In [39]:
def tokenize(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The call requests truncation, ``max_length`` padding with a maximum length
    of 512, and PyTorch tensors as the return type.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encoding_options)

In [40]:
def predict(text):
    """Classify ``text`` and return the label ``"Phishing"`` or ``"Safe"``.

    The text is encoded with ``tokenize``, every resulting tensor is moved to
    ``device``, and the model is run in eval mode with gradient tracking
    disabled. A predicted class index of 1 maps to "Phishing"; any other index
    maps to "Safe".
    """
    encoded = tokenize(text)
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    model.eval()
    with torch.no_grad():
        model_output = model(**batch)

    # .item() requires a single-element tensor, i.e. one input text per call.
    predicted_class = torch.argmax(model_output.logits, dim=1).item()
    if predicted_class == 1:
        return "Phishing"
    return "Safe"

In [45]:
# Smoke test: a short spam-style message run through predict().
test_email = "get your presc [ ription filled in seconds !"
result = predict(test_email)
print(result)

Phishing


In [44]:
# Smoke test: a routine access-request notification run through predict().
# Raw string: the text contains literal backslashes (a UNC-style path), and
# "\ " is an invalid escape sequence in a normal string literal. The runtime
# value is unchanged.
test_safe = r"""request submitted : access request for leann . walton @ enron . com you have received this email because you are listed as an alternate data approver . please click approval to review and act upon this request . request id : 000000000005168 approver : stinson . gibner @ enron . com request create date : 10 / 18 / 00 2 : 06 : 37 pm requested for : leann . walton @ enron . com resource name : \ \ enehou \ houston \ common \ research - [ read / write ] resource type : directory"""
result = predict(test_safe)
print(result)

Safe
