In [34]:
import os

import datasets
import evaluate
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [98]:
# Load the "Deysi/spam-detection-dataset" dataset. Later cells index the
# (mapped) result by its "train" and "test" splits.
dataset = load_dataset("Deysi/spam-detection-dataset")

In [99]:
# Tokenizer for the same "google-bert/bert-base-cased" checkpoint that the
# classification model in this notebook is loaded from.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [100]:
def label_to_bin(data):
    """Return a copy of ``data`` whose ``"label"`` is 1 for spam, 0 otherwise.

    The conversion is idempotent: a row that already carries the numeric
    label ``1`` stays ``1``. Previously any value other than the string
    ``"spam"`` (including an already-converted ``1``) became ``0``, so
    re-running the mapping cell silently relabelled every example as ham.

    Args:
        data: Mapping for one example; must contain a ``"label"`` key.

    Returns:
        A new ``dict`` with the same keys as ``data`` and ``"label"`` set to
        ``1`` (spam) or ``0`` (anything else). ``data`` itself is not modified.
    """
    label = data["label"]
    # Accept both the raw string label and an already-binarised one.
    is_spam = label == "spam" or label == 1
    row = dict(data)  # copy so the caller's example is left untouched
    row["label"] = 1 if is_spam else 0
    return row

def tokenize(data):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Every example is padded/truncated to one fixed length
    (``padding="max_length"``). The previous ``padding=True`` pads only to the
    longest text in the current call; with ``Dataset.map(batched=True)`` the
    function is called once per chunk, so rows from different chunks could
    end up with different lengths and fail to stack into one training batch.
    ``return_tensors="pt"`` was dropped because ``Dataset.map`` stores the
    returned values in its own table format, so plain lists are sufficient.

    Args:
        data: Batch mapping with a ``"text"`` entry (a list of strings when
            called through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch (the encoded model inputs).
    """
    return tokenizer(data["text"], padding="max_length", truncation=True)

In [101]:
# Keep the raw `dataset` untouched and give each stage its own name, so this
# cell can be re-run without feeding already-converted labels back into
# `label_to_bin`.
labeled_dataset = dataset.map(label_to_bin)
tokenized_dataset = labeled_dataset.map(tokenize, batched=True)


[A
[A
Map: 100%|██████████| 8175/8175 [00:00<00:00, 36652.21 examples/s]

Map: 100%|██████████| 2725/2725 [00:00<00:00, 37868.78 examples/s]

[A
[A
[A
[A
[A
[A
[A
  0%|          | 0/3 [06:56<?, ?it/s]

Map: 100%|██████████| 8175/8175 [00:01<00:00, 4374.46 examples/s]
Map: 100%|██████████| 2725/2725 [00:00<00:00, 4900.43 examples/s]


In [107]:
from transformers import AutoModelForSequenceClassification

In [108]:
# BERT checkpoint with a two-class sequence-classification head. The recorded
# output of this cell reports the classifier weights as newly initialised,
# i.e. the head has to be trained before the model is useful.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [109]:
# Accuracy metric from the `evaluate` library; `compute_metrics` calls it.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)`` as unpacked below.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [110]:
from transformers import TrainingArguments, Trainer

# Checkpoints go to "test_trainer"; evaluation runs once per epoch.
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="test_trainer",
)

In [113]:
# Wire model, arguments, data splits and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # Pad every mini-batch to a common length with the tokenizer. Without an
    # explicit collator, batching only works if all rows already happen to
    # have identical lengths.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  0%|          | 0/3 [08:12<?, ?it/s]


In [114]:
# Fine-tune the model (about 20 minutes in the recorded run). The call is the
# cell's last expression, so the returned TrainOutput is shown as its result.
trainer.train()

  0%|          | 0/3066 [00:00<?, ?it/s]

 16%|█▋        | 500/3066 [02:45<14:03,  3.04it/s]

{'loss': 0.0524, 'grad_norm': 0.009051419794559479, 'learning_rate': 4.184605348988911e-05, 'epoch': 0.49}


 33%|███▎      | 1000/3066 [05:35<12:15,  2.81it/s]

{'loss': 0.0142, 'grad_norm': 0.003744924906641245, 'learning_rate': 3.3692106979778215e-05, 'epoch': 0.98}


 33%|███▎      | 1022/3066 [05:45<12:03,  2.82it/s]
 33%|███▎      | 1022/3066 [06:21<12:03,  2.82it/s]

{'eval_loss': 0.0239860862493515, 'eval_accuracy': 0.996697247706422, 'eval_runtime': 36.1652, 'eval_samples_per_second': 75.349, 'eval_steps_per_second': 9.429, 'epoch': 1.0}


 49%|████▉     | 1500/3066 [09:03<08:51,  2.95it/s]  

{'loss': 0.006, 'grad_norm': 0.0015859409468248487, 'learning_rate': 2.553816046966732e-05, 'epoch': 1.47}


 65%|██████▌   | 2000/3066 [11:55<06:10,  2.88it/s]

{'loss': 0.0085, 'grad_norm': 0.0011051242472603917, 'learning_rate': 1.7384213959556427e-05, 'epoch': 1.96}


                                                   
 67%|██████▋   | 2044/3066 [12:54<05:31,  3.09it/s]

{'eval_loss': 0.015008148737251759, 'eval_accuracy': 0.9977981651376147, 'eval_runtime': 35.1872, 'eval_samples_per_second': 77.443, 'eval_steps_per_second': 9.691, 'epoch': 2.0}


 82%|████████▏ | 2500/3066 [15:26<03:08,  3.01it/s]  

{'loss': 0.0018, 'grad_norm': 0.0008961500716395676, 'learning_rate': 9.230267449445531e-06, 'epoch': 2.45}


 98%|█████████▊| 3000/3066 [18:22<00:22,  2.87it/s]

{'loss': 0.0, 'grad_norm': 0.0007178701343946159, 'learning_rate': 1.076320939334638e-06, 'epoch': 2.94}


                                                   
100%|██████████| 3066/3066 [19:37<00:00,  2.60it/s]

{'eval_loss': 0.00936694722622633, 'eval_accuracy': 0.998165137614679, 'eval_runtime': 34.0806, 'eval_samples_per_second': 79.957, 'eval_steps_per_second': 10.006, 'epoch': 3.0}
{'train_runtime': 1177.3184, 'train_samples_per_second': 20.831, 'train_steps_per_second': 2.604, 'train_loss': 0.013529866542696008, 'epoch': 3.0}





TrainOutput(global_step=3066, training_loss=0.013529866542696008, metrics={'train_runtime': 1177.3184, 'train_samples_per_second': 20.831, 'train_steps_per_second': 2.604, 'total_flos': 6452798632704000.0, 'train_loss': 0.013529866542696008, 'epoch': 3.0})

In [125]:
import torch

In [145]:
# One example message to classify, encoded as PyTorch tensors.
text = "Hi, this is Joe, please call me back"
inputs = tokenizer(
    text,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [146]:
# Use the GPU when one is available and fall back to the CPU otherwise; a
# hard-coded "cuda" raises on machines without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = model.to(device)
# Put the model in inference mode explicitly (disables dropout) instead of
# relying on whatever mode the training run left it in.
model.eval()
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# No gradients are needed for a forward-only prediction.
with torch.no_grad():
    outputs = model(**inputs)

In [147]:
# Raw class scores from the forward pass; softmax is applied in the next cell.
logits = outputs.logits

In [148]:
# Normalise the scores over the class axis, then pick the most likely class.
prob = logits.softmax(dim=1)
preds = prob.argmax(dim=1)

# Class index 1 is the "spam" label produced by `label_to_bin`.
is_spam = preds.item() == 1
print(f"Predicted: {'spam' if is_spam else 'ham'}")

Predicted: ham


In [140]:
# Create the target directory first; torch.save fails if "data/" is missing.
os.makedirs("data", exist_ok=True)
# NOTE(review): the file name spells "dectector"; it is kept unchanged because
# code outside this notebook may load the weights from this exact path.
torch.save(model.state_dict(), "data/spam_dectector.pth")