In [18]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

df = pd.read_csv("email.csv")[['Category', 'Message']]
df.columns = ['Category', 'Message']
df = df.dropna()
df['Category'] = df['Category'].astype(str).str.strip().str.lower()
df = df[df['Category'].isin(['ham', 'spam'])]
df['label'] = df['Category'].map({'ham': 0, 'spam': 1})

df = df.reset_index(drop=True)
df['text'] = df['Message'].astype(str)

In [19]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)


In [21]:
class EmailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.tolist()

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)  # Critical for CrossEntropyLoss
        }

    def __len__(self):
        return len(self.labels)

train_dataset = EmailDataset(train_encodings, train_labels)
test_dataset = EmailDataset(test_encodings, test_labels)


In [22]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./bert_spam_output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0838,0.079211
2,0.0002,0.048557
3,0.0001,0.048952


TrainOutput(global_step=1674, training_loss=0.03695307712157156, metrics={'train_runtime': 749.8513, 'train_samples_per_second': 17.832, 'train_steps_per_second': 2.232, 'total_flos': 1635347236816440.0, 'train_loss': 0.03695307712157156, 'epoch': 3.0})

In [24]:
print(df['Category'].unique())


['ham' 'spam']


In [25]:

trainer.evaluate()

preds_output = trainer.predict(test_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

print(classification_report(test_labels, preds))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.97      0.97      0.97       149

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [26]:
model.save_pretrained("bert_spam_model")
tokenizer.save_pretrained("bert_spam_model")


('bert_spam_model/tokenizer_config.json',
 'bert_spam_model/special_tokens_map.json',
 'bert_spam_model/vocab.txt',
 'bert_spam_model/added_tokens.json')

In [27]:
!zip -r bert_spam_model.zip bert_spam_model/


  adding: bert_spam_model/ (stored 0%)
  adding: bert_spam_model/special_tokens_map.json (deflated 42%)
  adding: bert_spam_model/vocab.txt (deflated 53%)
  adding: bert_spam_model/tokenizer_config.json (deflated 75%)
  adding: bert_spam_model/config.json (deflated 49%)
  adding: bert_spam_model/model.safetensors (deflated 7%)


In [28]:
from google.colab import files
files.download('bert_spam_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>