In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Read the raw spam CSV and keep only the two columns used below,
# renamed from the file's 'v1'/'v2' headers to 'label'/'text'.
file_path = "/content/drive/MyDrive/spam.csv"
data = (
    pd.read_csv(file_path, encoding='latin1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Swap the string labels for integer class ids. The encoder is kept around so
# ids can be mapped back to names later (original note: 0 for 'ham', 1 for 'spam').
label_encoder = LabelEncoder()
data = data.assign(label=label_encoder.fit_transform(data['label']))

# Hold out 20% of the rows as the test split; the fixed random_state makes the
# partition repeatable between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'],
    data['label'],
    random_state=42,
    test_size=0.2,
)

# Define a custom dataset class
class SpamDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on demand.

    Args:
        texts: Sized, indexable collection of raw message strings. pandas
            objects are read positionally through ``.iloc`` (so a shuffled or
            non-contiguous index is fine); plain lists, tuples and arrays are
            read with ordinary ``[]`` indexing.
        labels: Integer class ids aligned with ``texts``; same container rules.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # A mismatch would otherwise surface later as an index error (or be
        # silently ignored when ``labels`` is the longer one).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a pandas object or plain sequence."""
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict with 'input_ids',
        'attention_mask' and 'labels' (a ``torch.long`` scalar tensor)."""
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the leading batch dimension added by
            # return_tensors='pt'; a bare squeeze() would also collapse the
            # sequence dimension whenever it has size 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer that goes with the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split in a SpamDataset (train first, then test) sharing the tokenizer
train_dataset, test_dataset = (
    SpamDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

# Sequence-classification model built on the same checkpoint, with two output labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Define metrics function
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from logits and true labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds per-class scores
            with the class axis at position 1. If it is a tuple, only its first
            element is used (assumes the logits come first — confirm for
            models that return additional outputs).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision/recall/F1 use ``average='binary'``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    # as_tensor reuses an existing array's memory instead of copying it.
    predictions = torch.as_tensor(logits).argmax(dim=1).numpy()
    acc = accuracy_score(labels, predictions)
    # zero_division=0 reports 0.0 for an undefined precision/recall (e.g. no
    # positive predictions) without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

# Configuration for a single-epoch fine-tuning run: outputs go to ./results,
# logs to ./logs, and evaluation/checkpointing are both set to "epoch".
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Wire model, data and metrics together. Note that the held-out test split
# doubles as the evaluation set here.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning
trainer.train()

# Persist the model weights and the tokenizer files side by side
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

# Score the model on the evaluation set and show the resulting metrics dict
evaluation_results = trainer.evaluate()
print("Evaluation Metrics:", evaluation_results, sep="\n")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0263,0.027026,0.994619,0.993151,0.966667,0.97973


Evaluation Metrics:
{'eval_loss': 0.02702627144753933, 'eval_accuracy': 0.9946188340807175, 'eval_precision': 0.9931506849315068, 'eval_recall': 0.9666666666666667, 'eval_f1': 0.9797297297297297, 'eval_runtime': 506.8464, 'eval_samples_per_second': 2.2, 'eval_steps_per_second': 0.069, 'epoch': 1.0}
