# 1. Activate GPU and Install Dependencies

In [None]:
# Enable a GPU for faster training: click 'Runtime' > 'Change runtime type' and select GPU as the
# hardware accelerator (these menu names match e.g. Google Colab).
# Then check that PyTorch can see a CUDA device. The call is the last expression of the cell, so
# its boolean result is shown as the cell output.
import torch
torch.cuda.is_available()

In [None]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

# 2. Preprocess data

In [110]:
# Load data: fetch the "train" split of the "NotShrirang/email-spam-filter" dataset
# using the Hugging Face `datasets` library.
from datasets import load_dataset

ds = load_dataset("NotShrirang/email-spam-filter", split="train")

In [112]:
# Make `label_num` the column named `label`, which is the column name the Trainer reads targets
# from; the original `label` column is kept under the name `label1`.
# (presumably `label_num` holds the integer class ids and `label` their text form - verify
# against the dataset card)
# The guard makes the cell safe to re-run: once the rename has happened `label_num` no longer
# exists, and repeating the first rename would raise because `label1` is already taken.
if "label_num" in ds.column_names:
    ds = ds.rename_column('label', 'label1')
    ds = ds.rename_column('label_num', 'label')

In [113]:
# Hold out 20% of the examples for validation.
# A fixed seed makes the shuffle - and therefore the split and every metric computed on it -
# reproducible across kernel restarts.
train_val_split = ds.train_test_split(test_size=0.2, seed=42)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

In [115]:
# Set DistilBERT tokenizer: load the tokenizer belonging to the same "distilbert-base-uncased"
# checkpoint that is fine-tuned later in this notebook.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: mapping with a "text" entry holding the raw texts to encode.

    Returns:
        Whatever the notebook-level `tokenizer` returns for those texts when
        called with truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits; batched=True hands preprocess_function many examples per call.
tokenized_train = train_dataset.map(
    function=preprocess_function,
    batched=True,
)
tokenized_test = val_dataset.map(
    function=preprocess_function,
    batched=True,
)

In [117]:
# Use a data collator to convert our samples to PyTorch tensors and concatenate them with the
# correct amount of padding. Padding happens per batch at collation time, which is why the
# tokenization step above truncates but does not pad.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Training the model

In [None]:
# Define DistilBERT as our base model: load the "distilbert-base-uncased" checkpoint with a
# sequence-classification head sized for two labels.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
!pip install torcheval

In [119]:
# Define the evaluation metrics
import numpy as np
import torch
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, MulticlassF1Score

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from the predictions of an evaluation run.

    Args:
        eval_pred: a pair ``(logits, labels)``. ``logits`` is array-like with the
            class scores on the last axis; ``labels`` holds the matching integer
            class ids (0 or 1).

    Returns:
        dict with two Python floats: ``"accuracy"`` and ``"f1"``. The F1 value is
        the F1 score of the positive class (label 1).
    """
    logits, labels = eval_pred
    # The predicted class is the index of the highest score.
    predictions = torch.tensor(np.argmax(logits, axis=-1))
    targets = torch.tensor(labels)

    metric_ac = BinaryAccuracy()
    # Binary F1 rather than MulticlassF1Score: the latter defaults to micro averaging,
    # which for single-label predictions is numerically identical to accuracy and
    # therefore adds no information.
    metric_f1 = BinaryF1Score()

    metric_ac.update(predictions, targets)
    metric_f1.update(predictions, targets)
    # .item() turns the 0-d result tensors into plain floats for logging.
    return {"accuracy": metric_ac.compute().item(), "f1": metric_f1.compute().item()}

In [None]:
# Log in to your Hugging Face account so that later cells can push the model to the Hub.
# Get your API token here: https://huggingface.co/settings/tokens
# notebook_login() prompts for the token interactively, so the token never has to be
# written into the notebook itself.
from huggingface_hub import notebook_login

notebook_login()

In [120]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

# Used as the local output directory (and, with push_to_hub=True, as the Hub repository name).
# NOTE(review): the name mentions 5000 samples, but no subsampling of the training split is
# visible in this notebook - confirm the name is still accurate.
repo_name = "finetuning-spam-model-5000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,            # where checkpoints and the final model are written
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",           # write a checkpoint at the end of every epoch
    push_to_hub=True,                # requires the Hub login performed earlier
)

# The validation split (tokenized_test) is passed as eval_dataset; compute_metrics is applied
# to its predictions when trainer.evaluate() is called.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model: fine-tune on tokenized_train using the settings in training_args.
trainer.train()

In [None]:
# Compute the evaluation metrics on the eval_dataset given to the Trainer (tokenized_test);
# the returned dict is displayed as the cell output.
trainer.evaluate()

# 4. Analyzing new data with the model

In [None]:
# Upload the model to the Hub (requires the Hub login performed earlier in the notebook).
trainer.push_to_hub()

In [None]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

# The task is passed explicitly. Without it, pipeline() tries to infer the task by looking the
# model id up on the Hub, which fails for a name that has no "<user>/" namespace. With the task
# set, the model is loaded from the local directory of the same name (the Trainer's output_dir).
sentiment_model = pipeline(task="text-classification", model="finetuning-spam-model-5000-samples")

# Classify one email from the dataset and one free-form sentence.
sentiment_model([ds[40]['text'], "This movie sucks!"])