In [1]:
# Show the Python version of the interpreter that is running this kernel.
# A shell call such as `!python --version` reports whichever `python` is first
# on PATH, which is not guaranteed to be the kernel's interpreter, so ask the
# running interpreter directly instead.
import platform

print(f"Python {platform.python_version()}")

Python 3.8.18


In [2]:
import torch

# Choose the compute device: first CUDA GPU if present, otherwise the MPS
# backend if present, otherwise the CPU.
# `torch.backends.mps` is looked up defensively: a PyTorch build without that
# submodule would otherwise raise AttributeError here instead of falling back
# to the CPU.
mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Device: {device}")

Device: mps


In [3]:
# Train a model on the sms_spam dataset

from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

import numpy as np

# Names of the two partitions used for training and evaluation.
splits = ["train", "test"]

# sms_spam ships with a single "train" split, so hold out 20% of it as a test
# set; the fixed seed makes the shuffle-and-split repeatable.
dataset = load_dataset("sms_spam", split="train").train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=23,
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_batch(batch):
    """Run the tokenizer over the "sms" column of a batch with truncation enabled."""
    return tokenizer(batch["sms"], truncation=True)


# Tokenize each split, keyed by split name.
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(tokenize_batch, batched=True)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the `accuracy` metric.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels).

    Returns:
        The result of `accuracy.compute` for the predicted class ids, where
        each predicted class is the argmax of the predictions along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

# Class-id -> readable name; the reverse lookup is derived from it so the two
# mappings always agree.
id2label = {0: "not spam", 1: "spam"}
label2id = {name: idx for idx, name in id2label.items()}

# https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#automodelforsequenceclassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Unfreeze all the model parameters.
model.requires_grad_(True)

# Settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./data/sentiment_analysis",
    learning_rate=2e-5,
    # Reduce the batch size if you don't have enough memory
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluation and checkpoint saving both use the "epoch" strategy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Collator that pads batches using the tokenizer.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/558 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 0.04906448349356651, 'eval_accuracy': 0.9883408071748879, 'eval_runtime': 4.1539, 'eval_samples_per_second': 268.42, 'eval_steps_per_second': 16.851, 'epoch': 1.0}
{'loss': 0.0585, 'learning_rate': 2.078853046594982e-06, 'epoch': 1.79}


  0%|          | 0/70 [00:00<?, ?it/s]

{'eval_loss': 0.04614919051527977, 'eval_accuracy': 0.989237668161435, 'eval_runtime': 2.4036, 'eval_samples_per_second': 463.895, 'eval_steps_per_second': 29.123, 'epoch': 2.0}
{'train_runtime': 94.0272, 'train_samples_per_second': 94.845, 'train_steps_per_second': 5.934, 'train_loss': 0.054600649623460666, 'epoch': 2.0}


TrainOutput(global_step=558, training_loss=0.054600649623460666, metrics={'train_runtime': 94.0272, 'train_samples_per_second': 94.845, 'train_steps_per_second': 5.934, 'train_loss': 0.054600649623460666, 'epoch': 2.0})

In [5]:
# Show some examples from the test set

import pandas as pd
# Widen the column display limit so long SMS messages are not cut off at the default width.
pd.set_option('display.max_colwidth', 200)

# Keep only the raw text and its label from the test split.
test_frame = pd.DataFrame(tokenized_dataset["test"])[['sms', 'label']]

# First five rows with label 0 followed by the first five rows with label 1.
df = pd.concat(
    [test_frame[test_frame['label'] == label].head(5) for label in (0, 1)]
)
df

Unnamed: 0,sms,label
0,Yup... Hey then one day on fri we can ask miwa and jiayin take leave go karaoke \n,0
1,Happy new years melody!\n,0
2,Think I could stop by in like an hour or so? My roommate's looking to stock up for a trip\n,0
3,I can make lasagna for you... vodka...\n,0
4,No rushing. I'm not working. I'm in school so if we rush we go hungry.\n,0
22,PRIVATE! Your 2003 Account Statement for shows 800 un-redeemed S. I. M. points. Call 08715203652 Identifier Code: 42810 Expires 29/10/0\n,1
31,URGENT! We are trying to contact U. Todays draw shows that you have won a £800 prize GUARANTEED. Call 09050003091 from land line. Claim C52. Valid 12hrs only\n,1
48,"I want some cock! My hubby's away, I need a real man 2 satisfy me. Txt WIFE to 89938 for no strings action. (Txt STOP 2 end, txt rec £1.50ea. OTBox 731 LA1 7WS. )\n",1
49,Your unique user ID is 1172. For removal send STOP to 87239 customer services 08708034412\n,1
54,Double your mins & txts on Orange or 1/2 price linerental - Motorola and SonyEricsson with B/Tooth FREE-Nokia FREE Call MobileUpd8 on 08000839402 or2optout/HV9D\n,1
