# Fine-tune a BERT classifier for sentiment classification
Based on the Hugging Face fine-tuning tutorial: https://huggingface.co/docs/transformers/training

# Development environment


In [1]:
! pip install transformers[torch]
! pip install datasets
! pip install evaluate
! pip install scikit-learn
! pip install wandb==0.16.2 




In [2]:
import warnings
warnings.filterwarnings("ignore")

import transformers
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
import wandb
import time

import numpy as np
import evaluate


2024-01-12 15:01:52.415995: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-12 15:01:52.465164: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Login to Weights and Biases


In [3]:
# Authenticate with Weights & Biases so the run started below can be logged
wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33moliviamoveon[0m ([33molivia-liu[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Start a W&B run and register the hyperparameters with it, so that every
# later cell reads its settings from `config` instead of hard-coding them.
wandb.init(
    # Project under which this run will be logged
    project="sutd-mlops-project",
    # Explicit run name (otherwise one is randomly assigned, like sunshine-lollypop-10)
    name="experiment_session3_run_1",
    # Hyperparameters and run metadata tracked with the run
    config={
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "num_train_epochs": 2,
        "train_subsample_size": 1000,
        "architecture": "distilbert",
        "dataset_name": "rotten_tomatoes",
        "model_name": "distilbert-base-uncased",
    },
)
# Handle used by the rest of the notebook to look up the settings above
config = wandb.config

# Prepare data


In [5]:
# Load the dataset named in the run config
dataset = load_dataset(config.dataset_name)
# Show the first training example as a sanity check on the record format
dataset["train"][0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [6]:
# Tokenizer matching the pretrained checkpoint chosen in the run config
tokenizer = AutoTokenizer.from_pretrained(config.model_name)


def tokenize_batch(examples):
    """Tokenize the "text" field of a batch, padding to max length and truncating."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Apply the tokenizer to the dataset in batches
tokenized_datasets = dataset.map(tokenize_batch, batched=True)

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [7]:
def _take_shuffled(split_name, n_examples):
    """Shuffle one tokenized split with a fixed seed and keep its first `n_examples` rows."""
    shuffled = tokenized_datasets[split_name].shuffle(seed=42)
    return shuffled.select(range(n_examples))


# Small subsets keep training and evaluation fast
small_train_dataset = _take_shuffled("train", config.train_subsample_size)
small_eval_dataset = _take_shuffled("validation", 100)
small_test_dataset = _take_shuffled("test", 100)

# Train the model


In [8]:
# Number of classes = number of distinct labels in the training split
train_labels = dataset["train"]["label"]
num_labels = len(np.unique(train_labels))

# Pretrained checkpoint with a sequence-classification head sized for `num_labels`
model = AutoModelForSequenceClassification.from_pretrained(
    config.model_name,
    num_labels=num_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Accuracy metric from the `evaluate` library; compute_metrics calls it during evaluation
metric = evaluate.load("accuracy")

In [10]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level `metric`.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    # Predicted class = index of the largest score along the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

In [11]:
# Training configuration; learning rate, weight decay and epoch count come
# from the W&B run config so they are recorded together with the results.
training_args = TrainingArguments(
    # NOTE(review): output is written to the current working directory — confirm this is intended
    output_dir=".",
    # Report logs to Weights & Biases
    report_to="wandb",
    # Run evaluation once per epoch
    evaluation_strategy="epoch",
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    num_train_epochs=config.num_train_epochs,
    # Log training metrics every 20 steps
    logging_steps=20)

In [12]:
# Bundle the model, training configuration, data subsamples and metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    # Validation subsample, evaluated according to the strategy in training_args
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
# Fine-tune the model on the training subsample
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4473,0.431691,0.78
2,0.2842,0.448995,0.77


TrainOutput(global_step=250, training_loss=0.4262775707244873, metrics={'train_runtime': 89.3885, 'train_samples_per_second': 22.374, 'train_steps_per_second': 2.797, 'total_flos': 264934797312000.0, 'train_loss': 0.4262775707244873, 'epoch': 2.0})

# Test the model


In [14]:
# Loss and accuracy on the training subsample. These are the examples the model
# was fit on, so the result is an optimistic estimate.
# NOTE(review): the returned keys use the default "eval_" prefix, so these numbers
# appear to be logged to W&B under the same names as the validation metrics —
# consider passing metric_key_prefix to keep the charts separate.
trainer.evaluate(small_train_dataset)

{'eval_loss': 0.19412970542907715,
 'eval_accuracy': 0.932,
 'eval_runtime': 15.1854,
 'eval_samples_per_second': 65.853,
 'eval_steps_per_second': 8.232,
 'epoch': 2.0}

In [15]:
# Loss and accuracy on the 100-example validation subsample
trainer.evaluate(small_eval_dataset)

{'eval_loss': 0.4489947557449341,
 'eval_accuracy': 0.77,
 'eval_runtime': 1.5288,
 'eval_samples_per_second': 65.411,
 'eval_steps_per_second': 8.503,
 'epoch': 2.0}

In [16]:
# Loss and accuracy on the 100-example test subsample
trainer.evaluate(small_test_dataset)


{'eval_loss': 0.5381070971488953,
 'eval_accuracy': 0.77,
 'eval_runtime': 1.5318,
 'eval_samples_per_second': 65.282,
 'eval_steps_per_second': 8.487,
 'epoch': 2.0}

In [17]:
# accuracy of the whole test set - for fair comparison with the classification performance achieved by SGD in previous sessions
def predict(tokenized_test_data, trainer):
    """Predict class probabilities and labels for a tokenized dataset.

    Args:
        tokenized_test_data: Dataset handed unchanged to ``trainer.predict``.
        trainer: Object whose ``predict`` method returns a sequence with the
            raw logits as its first element; the logits are indexed along
            axis 1 as classes (one row per example).

    Returns:
        A tuple ``(pred_prob, pred)``: ``pred_prob`` holds the softmax
        probabilities (each row sums to 1) and ``pred`` the index of the most
        probable class for each row.
    """
    logits = np.asarray(trainer.predict(tokenized_test_data)[0])
    # Subtract the row-wise maximum before exponentiating: softmax is invariant
    # to this shift, and it keeps np.exp from overflowing to inf (which would
    # turn the probabilities into NaN) when logits are large.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    pred_prob = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
    pred = np.argmax(pred_prob, axis=1)
    return pred_prob, pred

# Score the complete test split rather than the 100-example subsample
pred_prob, pred = predict(tokenized_datasets["test"], trainer)

# Read the gold labels once and reuse them below
test_labels = dataset["test"]["label"]
n_correct = np.sum(pred == test_labels)
accuracy = n_correct / len(test_labels)
print(f"Accuracy: {accuracy}")

# Precision-recall plot for the two classes via the W&B scikit-learn integration
wandb.sklearn.plot_precision_recall(test_labels, pred_prob, ["negative", "positive"])



Accuracy: 0.8208255159474672


In [18]:
# Mark the W&B run as finished
wandb.finish()


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁█▁▁
eval/loss,▆▆▁▆█
eval/runtime,▁▁█▁▁
eval/samples_per_second,▁▂█▅▄
eval/steps_per_second,▆▇▁██
train/epoch,▁▂▂▃▃▄▄▅▅▆▆▇██████
train/global_step,▁▂▂▃▃▄▄▅▅▆▆▇███████
train/learning_rate,█▇▇▆▅▅▄▄▃▂▂▁
train/loss,██▆▅▅▄▃▂▃▂▁▂
train/total_flos,▁

0,1
eval/accuracy,0.77
eval/loss,0.53811
eval/runtime,1.5318
eval/samples_per_second,65.282
eval/steps_per_second,8.487
train/epoch,2.0
train/global_step,250.0
train/learning_rate,0.0
train/loss,0.2842
train/total_flos,264934797312000.0


# What to try next

- Train and evaluate with the complete training and test datasets instead of a subsample.
- Experiment with different training parameters (number of epochs, optimizer, batch size, learning rate schedule, ...).
- Compare DistilBERT with the full BERT model: https://huggingface.co/bert-base-uncased
- Compare the results with the scikit-learn model from the previous notebook. What is the cost-benefit trade-off between deep learning and traditional ML?
- Check out this more detailed sentiment analysis tutorial from Hugging Face: https://huggingface.co/blog/sentiment-analysis-python