In [None]:
%%capture
!pip install transformers numpy datasets evaluate

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import set_seed
from datasets import load_dataset, concatenate_datasets

import numpy as np
import evaluate

In [None]:
# Seed the random number generators before the model is built: the classification
# head is newly initialized (it is not part of the checkpoint), so without a fixed
# seed every run would start from different weights
SEED = 42
set_seed(SEED)

# Checkpoint shared by the model and its tokenizer - the two must always match
MODEL_CHECKPOINT = "bert-base-cased"
# Number of target classes the classification head has to distinguish
NUM_LABELS = 5

# Load the pre-trained model and the tokenizer
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load the dataset
dataset = load_dataset("Yelp/yelp_review_full")

# Define the metrics that you want to evaluate on
# (recall, precision and F1 are computed together, accuracy is loaded separately)
metrics = evaluate.combine(["recall", "precision", "f1"])
accuracy = evaluate.load("accuracy")

# Metric callback that is handed to the Trainer and run during evaluations
def compute_metrics(eval_pred):
    """Turn raw model outputs into macro-averaged recall/precision/F1 plus accuracy.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``, in that order.

    Returns:
        dict: Results of the combined ``metrics`` merged with the result of ``accuracy``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted = np.argmax(scores, axis=-1)
    macro_scores = metrics.compute(predictions=predicted, references=references, average="macro")
    accuracy_score = accuracy.compute(predictions=predicted, references=references)
    return {**macro_scores, **accuracy_score}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Show the splits, column names and row counts of the loaded dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [None]:
# Build small, class-balanced splits for demonstration purposes:
# 5k train, 1k test and 500 validation samples (1,000 / 200 / 100 per class)

# Pool of the first 1,300 training samples of every class; all three splits are cut from it
per_class_pool = {
    label: dataset["train"].filter(lambda row: row["label"] == label).select(range(1_300))
    for label in range(5)
}

# Disjoint row ranges inside each per-class pool, one per split
split_ranges = {
    "train": range(1_000),
    "test": range(1_000, 1_200),
    "val": range(1_200, 1_300),
}

for split_name, row_range in split_ranges.items():
    per_class_parts = [per_class_pool[label].select(row_range) for label in range(5)]
    # Concatenation leaves the classes in blocks, hence the (seeded) shuffle
    dataset[split_name] = concatenate_datasets(per_class_parts).shuffle(seed=42)

del per_class_pool, per_class_parts

dataset

Filter:   0%|          | 0/650000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/650000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/650000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/650000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/650000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    val: Dataset({
        features: ['label', 'text'],
        num_rows: 500
    })
})

In [None]:
def char_counter(examples):
    """Add a ``char_count`` entry holding the number of characters of the text(s).

    Works for a single example (``examples["text"]`` is a string, as in a
    non-batched ``map``) as well as for a batch (``examples["text"]`` is a
    sequence of strings, as in ``map(..., batched=True)``).

    Args:
        examples: Mapping with a ``"text"`` entry; it is modified in place.

    Returns:
        The same mapping, extended by the ``"char_count"`` entry (an int for a
        single example, a list of ints for a batch).
    """
    text = examples["text"]
    if isinstance(text, str):
        examples["char_count"] = len(text)
    else:
        # For a batch, len(text) would be the batch size - count every text on its own
        examples["char_count"] = [len(single_text) for single_text in text]
    return examples

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tokenizer``.

    Truncation is enabled and ``padding="max_length"`` is requested.

    NOTE(review): per the tokenizer documentation, "max_length" appears to pad every
    sample to one fixed length - confirm this is intended, because it would cancel
    the benefit of ordering the samples by length.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Attach the character count to every sample ...
dataset = dataset.map(char_counter)

# ... and order every split by it, with the goal of needing fewer padding tokens
# NOTE(review): tokenize_function pads with padding="max_length" - verify that the
# sort still has an effect on the amount of padding
dataset = dataset.sort("char_count")

# Run the tokenizer over all samples, one batch at a time
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="trainer_output",
    # Run an evaluation at the end of every epoch
    eval_strategy="epoch",
    # Log the training loss once per epoch as well: the whole run has fewer optimizer
    # steps than the default step-based logging interval, so without this setting the
    # "Training Loss" column of the results table only shows "No log"
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    # Do not send logs to any experiment tracking integration
    report_to="none"
)

# Wire everything together: the model to fine-tune, the training configuration,
# the data to train on, the data to validate on and the metric callback
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["val"],
    train_dataset=dataset["train"],
)

In [None]:
# Train the model; with eval_strategy="epoch" the validation split passed to the
# Trainer is evaluated after every epoch
trainer.train()

Epoch,Training Loss,Validation Loss,Recall,Precision,F1,Accuracy
1,No log,0.976489,0.598,0.600138,0.592292,0.598
2,No log,0.912788,0.592,0.586555,0.587238,0.592


TrainOutput(global_step=314, training_loss=1.0851457923840566, metrics={'train_runtime': 959.447, 'train_samples_per_second': 10.423, 'train_steps_per_second': 0.327, 'total_flos': 2631181424640000.0, 'train_loss': 1.0851457923840566, 'epoch': 2.0})

In [None]:
# Evaluate the model on the "test" split
# Note: this is the 1,000-sample slice that was cut from the original training data
# when the dataset was sub-sampled, not the dataset's original test split
trainer.evaluate(dataset["test"])

{'eval_loss': 0.9409351944923401,
 'eval_recall': 0.5660000000000001,
 'eval_precision': 0.5558234874785752,
 'eval_f1': 0.5578858589876543,
 'eval_accuracy': 0.566,
 'eval_runtime': 32.7232,
 'eval_samples_per_second': 30.559,
 'eval_steps_per_second': 0.978,
 'epoch': 2.0}