In [None]:
!pip install torch
!pip install accelerate datasets transformers evaluate
import torch, transformers, datasets, accelerate, evaluate
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from accelerate.utils.memory import clear_device_cache
from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names

In [None]:
# Load the yelp_review_full dataset; no split is requested here, and the
# individual splits are picked out by name in later cells.
dataset = load_dataset(path="yelp_review_full")

In [None]:
# List the split names published for this dataset.
get_dataset_split_names(path="yelp_review_full")

In [None]:
# Show the feature schema (column names and types) of the 'train' split.
dataset["train"].features

In [None]:
# Show a single training example; index 17 is an arbitrary fixed pick, not a random draw.
dataset["train"][17]

In [None]:
# The tokenizer and the model must be loaded from the same checkpoint, so the
# checkpoint name lives in one place instead of being repeated per call.
MODEL_CHECKPOINT = "google-bert/bert-base-cased"
# Size of the classification head; it has to equal the number of label classes
# in the dataset (compare with dataset["train"].features).
NUM_LABELS = 5

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    torch_dtype="auto",
)

In [None]:
# Keep the experiment small: shuffle each split with a fixed seed and keep
# only the first 1000 rows of the shuffled result.
small_train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Over-long sequences are truncated. No padding is applied here: the
    ``DataCollatorWithPadding`` given to the ``Trainer`` pads each batch to the
    length of its own longest sequence, so padding every example to the full
    model length up front would only add wasted computation.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw review texts
            (this is what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Tokenize both subsets. batched=True passes groups of rows to
# tokenize_function in a single call instead of one row at a time.
small_train_dataset = small_train_dataset.map(
    tokenize_function,
    batched=True,
)
small_eval_dataset = small_eval_dataset.map(
    tokenize_function,
    batched=True,
)

In [None]:
# TrainingArguments is instantiated once, in the full configuration cell
# further down; the throwaway default instance that used to be created here
# was overwritten before anything read it.
from transformers import TrainingArguments, Trainer

In [None]:
# Accuracy metric object; compute_metrics calls its .compute() method.
metric = evaluate.load(path="accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Two-item sequence that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted class ids against
        the reference labels.
    """
    scores, gold_labels = eval_pred
    # The index of the largest score along the last axis is the predicted class.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=gold_labels)


In [None]:
training_args = TrainingArguments(
    # --- bookkeeping: outputs, logs, external trackers ---
    output_dir="test_trainer",
    logging_dir='.logs',
    logging_steps=10,
    report_to="none",  # do not send logs to any experiment tracker
    # --- schedule ---
    num_train_epochs=5,
    eval_strategy="epoch",  # run evaluation once per epoch
    warmup_ratio=0.1,  # fraction of training spent on learning-rate warmup
    # --- batching ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Gradients are accumulated over 2 batches per optimizer update, giving an
    # effective train batch of 16 * 2 = 32 examples per device.
    gradient_accumulation_steps=2,
    # --- optimizer ---
    learning_rate=2e-5,
    weight_decay=0.01,  # regularization to reduce overfitting
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
)

In [None]:
# Bundle the model, hyper-parameters, data subsets and metric into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    # Batches are assembled (and padded) by this collator using the tokenizer.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [None]:
# Start fine-tuning with the Trainer configured above; as the last expression
# in the cell, the call's return value is displayed as the cell output.
trainer.train()