Environment setup.

In [None]:
#%pip install datasets
#%pip install transformers

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
colab_data_path = "/content/drive/MyDrive/Seminar2/data/ribe_512x768/"
colab_dir = "/content/drive/MyDrive/Seminar2/model/"
model_name_or_path = "google/vit-base-patch16-224-in21k"

Imports.

In [4]:
import numpy as np
import torch
from torchvision import transforms
from datasets import load_dataset, concatenate_datasets
from transformers import (
    ViTForImageClassification,
    ViTFeatureExtractor,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import EvalPrediction
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

Prepare dataset splits.

In [None]:
dataset = load_dataset("imagefolder", data_dir=colab_data_path)
splits = dataset["train"].train_test_split(test_size=0.33)
dataset["train"] = splits["train"]
dataset["val"] = splits["test"]

Define image augmentations.

In [6]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    transforms.RandomRotation(30),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.8, 1.2)),
])

Apply the transforms to the train set. Concatenate the transformed and original train set.

In [7]:
transformed_train_dataset = dataset["train"].map(
    lambda example: {"image": transform(example["image"]), "label": example["label"]}
)

dataset["train"] = concatenate_datasets([transformed_train_dataset, dataset["train"]])

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Free unused variables.

In [15]:
del splits
del transformed_train_dataset

Configure the Feature Extractor and apply it to the dataset.

In [None]:
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

def transform(example_batch):
    inputs = feature_extractor([x for x in example_batch['image']], return_tensors='pt')
    inputs['labels'] = example_batch['label']
    return inputs

dataset = dataset.with_transform(transform)

Configure Vision Transformer model.

In [None]:
from transformers import ViTForImageClassification, ViTConfig

labels = dataset['train'].features['label'].names

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)},
)

model.config.hidden_dropout_prob = 0.5

Define auxiliary functions for the training procedure and configure training parameters.

In [10]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

def compute_metrics(eval_pred: EvalPrediction):
  preds = np.argmax(eval_pred.predictions, axis=1)
  return {
    "acc": accuracy_score(eval_pred.label_ids, preds),
    "f1": f1_score(eval_pred.label_ids, preds, average="weighted"),
    "precision": precision_score(eval_pred.label_ids, preds, average="weighted"),
    "recall": recall_score(eval_pred.label_ids, preds, average="weighted")
    }

In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="/content/drive/MyDrive/Seminar2/model",
  per_device_train_batch_size=32,
  per_device_eval_batch_size=32,
  evaluation_strategy="steps",
  num_train_epochs=6,
  fp16=True,
  save_steps=60,
  eval_steps=60,
  warmup_steps=500,
  logging_steps=60,
  learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
  weight_decay=0.01,
)

In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    tokenizer=feature_extractor,
)

In [13]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()



Step,Training Loss,Validation Loss,Acc,F1,Precision,Recall
60,0.9455,0.584896,0.775,0.701885,0.850066,0.775
120,0.371,0.258029,0.906818,0.908178,0.93094,0.906818
180,0.1695,0.105633,0.968182,0.968471,0.971067,0.968182
240,0.0812,0.082522,0.977273,0.977078,0.977998,0.977273
300,0.0631,0.094085,0.972727,0.97292,0.973454,0.972727


***** train metrics *****
  epoch                    =         6.0
  total_flos               = 772515024GF
  train_loss               =      0.2994
  train_runtime            =  0:07:01.75
  train_samples_per_second =       25.38
  train_steps_per_second   =       0.797


In [14]:
metrics = trainer.evaluate(dataset['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        6.0
  eval_acc                =     0.5007
  eval_f1                 =     0.5086
  eval_loss               =      1.451
  eval_precision          =     0.6585
  eval_recall             =     0.5007
  eval_runtime            = 0:06:45.48
  eval_samples_per_second =      3.537
  eval_steps_per_second   =      0.111
