Environment setup.

In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so that the paths under /content/drive used below resolve.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [7]:
# Identifier of the pretrained ViT checkpoint that is loaded with from_pretrained below.
model_name_or_path = "google/vit-base-patch16-224-in21k"

# Drive locations: the Trainer's output directory and the image-folder dataset root.
colab_dir = "/content/drive/MyDrive/Seminar2/model/"
colab_data_path = "/content/drive/MyDrive/Seminar2/data/ribe_512x768/"

Imports.

In [8]:
import numpy as np
import torch
from torchvision import transforms
from datasets import load_dataset, concatenate_datasets
from transformers import (
    ViTForImageClassification,
    ViTFeatureExtractor,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import EvalPrediction
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

Prepare dataset splits.

In [9]:
# Load the image-folder dataset from Drive.
dataset = load_dataset("imagefolder", data_dir=colab_data_path)

# Hold out 33% of the original "train" split for validation.
# The fixed seed makes the shuffle reproducible, so train/val membership (and
# therefore the reported metrics) is the same after Restart & Run All.
splits = dataset["train"].train_test_split(test_size=0.33, seed=42)
dataset["train"] = splits["train"]
dataset["val"] = splits["test"]

Resolving data files:   0%|          | 0/1332 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1434 [00:00<?, ?it/s]

Downloading and preparing dataset imagefolder/default to /root/.cache/huggingface/datasets/imagefolder/default-6fb45572b7bba4d8/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/1332 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Downloading data files:   0%|          | 0/1434 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /root/.cache/huggingface/datasets/imagefolder/default-6fb45572b7bba4d8/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Define image augmentations.

In [10]:
# Random augmentations, applied in this order: colour jitter, rotation, then
# translation/scaling via an affine transform.
augmentation_steps = [
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    transforms.RandomRotation(30),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.8, 1.2)),
]
transform = transforms.Compose(augmentation_steps)

Apply the augmentations to the training set, then concatenate the augmented copy with the original training set.

In [11]:
def _augment_example(example):
    """Return the example with its image run through the augmentation pipeline; the label is kept."""
    augmented_image = transform(example["image"])
    return {"image": augmented_image, "label": example["label"]}


# One augmented copy of every training example.
transformed_train_dataset = dataset["train"].map(_augment_example)

# The training split becomes: augmented examples followed by the originals.
dataset["train"] = concatenate_datasets([transformed_train_dataset, dataset["train"]])

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Configure the Feature Extractor and apply it to the dataset.

In [12]:
# Load the preprocessing configuration of the pretrained checkpoint.
# do_resize=False keeps the images at their stored resolution.
# NOTE(review): patch_size looks like a model-config option rather than a
# feature-extractor option; presumably it has no effect here - confirm.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    model_name_or_path,
    do_resize=False,
    patch_size=64,
)

# Override the normalisation statistics loaded from the checkpoint.
feature_extractor.image_mean = [0.485, 0.456, 0.406]
feature_extractor.image_std = [0.229, 0.224, 0.225]


def preprocess_batch(example_batch):
    """Convert a batch of images into model inputs.

    Named `preprocess_batch` rather than `transform` so that it does not
    overwrite the augmentation pipeline bound to `transform` in an earlier
    cell (which would break that cell on re-run).

    Args:
        example_batch: mapping with an "image" list and a "label" list.

    Returns:
        The feature extractor output (PyTorch tensors) with the labels
        attached under the "labels" key.
    """
    rgb_images = [image.convert("RGB") for image in example_batch["image"]]
    inputs = feature_extractor(rgb_images, return_tensors="pt")
    inputs["labels"] = example_batch["label"]
    return inputs

# Preprocessing is applied on the fly whenever examples are accessed.
dataset = dataset.with_transform(preprocess_batch)

# Bidirectional mapping between class indices and class names.
label_names = dataset["train"].features["label"].names
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



Configure Vision Transformer model.

In [13]:
# Set interpolate_pos_encoding=True in the source code.
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

model.config.hidden_dropout_prob = 0.5

Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define auxiliary functions for the training procedure and configure training parameters.

In [14]:
def collate_fn(batch):
    """Assemble a list of preprocessed examples into one batch.

    Args:
        batch: list of dicts, each with a 'pixel_values' tensor and a 'labels' value.

    Returns:
        Dict with the stacked 'pixel_values' tensor and a 'labels' tensor.
    """
    pixel_tensors = []
    label_values = []
    for sample in batch:
        pixel_tensors.append(sample['pixel_values'])
        label_values.append(sample['labels'])

    collated = {}
    collated['pixel_values'] = torch.stack(pixel_tensors)
    collated['labels'] = torch.tensor(label_values)
    return collated

def compute_metrics(eval_pred: EvalPrediction):
    """Compute classification metrics for the Trainer.

    Args:
        eval_pred: predictions (per-class scores, arg-maxed over axis 1) and
            the reference label ids.

    Returns:
        Dict with accuracy and support-weighted F1, precision and recall.
    """
    labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    # zero_division=0 spells out the value scikit-learn already uses when a
    # class is never predicted; it only silences the undefined-metric warning.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "acc": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, **weighted),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
    }

In [15]:
# Hyper-parameters, grouped by concern and then expanded into TrainingArguments.
training_kwargs = dict(
    # Where checkpoints and logs go, and how many checkpoints to keep.
    output_dir=colab_dir,
    save_total_limit=2,
    push_to_hub=False,
    report_to='tensorboard',
    # Optimisation.
    num_train_epochs=20,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate, log and checkpoint on the same 60-step cadence.
    evaluation_strategy="steps",
    eval_steps=60,
    logging_steps=60,
    save_steps=60,
    load_best_model_at_end=True,
    # Keep the "image" column: the on-the-fly dataset transform reads it.
    remove_unused_columns=False,
)
training_args = TrainingArguments(**training_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

Train the model.

In [16]:
# Run fine-tuning, then persist the model, the training metrics and the trainer state.
train_results = trainer.train()
train_metrics = train_results.metrics

trainer.save_model()
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("train", train_metrics)
trainer.save_state()



Step,Training Loss,Validation Loss,Acc,F1,Precision,Recall
60,0.9391,0.701933,0.752273,0.661351,0.599782,0.752273
120,0.5239,0.457691,0.802273,0.799802,0.887009,0.802273
180,0.3432,0.162899,0.956818,0.957097,0.958143,0.956818
240,0.2952,0.378483,0.913636,0.913146,0.912774,0.913636
300,0.2747,0.352719,0.868182,0.853599,0.900441,0.868182
360,0.2272,0.226261,0.886364,0.876498,0.904998,0.886364
420,0.3389,0.191211,0.915909,0.913552,0.919351,0.915909
480,0.2788,0.5181,0.802273,0.751164,0.865703,0.802273
540,0.2196,0.155198,0.936364,0.937792,0.94838,0.936364
600,0.1724,0.111165,0.965909,0.965636,0.966187,0.965909


  _warn_prf(average, modifier, msg_start, len(result))


***** train metrics *****
  epoch                    =          20.0
  total_flos               = 20179984305GF
  train_loss               =        0.0693
  train_runtime            =    1:00:18.24
  train_samples_per_second =         9.861
  train_steps_per_second   =         1.233


Evaluate the model on the held-out test split.

In [17]:
# Score the trained model on the "test" split, then print and save the metrics.
test_split = dataset["test"]
metrics = trainer.evaluate(eval_dataset=test_split)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =       20.0
  eval_acc                =     0.5621
  eval_f1                 =     0.5706
  eval_loss               =      1.891
  eval_precision          =     0.7395
  eval_recall             =     0.5621
  eval_runtime            = 0:05:17.64
  eval_samples_per_second =      4.514
  eval_steps_per_second   =      0.567
