In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, http

### Imports

In [2]:
import numpy as np
import torch
from datasets import load_dataset, load_metric
from transformers import (
    ViTForImageClassification,
    ViTFeatureExtractor,
    Trainer,
    TrainingArguments,
)

### train_vit_clf.py

In [15]:
def collate_fn(examples):
    """Merge a list of per-example dicts into a single batch dict.

    Args:
        examples: sequence of dicts, each holding a "pixel_values" tensor and
            a "labels" value. The tensors must share one shape, since they are
            combined with ``torch.stack``.

    Returns:
        dict with "pixel_values" (the stacked tensors, batch dimension first)
        and "labels" (a tensor built from the collected label values).
    """
    images, targets = [], []
    for sample in examples:
        images.append(sample["pixel_values"])
        targets.append(sample["labels"])

    batch = {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets),
    }
    return batch


def create_dataloaders_and_mappings(data_path, test_size=0.1, seed=42):
    """Load an image-folder dataset and derive the label mappings.

    Despite the name, no DataLoader objects are built: the function returns
    the loaded dataset with an additional "val" split carved out of "train".

    Args:
        data_path: directory passed to ``load_dataset("imagefolder", ...)``.
        test_size: share of the original "train" split moved into "val".
        seed: seed for the train/validation shuffle. A fixed default keeps the
            partition identical across runs so reported validation metrics are
            comparable; pass ``None`` to get a different random split each time.

    Returns:
        Tuple ``(dataset, id2label, label2id)`` where ``dataset`` has its
        "train" split replaced by the reduced training portion plus a new
        "val" split, ``id2label`` maps integer class ids to class names, and
        ``label2id`` is the inverse mapping.
    """
    dataset = load_dataset("imagefolder", data_dir=data_path)

    splits = dataset["train"].train_test_split(test_size=test_size, seed=seed)
    dataset["train"] = splits["train"]
    dataset["val"] = splits["test"]

    # Class ids follow the order of the names stored in the "label" feature.
    class_names = dataset["train"].features["label"].names
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}

    return dataset, id2label, label2id


# Metric objects are loaded once and reused: compute_metrics runs at every
# evaluation step, and reloading four metric scripts each time is wasted work.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the metric called ``name``, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load_metric(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            example is taken as the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are computed with ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    results = {
        "accuracy": _get_metric("accuracy").compute(
            predictions=predictions, references=labels
        )["accuracy"]
    }
    for name in ("precision", "recall", "f1"):
        results[name] = _get_metric(name).compute(
            predictions=predictions, references=labels, average="weighted"
        )[name]
    return results


def main(
    data_path="/content/drive/MyDrive/Seminar2/data/ribe_512x768",
    output_dir="/content/drive/MyDrive/Seminar2/model",
    model_id="google/vit-base-patch16-224",
    mount_drive=True,
):
    """Fine-tune a ViT image classifier and report held-out metrics.

    Steps: optionally mount Google Drive, load the image-folder dataset,
    preprocess images on the fly with the checkpoint's feature extractor,
    train with the Hugging Face ``Trainer``, save the model/metrics/state to
    ``output_dir``, then evaluate on the "test" split.

    Args:
        data_path: image-folder directory handed to
            ``create_dataloaders_and_mappings``.
        output_dir: directory for checkpoints, the final model and metrics.
        model_id: pretrained checkpoint used for both the feature extractor
            and the model weights.
        mount_drive: when True (the default), mount Google Drive at
            ``/content/drive`` first. Set to False outside Colab or when the
            paths do not live on Drive.
    """
    if mount_drive:
        # Imported lazily: google.colab is only importable inside a Colab runtime.
        from google.colab import drive
        drive.mount('/content/drive')

    dataset, id2label, label2id = create_dataloaders_and_mappings(data_path)

    feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)

    def transform(example_batch):
        """Turn a batch of images into model inputs and attach their labels."""
        inputs = feature_extractor(
            [x.convert("RGB") for x in example_batch["image"]], return_tensors="pt"
        )
        inputs["labels"] = example_batch["label"]
        return inputs

    # Applied lazily when examples are accessed, so images are not preprocessed up front.
    dataset = dataset.with_transform(transform)

    model = ViTForImageClassification.from_pretrained(
        pretrained_model_name_or_path=model_id,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
        # Lets the classification head be re-created when its size in the
        # checkpoint differs from num_labels.
        ignore_mismatched_sizes=True
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=16,
        evaluation_strategy="steps",
        num_train_epochs=4,
        # Mixed precision needs a CUDA device; fall back to full precision on CPU
        # instead of failing when the arguments are validated.
        fp16=torch.cuda.is_available(),
        save_steps=100,
        eval_steps=100,
        logging_steps=10,
        learning_rate=2e-4,
        save_total_limit=2,
        # transform() reads the raw "image"/"label" columns, so they must be kept.
        remove_unused_columns=False,
        push_to_hub=False,
        report_to="tensorboard",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        train_dataset=dataset["train"],
        eval_dataset=dataset["val"],
        tokenizer=feature_extractor,
    )

    train_results = trainer.train()
    trainer.save_model()
    trainer.log_metrics("train", train_results.metrics)
    trainer.save_metrics("train", train_results.metrics)
    trainer.save_state()

    # Final numbers come from the held-out "test" split when the data folder
    # provides one; otherwise reuse the validation split rather than raising a
    # KeyError after training has already finished.
    held_out = dataset["test"] if "test" in dataset else dataset["val"]
    metrics = trainer.evaluate(held_out)
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)


### Experiment

In [16]:
# Run the end-to-end pipeline defined in main(): mount Drive, fine-tune the
# ViT classifier, save the model and metrics, then evaluate on the test split.
main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Resolving data files:   0%|          | 0/1332 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1434 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--google--vit-base-patch16-224/snapshots/2ddc9d4e473d7ba52128f0df4723e478fa14fb80/preprocessor_config.json
size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'shortest_edge', 'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.
Image processor ViTFeatureExtractor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTFeatureExtractor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--vit-base-patch16-224/snapshots/2ddc9d4e473d7ba52128f0df4723e478fa14fb80/config.json
Model config ViTConfig {
  "_name_or_path": "google/vit

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.1221,0.108078,0.970149,0.972445,0.970149,0.970287
200,0.0166,0.00368,1.0,1.0,1.0,1.0
300,0.0001,0.001389,1.0,1.0,1.0,1.0


***** Running Evaluation *****
  Num examples = 134
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/Seminar2/model/checkpoint-100
Configuration saved in /content/drive/MyDrive/Seminar2/model/checkpoint-100/config.json
Model weights saved in /content/drive/MyDrive/Seminar2/model/checkpoint-100/pytorch_model.bin
Image processor saved in /content/drive/MyDrive/Seminar2/model/checkpoint-100/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 134
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/Seminar2/model/checkpoint-200
Configuration saved in /content/drive/MyDrive/Seminar2/model/checkpoint-200/config.json
Model weights saved in /content/drive/MyDrive/Seminar2/model/checkpoint-200/pytorch_model.bin
Image processor saved in /content/drive/MyDrive/Seminar2/model/checkpoint-200/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 134
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/Seminar2/mode

***** train metrics *****
  epoch                    =         4.0
  total_flos               = 345841927GF
  train_loss               =      0.1202
  train_runtime            =  0:03:23.15
  train_samples_per_second =      23.587
  train_steps_per_second   =       1.477


***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.4868
  eval_f1                 =     0.5001
  eval_loss               =     1.6194
  eval_precision          =     0.6431
  eval_recall             =     0.4868
  eval_runtime            = 0:10:45.31
  eval_samples_per_second =      2.222
  eval_steps_per_second   =      0.279
