In [None]:
# Mount Google Drive into the Colab runtime; the dataset directory and the
# model output directory used later in this notebook both live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install datasets
!pip install pandas
!pip install transformers

In [None]:
from torchvision import transforms

# Random augmentation pipeline for training images. The steps are listed
# individually and then composed; Compose runs them in list order.
_augmentation_steps = [
    # Random crop that is resized to 224 (the input size the ViT
    # feature extractor in this notebook is configured for).
    transforms.RandomResizedCrop(224),
    # Random mirroring along both axes.
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    # Random photometric perturbation.
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    # Random rotation (argument 30), followed by a rotation-free affine
    # step that only translates (0.1, 0.1) and rescales (0.8 to 1.2).
    transforms.RandomRotation(30),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.8, 1.2)),
]
transform = transforms.Compose(_augmentation_steps)

In [None]:
import pandas as pd
from datasets import load_dataset, Image, Dataset, DatasetDict, concatenate_datasets

# Load the image-folder dataset from Drive, carve a validation split out of
# "train", and double the training set with one augmented copy of each image.
data_path = "/content/drive/MyDrive/Seminar2/data/ribe_512x768"
dataset = load_dataset("imagefolder", data_dir=data_path)

# Fixed seed so the train/val partition is the same on every
# Restart & Run All. Without it each run trains and validates on a different
# set of images, so metrics from separate runs cannot be compared.
SPLIT_SEED = 42
splits = dataset["train"].train_test_split(test_size=0.33, seed=SPLIT_SEED)
dataset["train"] = splits["train"]
dataset["val"] = splits["test"]

# Augment only the training portion, and only after the split, so no augmented
# copy of a validation image ends up in the training set.
# NOTE(review): the augmentation itself is still unseeded random; seed torch /
# random before this cell if bit-identical augmented images are required.
transformed_train_dataset = dataset["train"].map(
    lambda example: {"image": transform(example["image"]), "label": example["label"]}
)

# Training set = augmented copies + untouched originals.
dataset["train"] = concatenate_datasets([transformed_train_dataset, dataset["train"]])

# Bare expression so the notebook displays the resulting splits and row counts.
dataset

Resolving data files:   0%|          | 0/1332 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1434 [00:00<?, ?it/s]

Downloading and preparing dataset imagefolder/default to /root/.cache/huggingface/datasets/imagefolder/default-4eb74ab91a7bc454/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/1332 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Downloading data files:   0%|          | 0/1434 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /root/.cache/huggingface/datasets/imagefolder/default-4eb74ab91a7bc454/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1784
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1434
    })
    val: Dataset({
        features: ['image', 'label'],
        num_rows: 440
    })
})

In [None]:
from transformers import ViTFeatureExtractor

# Checkpoint identifier; it is used here for the image preprocessor and again
# further down when the classification model is loaded.
model_name_or_path = 'google/vit-base-patch16-224-in21k'
# Load the preprocessing configuration published with that checkpoint.
# NOTE(review): newer transformers releases appear to deprecate
# ViTFeatureExtractor in favour of ViTImageProcessor — confirm against the
# installed version before upgrading.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)
# Bare expression so the notebook displays the loaded configuration.
feature_extractor

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



ViTFeatureExtractor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTFeatureExtractor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [None]:
def process_example(example):
    """Preprocess a single dataset example for the model.

    Passes the example's image through the module-level ``feature_extractor``
    (requesting PyTorch tensors) and attaches the example's ``label`` to the
    result under the key ``labels``.

    Args:
        example: Mapping with an ``image`` entry and a ``label`` entry.

    Returns:
        The feature extractor's output with an added ``labels`` entry.
    """
    encoded = feature_extractor(example['image'], return_tensors='pt')
    encoded['labels'] = example['label']
    return encoded

# Smoke-test the preprocessing on the first training example and display it.
process_example(dataset['train'][0])

{'pixel_values': tensor([[[[ 0.1216,  0.1294,  0.1137,  ...,  0.1137,  0.1059,  0.1059],
          [ 0.1216,  0.1216,  0.1373,  ...,  0.1059,  0.1059,  0.1059],
          [ 0.1294,  0.1216,  0.1294,  ...,  0.1059,  0.1059,  0.1059],
          ...,
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]],

         [[ 0.1137,  0.1294,  0.1059,  ...,  0.0902,  0.0902,  0.0824],
          [ 0.1137,  0.1059,  0.1216,  ...,  0.0980,  0.0902,  0.0902],
          [ 0.1216,  0.1137,  0.1294,  ...,  0.0902,  0.0902,  0.0902],
          ...,
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]],

         [[-0.0902, -0.0902, -0.0902,  ..., -0.0667, -0.0824, -0.0745],
          [-0

In [None]:
def preprocess_batch(example_batch):
    """Preprocess a batch of examples on the fly.

    Intended as the callable given to ``with_transform``: it receives a batch
    in column form, runs every image through the module-level
    ``feature_extractor`` (requesting PyTorch tensors) and carries the labels
    over under the key ``labels``.

    Args:
        example_batch: Mapping with an ``image`` column (iterable of images)
            and a ``label`` column.

    Returns:
        The feature extractor's output with an added ``labels`` entry.
    """
    inputs = feature_extractor(list(example_batch['image']), return_tensors='pt')
    inputs['labels'] = example_batch['label']
    return inputs

# Deliberately NOT named `transform`: that global is the torchvision
# augmentation pipeline the earlier `.map(...)` lambda looks up at call time.
# Rebinding it here would make re-running the augmentation cell call this
# batch function on a single image instead of augmenting it.
dataset = dataset.with_transform(preprocess_batch)

# Show two transformed training examples as a sanity check.
dataset['train'][0:2]

{'pixel_values': tensor([[[[ 0.1216,  0.1294,  0.1137,  ...,  0.1137,  0.1059,  0.1059],
          [ 0.1216,  0.1216,  0.1373,  ...,  0.1059,  0.1059,  0.1059],
          [ 0.1294,  0.1216,  0.1294,  ...,  0.1059,  0.1059,  0.1059],
          ...,
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]],

         [[ 0.1137,  0.1294,  0.1059,  ...,  0.0902,  0.0902,  0.0824],
          [ 0.1137,  0.1059,  0.1216,  ...,  0.0980,  0.0902,  0.0902],
          [ 0.1216,  0.1137,  0.1294,  ...,  0.0902,  0.0902,  0.0902],
          ...,
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]],

         [[-0.0902, -0.0902, -0.0902,  ..., -0.0667, -0.0824, -0.0745],
          [-0

In [None]:
import torch

def collate_fn(batch):
    """Combine per-example dicts into a single batch dict.

    Args:
        batch: Sequence of mappings, each with a ``pixel_values`` tensor and
            a ``labels`` value.

    Returns:
        Dict with ``pixel_values`` (the per-example tensors stacked along a
        new leading dimension) and ``labels`` (a tensor built from the
        per-example labels, in batch order).
    """
    stacked_pixels = torch.stack([item['pixel_values'] for item in batch])
    label_tensor = torch.tensor([item['labels'] for item in batch])
    return {'pixel_values': stacked_pixels, 'labels': label_tensor}

In [None]:
from transformers.trainer_utils import EvalPrediction
import numpy as np
from datasets import load_metric
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def compute_metrics(eval_pred: EvalPrediction):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: Evaluation result whose ``predictions`` hold per-class
            scores (argmax is taken over axis 1) and whose ``label_ids`` hold
            the reference class ids.

    Returns:
        Dict with ``acc`` plus weighted-average ``f1``, ``precision`` and
        ``recall``.
    """
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=1)

    # zero_division=0 makes the "class was never predicted" case explicit.
    # The score is the same 0 that sklearn's default falls back to, but the
    # undefined-metric warning no longer floods the training log at every
    # evaluation step.
    averaging = {"average": "weighted", "zero_division": 0}

    return {
        "acc": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, **averaging),
        "precision": precision_score(y_true, y_pred, **averaging),
        "recall": recall_score(y_true, y_pred, **averaging),
    }

In [None]:
from transformers import ViTForImageClassification, ViTConfig

# Class names as exposed by the training split's `label` feature.
labels = dataset['train'].features['label'].names

# Build the classifier from the pretrained checkpoint with a fresh head sized
# to our label set.
#
# Dropout is passed to `from_pretrained` so that it is part of the config the
# model is *constructed* from. Assigning `model.config.hidden_dropout_prob`
# after construction only edits the config object: the dropout layers have
# already been created with the checkpoint's value, so the model would keep
# training with the original dropout rate.
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)},
    hidden_dropout_prob=0.3,
)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Output and reporting ------------------------------------------
    output_dir="/content/drive/MyDrive/Seminar2/model",
    report_to='tensorboard',
    push_to_hub=False,
    save_total_limit=2,

    # --- Batch sizes and precision -------------------------------------
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,

    # --- Optimisation schedule -----------------------------------------
    num_train_epochs=6,
    learning_rate=2e-4,
    weight_decay=0.1,
    warmup_steps=500,

    # --- Evaluate / checkpoint / log on the same 60-step cadence -------
    evaluation_strategy="steps",
    eval_steps=60,
    save_steps=60,
    logging_steps=60,
    load_best_model_at_end=True,

    # Presumably needed so the raw `image` column survives for the
    # on-the-fly preprocessing transform — confirm before changing.
    remove_unused_columns=False,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Data: fit on the (augmented) train split, validate on the held-out
    # split carved from it.
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    # Batching and scoring hooks defined earlier in the notebook.
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    # The image preprocessor is handed over via the `tokenizer` slot;
    # presumably so it is stored with the checkpoints — confirm.
    tokenizer=feature_extractor,
)

In [None]:
# Run fine-tuning; the returned object carries the run's summary metrics.
train_results = trainer.train()
# Persist the trained model (no path given, so presumably the Trainer's
# configured output directory is used — confirm).
trainer.save_model()
# Print and then write the training metrics under the "train" name.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
# Persist the Trainer's own state alongside the model.
trainer.save_state()

Step,Training Loss,Validation Loss,Acc,F1,Precision,Recall
60,1.036,0.853011,0.765909,0.676731,0.617119,0.765909
120,0.6656,0.398192,0.875,0.872503,0.872161,0.875
180,0.4445,0.318145,0.852273,0.829341,0.891466,0.852273
240,0.3347,0.15562,0.961364,0.960709,0.962355,0.961364
300,0.2717,0.3074,0.884091,0.871446,0.909681,0.884091
360,0.3465,0.149119,0.954545,0.955743,0.96097,0.954545
420,0.352,0.284853,0.888636,0.892538,0.922824,0.888636
480,0.2835,0.271237,0.927273,0.928741,0.944904,0.927273
540,0.3146,0.105811,0.968182,0.968182,0.968182,0.968182
600,0.2072,0.229536,0.893182,0.895342,0.923991,0.893182


  _warn_prf(average, modifier, msg_start, len(result))


***** train metrics *****
  epoch                    =         6.0
  total_flos               = 772515024GF
  train_loss               =      0.2258
  train_runtime            =  0:31:50.98
  train_samples_per_second =       5.601
  train_steps_per_second   =         0.7


In [None]:
# Score the trained model on the separate test split (not the validation
# split used during training).
metrics = trainer.evaluate(dataset['test'])
# NOTE(review): these are test-split results but are logged and saved under
# the "eval" name; consider a distinct name if validation metrics are ever
# written to the same location — confirm desired file naming.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        6.0
  eval_acc                =     0.5202
  eval_f1                 =      0.522
  eval_loss               =     2.2759
  eval_precision          =     0.6287
  eval_recall             =     0.5202
  eval_runtime            = 0:05:39.08
  eval_samples_per_second =      4.229
  eval_steps_per_second   =      0.531
