In [11]:
# Notebook-wide configuration: both values are read by later cells.
model_checkpoint = "microsoft/dit-base-finetuned-rvlcdip" # Hub checkpoint the feature extractor and model are loaded from for fine-tuning
batch_size = 8 # per-device batch size, used for both training and evaluation

In [12]:
# Install the Hugging Face libraries imported below.
# NOTE(review): versions are not pinned here; a later cell's output reports transformers 4.20.1.
!pip install -q datasets transformers

In [13]:
import os
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoFeatureExtractor
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# Load the preprocessing configuration that ships with the checkpoint; its
# `size`, `image_mean` and `image_std` are read later to build the torchvision transforms.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

loading feature extractor configuration file https://huggingface.co/microsoft/dit-base-finetuned-rvlcdip/resolve/main/preprocessor_config.json from cache at /root/.cache/huggingface/transformers/2f83561e7184c570aa927a7972ff46fdfff24eeff5b40333a293c6cfffd4229f.7b08f544be969a001f15135755ab3b4dcc0a17d87797d9caf511edde58666ff7
Feature extractor BeitFeatureExtractor {
  "crop_size": 224,
  "do_center_crop": false,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "BeitFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "reduce_labels": false,
  "resample": 2,
  "size": 224
}



In [14]:
# Accuracy metric object consumed by compute_metrics.
metric = load_metric("accuracy")

# Two-class label vocabulary; the id -> name map is derived from the
# name -> id map so the two can never disagree.
label2id = {"bad": 0, "good": 1}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

# Load the checkpoint with the label maps above.
# ignore_mismatched_sizes: provide this in case you're planning to
# fine-tune an already fine-tuned checkpoint.
model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

loading configuration file https://huggingface.co/microsoft/dit-base-finetuned-rvlcdip/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/51c751d4f3ab3a23fe10dd045e461c9a63dd167ad5f2fc917abf93e814b7f475.79a5769161edf018126472bf9810c4ff7bc3ff456d85fe4555aedc56836fe27e
Model config BeitConfig {
  "_name_or_path": "microsoft/dit-base-finetuned-rvlcdip",
  "architectures": [
    "BeitForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auxiliary_channels": 256,
  "auxiliary_concat_input": false,
  "auxiliary_loss_weight": 0.4,
  "auxiliary_num_convs": 1,
  "drop_path_rate": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "bad",
    "1": "good"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "bad": 0,
    "good": 1
  },
  "layer_norm_eps": 1e-12,
  "layer_scale_init_value": 0.1,
  "model_type": "beit",
  "num_attention_heads": 12,
  "n

In [15]:
# NOTE(review): this cell repeats the preceding cell; running it loads the
# checkpoint again and rebinds `metric`, `label2id`, `id2label` and `model`.
metric = load_metric("accuracy")

# Class names in id order; both lookup tables are built from this one list.
id2label = dict(enumerate(["bad", "good"]))
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

# ignore_mismatched_sizes: provide this in case you're planning to
# fine-tune an already fine-tuned checkpoint.
model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    ignore_mismatched_sizes=True,
    label2id=label2id,
    id2label=id2label,
)

loading configuration file https://huggingface.co/microsoft/dit-base-finetuned-rvlcdip/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/51c751d4f3ab3a23fe10dd045e461c9a63dd167ad5f2fc917abf93e814b7f475.79a5769161edf018126472bf9810c4ff7bc3ff456d85fe4555aedc56836fe27e
Model config BeitConfig {
  "_name_or_path": "microsoft/dit-base-finetuned-rvlcdip",
  "architectures": [
    "BeitForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auxiliary_channels": 256,
  "auxiliary_concat_input": false,
  "auxiliary_loss_weight": 0.4,
  "auxiliary_num_convs": 1,
  "drop_path_rate": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "bad",
    "1": "good"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "bad": 0,
    "good": 1
  },
  "layer_norm_eps": 1e-12,
  "layer_scale_init_value": 0.1,
  "model_type": "beit",
  "num_attention_heads": 12,
  "n

In [6]:
# Table of image file names with their target class and k-fold assignment.
# Other locations noted by the author:
#   /content/drive/MyDrive/image_quality/train_folder/train_folds_aug.csv
#   /content/image_quality_data/
df = pd.read_csv(
    filepath_or_buffer=r"/content/image_quality_data/train_folds_aug.csv",
)
df.head()

Unnamed: 0,file_name,target,kfold
0,bad_32_3.jpg,bad,0
1,bad_12_0.jpg,bad,0
2,good_2_0.jpg,good,0
3,good_57_1.jpg,good,0
4,bad_59_3.jpg,bad,0


In [16]:
# Normalisation statistics and the target size are taken from the
# checkpoint's feature extractor.
normalize = Normalize(
    mean=feature_extractor.image_mean,
    std=feature_extractor.image_std,
)

# Training pipeline: random resized crop, then tensor conversion and
# normalisation. Resize and RandomHorizontalFlip are currently disabled here.
train_transforms = Compose([
    RandomResizedCrop(feature_extractor.size),
    ToTensor(),
    normalize,
])

# Validation pipeline: resize followed by a centre crop, then tensor
# conversion and normalisation.
val_transforms = Compose([
    Resize(feature_extractor.size),
    CenterCrop(feature_extractor.size),
    ToTensor(),
    normalize,
])

def preprocess_train(example_batch):
    """Add training-time tensors to a batch.

    Every entry of ``example_batch["image"]`` is converted to RGB and passed
    through ``train_transforms``; the results are stored under
    ``"pixel_values"`` and the same batch object is returned.
    """
    pixel_values = []
    for image in example_batch["image"]:
        rgb_image = image.convert("RGB")
        pixel_values.append(train_transforms(rgb_image))
    example_batch["pixel_values"] = pixel_values
    return example_batch

def preprocess_val(example_batch):
    """Add validation-time tensors to a batch.

    Every entry of ``example_batch["image"]`` is converted to RGB and passed
    through ``val_transforms``; the results are stored under
    ``"pixel_values"`` and the same batch object is returned.
    """
    def _to_tensor(image):
        # Convert to three channels before applying the validation pipeline.
        return val_transforms(image.convert("RGB"))

    example_batch["pixel_values"] = list(map(_to_tensor, example_batch["image"]))
    return example_batch

In [17]:
def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level accuracy metric.

    ``eval_pred.predictions`` holds per-class scores; the highest-scoring
    class along axis 1 is taken as the prediction and compared against
    ``eval_pred.label_ids``. Returns whatever ``metric.compute`` returns.
    """
    class_scores = eval_pred.predictions
    predicted_ids = np.argmax(class_scores, axis=1)
    return metric.compute(
        references=eval_pred.label_ids,
        predictions=predicted_ids,
    )

def collate_fn(examples):
    """Assemble a list of dataset examples into one batch dictionary.

    Each example must provide a ``"pixel_values"`` tensor and a ``"label"``
    value. The tensors are stacked along a new leading dimension and the
    labels are gathered into a single tensor.

    Returns:
        dict with keys ``"pixel_values"`` and ``"labels"``.
    """
    images, targets = [], []
    for sample in examples:
        images.append(sample["pixel_values"])
        targets.append(sample["label"])
    return {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets),
    }

In [19]:
# Last path component of the checkpoint id, used to name output directories.
model_name = model_checkpoint.rsplit("/", 1)[-1]

def get_args(fold):
  """Build the TrainingArguments for one cross-validation fold.

  The fold number is only used to give each run its own output directory;
  every other setting is the same for all folds.
  """
  output_dir = f"{model_name}-finetuned-image_quality-{fold}"
  settings = dict(
      # Presumably needed so the "image" column survives for the on-the-fly
      # preprocessing functions -- TODO confirm.
      remove_unused_columns=False,
      # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
      evaluation_strategy="epoch",
      save_strategy="epoch",
      save_total_limit=2,
      # Optimisation schedule.
      learning_rate=1e-5,
      warmup_ratio=0.1,
      num_train_epochs=20,
      # Batch sizes come from the notebook-level `batch_size`.
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      gradient_accumulation_steps=4,
      logging_steps=10,
      # Reload the checkpoint with the best accuracy once training finishes.
      load_best_model_at_end=True,
      metric_for_best_model="accuracy",
      push_to_hub=False,
  )
  return TrainingArguments(output_dir, **settings)

In [None]:
# Cross-validation training: one independent fine-tuning run per fold.
# Images are expected at <image_folder_path>/<target>/<file_name>.
image_folder_path = "/content/image_quality_data"

for fold in range(2):
  print("Fold-------------------------{}-------------------".format(fold))

  # Rows whose `kfold` equals the current fold are held out for evaluation;
  # all remaining rows are used for training.
  train_df = df[df["kfold"] != fold].reset_index(drop=True)
  val_df = df[df["kfold"] == fold].reset_index(drop=True)
  args = get_args(fold)

  train_img_list = [
      os.path.join(image_folder_path, target, file_name)
      for target, file_name in zip(train_df["target"], train_df["file_name"])
  ]
  test_img_list = [
      os.path.join(image_folder_path, target, file_name)
      for target, file_name in zip(val_df["target"], val_df["file_name"])
  ]
  # NOTE(review): the "label" column read by collate_fn is presumably inferred
  # by the imagefolder loader from each file's parent directory -- confirm.
  dataset = load_dataset(
      "imagefolder",
      data_files={"train": train_img_list, "test": test_img_list},
  )

  train_ds = dataset["train"]
  val_ds = dataset["test"]

  # Transforms are applied lazily, each time examples are accessed.
  train_ds.set_transform(preprocess_train)
  val_ds.set_transform(preprocess_val)

  # Start every fold from the pretrained checkpoint. Reusing the model trained
  # on a previous fold would mean it has already seen this fold's validation
  # images, which inflates the reported accuracy.
  model = AutoModelForImageClassification.from_pretrained(
      model_checkpoint,
      label2id=label2id,
      id2label=id2label,
      ignore_mismatched_sizes=True,
  )

  trainer = Trainer(
      model,
      args,
      train_dataset=train_ds,
      eval_dataset=val_ds,
      tokenizer=feature_extractor,
      compute_metrics=compute_metrics,
      data_collator=collate_fn,
  )
  train_results = trainer.train()

  # Persist the fold's final model. exist_ok=True makes a re-run overwrite the
  # previous export instead of silently skipping the save, and any genuine
  # filesystem error now surfaces instead of being swallowed.
  model_dir = os.path.join(image_folder_path, "model_" + str(fold))
  os.makedirs(model_dir, exist_ok=True)
  trainer.save_model(output_dir=model_dir)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Fold-------------------------0-------------------


Resolving data files:   0%|          | 0/624 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/156 [00:00<?, ?it/s]

Using custom data configuration default-bf63d48d8b91a526
Reusing dataset imagefolder (/root/.cache/huggingface/datasets/imagefolder/default-bf63d48d8b91a526/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597)


  0%|          | 0/2 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 624
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 380


Epoch,Training Loss,Validation Loss,Accuracy
0,0.6923,0.688715,0.525641
1,0.6735,0.657269,0.602564


***** Running Evaluation *****
  Num examples = 156
  Batch size = 8
Saving model checkpoint to dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-19
Configuration saved in dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-19/config.json
Model weights saved in dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-19/pytorch_model.bin
Feature extractor saved in dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-19/preprocessor_config.json
Deleting older checkpoint [dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-171] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 156
  Batch size = 8
Saving model checkpoint to dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-38
Configuration saved in dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-38/config.json
Model weights saved in dit-base-finetuned-rvlcdip-finetuned-image_quality-0/checkpoint-38/pytorch_model.bin
Feature ex

In [23]:
import os

In [24]:
# Show the kernel's current working directory (relative Trainer output paths resolve against it).
os.getcwd()

'/content'

In [25]:
# Report the GPU, driver/CUDA version and current memory usage of this runtime.
!nvidia-smi

Wed Jul 20 07:01:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    33W /  70W |   4824MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [26]:
import transformers
import torch

In [27]:
# Display the installed transformers version (useful for reproducing this run).
transformers.__version__


'4.20.1'

In [28]:
# Display the installed torch version (useful for reproducing this run).
torch.__version__

'1.12.0+cu113'

In [63]:
import os
import shutil

# Delete the Trainer output directory (checkpoints) produced for fold 0.
# The existence check makes the cell safe to re-run: once the directory is
# gone this is a no-op instead of raising FileNotFoundError.
fold0_output_dir = "/content/dit-base-finetuned-rvlcdip-finetuned-image_quality-0"
if os.path.isdir(fold0_output_dir):
    shutil.rmtree(fold0_output_dir)