From https://huggingface.co/docs/transformers/tasks/image_classification

In [1]:
from datasets import load_dataset 
from transformers import AutoImageProcessor
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
from transformers import DefaultDataCollator
import evaluate
import numpy as np
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
from huggingface_hub import notebook_login
from transformers import pipeline

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
food = load_dataset("food101", split="train[:5000]")
food = food.train_test_split(test_size=0.2)

Found cached dataset food101 (/Users/skelley/.cache/huggingface/datasets/food101/default/0.0.0/7cebe41a80fb2da3f08fcbef769c8874073a86346f7fb96dc0847d4dfc318295)


In [4]:
labels = food["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [5]:
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [6]:
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

food = food.with_transform(transforms)

In [7]:
data_collator = DefaultDataCollator()
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
training_args = TrainingArguments(
    output_dir="my_awesome_food_model",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=food["train"],
    eval_dataset=food["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/Users/skelley/Library/CloudStor

In [8]:
trainer.train()
trainer.push_to_hub()

***** Running training *****
  Num examples = 4000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 186
  Number of trainable parameters = 85876325
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mstephenskelley[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
0,2.6646,2.486608,0.845
1,1.8188,1.729151,0.898
2,1.5637,1.553507,0.919


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to my_awesome_food_model/checkpoint-62
Configuration saved in my_awesome_food_model/checkpoint-62/config.json
Model weights saved in my_awesome_food_model/checkpoint-62/pytorch_model.bin
Image processor saved in my_awesome_food_model/checkpoint-62/preprocessor_config.json
Image processor saved in my_awesome_food_model/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to my_awesome_food_model/checkpoint-124
Configuration saved in my_awesome_food_model/checkpoint-124/config.json
Model weights saved in my_awesome_food_model/checkpoint-124/pytorch_model.bin
Image processor saved in my_awesome_food_model/checkpoint-124/preprocessor_config.json
Image processor saved in my_awesome_food_model/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to my_awesome_food_mo

Upload file pytorch_model.bin:   0%|          | 32.0k/328M [00:00<?, ?B/s]

Upload file runs/Feb17_10-02-56_TFGLESOS5000843/events.out.tfevents.1676649780.TFGLESOS5000843.78030.0: 100%|#…

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/StephenSKelley/my_awesome_food_model
   633a7fe..6e60bcc  main -> main

To https://huggingface.co/StephenSKelley/my_awesome_food_model
   6e60bcc..ec7cc12  main -> main



'https://huggingface.co/StephenSKelley/my_awesome_food_model/commit/6e60bccc8822293f6f3410bfd545ce4de6db5cac'

In [9]:
classifier = pipeline("image-classification", model="StephenSKelley/my_awesome_food_model")

loading configuration file config.json from cache at /Users/skelley/.cache/huggingface/hub/models--StephenSKelley--my_awesome_food_model/snapshots/ec7cc12a0b6d55c79cb996cf430841739ed44c17/config.json
Model config ViTConfig {
  "_name_or_path": "StephenSKelley/my_awesome_food_model",
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "apple_pie",
    "1": "baby_back_ribs",
    "2": "baklava",
    "3": "beef_carpaccio",
    "4": "beef_tartare",
    "5": "beet_salad",
    "6": "beignets",
    "7": "bibimbap",
    "8": "bread_pudding",
    "9": "breakfast_burrito",
    "10": "bruschetta",
    "11": "caesar_salad",
    "12": "cannoli",
    "13": "caprese_salad",
    "14": "carrot_cake",
    "15": "ceviche",
    "16": "cheesecake",
    "17": "cheese_plate",
    "18": "chicken_curry",
    "19": "chicken_quesadilla",
    "20"

Downloading:   0%|          | 0.00/344M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /Users/skelley/.cache/huggingface/hub/models--StephenSKelley--my_awesome_food_model/snapshots/ec7cc12a0b6d55c79cb996cf430841739ed44c17/pytorch_model.bin
All model checkpoint weights were used when initializing ViTForImageClassification.

All the weights of ViTForImageClassification were initialized from the model checkpoint at StephenSKelley/my_awesome_food_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ViTForImageClassification for predictions without further training.
loading configuration file preprocessor_config.json from cache at /Users/skelley/.cache/huggingface/hub/models--StephenSKelley--my_awesome_food_model/snapshots/ec7cc12a0b6d55c79cb996cf430841739ed44c17/preprocessor_config.json
loading configuration file config.json from cache at /Users/skelley/.cache/huggingface/hub/models--StephenSKelley--my_awesome_food_model/snapshots/ec7cc12a0b6d55c79cb996cf430841739ed44c1

In [10]:
ds = load_dataset("food101", split="train")[1500:1600]
predictions = classifier(ds['image'])
for i in range(len(predictions)):
    #display(ds['image'][i])
    print(id2label[str(ds['label'][i])], predictions[i][np.argmax([x['score'] for x in predictions[i]])])

Found cached dataset food101 (/Users/skelley/.cache/huggingface/datasets/food101/default/0.0.0/7cebe41a80fb2da3f08fcbef769c8874073a86346f7fb96dc0847d4dfc318295)
Disabling tokenizer parallelism, we're using DataLoader multithreading already


ramen {'score': 0.30178412795066833, 'label': 'ramen'}
ramen {'score': 0.26783517003059387, 'label': 'ramen'}
ramen {'score': 0.15125738084316254, 'label': 'bruschetta'}
ramen {'score': 0.29982098937034607, 'label': 'ramen'}
ramen {'score': 0.2653557062149048, 'label': 'ramen'}
ramen {'score': 0.2995688021183014, 'label': 'ramen'}
ramen {'score': 0.2448892891407013, 'label': 'ramen'}
ramen {'score': 0.2860872745513916, 'label': 'ramen'}
ramen {'score': 0.28748175501823425, 'label': 'ramen'}
ramen {'score': 0.29536956548690796, 'label': 'ramen'}
ramen {'score': 0.3025054335594177, 'label': 'ramen'}
ramen {'score': 0.256673127412796, 'label': 'ramen'}
ramen {'score': 0.2904338240623474, 'label': 'ramen'}
ramen {'score': 0.30083608627319336, 'label': 'ramen'}
ramen {'score': 0.28116124868392944, 'label': 'ramen'}
ramen {'score': 0.29854515194892883, 'label': 'ramen'}
ramen {'score': 0.29309603571891785, 'label': 'ramen'}
ramen {'score': 0.19301405549049377, 'label': 'ramen'}
ramen {'score