Tuesday, November 7, 2023


This notebook was created from the code provided [here](
https://huggingface.co/docs/peft/task_guides/image_classification_lora)

It's kinda surprising there is no notebook for this example, and we have to create it here!

Most of this works ... just some stuff at the end is not yet working ... meh for now!

In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
# only target the 4090 ...
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

Install Dependencies

In [2]:
import transformers
import accelerate
import peft

print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"PEFT version: {peft.__version__}")
"Transformers version: 4.27.4"
"Accelerate version: 0.18.0"
"PEFT version: 0.2.0"

Transformers version: 4.35.0.dev0
Accelerate version: 0.24.0
PEFT version: 0.6.0


'PEFT version: 0.2.0'

Select a model checkpoint to fine-tune

In [3]:
model_checkpoint = "google/vit-base-patch16-224-in21k"

Load the image processor of the model we are fine-tuning

In [4]:
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(model_checkpoint)

Load the training dataset

In [5]:
from datasets import load_dataset

dataset = load_dataset("food101", split="train[:5000]")

# Download and split ..
# 72m 0.3s

Dataset preparation

In [6]:
labels = dataset.features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

id2label[2]
"baklava"

'baklava'

In [7]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
train_transforms = Compose(
    [
        RandomResizedCrop(image_processor.size["height"]),
        RandomHorizontalFlip(),
        ToTensor(),
        normalize,
    ]
)

val_transforms = Compose(
    [
        Resize(image_processor.size["height"]),
        CenterCrop(image_processor.size["height"]),
        ToTensor(),
        normalize,
    ]
)


def preprocess_train(example_batch):
    """Apply train_transforms across a batch."""
    example_batch["pixel_values"] = [train_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch


def preprocess_val(example_batch):
    """Apply val_transforms across a batch."""
    example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch

Split the dataset into training and validation sets

In [8]:
splits = dataset.train_test_split(test_size=0.1)
train_ds = splits["train"]
val_ds = splits["test"]

Finally, set the transformation functions for the dataset accordingly

In [9]:
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

Load and prepare a model

In [10]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [11]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

# download and load
# 5m 9.8s 

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print_trainable_parameters(model)
"trainable params: 85876325 || all params: 85876325 || trainable%: 100.00"

trainable params: 85876325 || all params: 85876325 || trainable%: 100.00


'trainable params: 85876325 || all params: 85876325 || trainable%: 100.00'

In [13]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)
"trainable params: 667493 || all params: 86466149 || trainable%: 0.77"

trainable params: 667493 || all params: 86543818 || trainable%: 0.77


'trainable params: 667493 || all params: 86466149 || trainable%: 0.77'

Defining training arguments

In [14]:
from transformers import TrainingArguments, Trainer

model_name = model_checkpoint.split("/")[-1]
batch_size = 128

args = TrainingArguments(
    f"{model_name}-finetuned-lora-food101",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-3,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    num_train_epochs=5,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    label_names=["labels"],
)

Prepare evaluation metric

In [15]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

Define collation function

In [16]:
import torch

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

Train and evaluate

In [17]:
trainer = Trainer(
    lora_model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)
# train_results = trainer.train()

CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.


In [18]:
%%time
train_results = trainer.train()

# CPU times: user 7min 27s, sys: 10.4 s, total: 7min 38s
# Wall time: 2min 20s

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.671748,0.852
2,2.190100,0.234365,0.934
3,0.361300,0.188864,0.946
4,0.202500,0.182238,0.946
5,0.178100,0.180137,0.942


CPU times: user 7min 27s, sys: 10.4 s, total: 7min 38s
Wall time: 2min 20s


In [19]:
trainer.evaluate(val_ds)
{
    "eval_loss": 0.14475855231285095,
    "eval_accuracy": 0.96,
    "eval_runtime": 3.5725,
    "eval_samples_per_second": 139.958,
    "eval_steps_per_second": 1.12,
    "epoch": 5.0,
}

{'eval_loss': 0.14475855231285095,
 'eval_accuracy': 0.96,
 'eval_runtime': 3.5725,
 'eval_samples_per_second': 139.958,
 'eval_steps_per_second': 1.12,
 'epoch': 5.0}

In [21]:
repo_name = f"robkayinto/{model_name}-finetuned-lora-food101"
lora_model.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/robkayinto/vit-base-patch16-224-in21k-finetuned-lora-food101/commit/c8c4af8adf436c2b573bdc3150813e9e2e215b9b', commit_message='Upload model', commit_description='', oid='c8c4af8adf436c2b573bdc3150813e9e2e215b9b', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
from peft import PeftConfig, PeftModel

config = PeftConfig.from_pretrained(repo_name)
model = AutoModelForImageClassification.from_pretrained(
    config.base_model_name_or_path,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)
# Load the LoRA model
inference_model = PeftModel.from_pretrained(model, repo_name)

Downloading (…)/adapter_config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading adapter_model.bin:   0%|          | 0.00/2.69M [00:00<?, ?B/s]

In [23]:
from PIL import Image
import requests

url = "https://huggingface.co/datasets/robkayinto/sample-datasets/resolve/main/beignets.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
image

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fed12d074f0>

In [24]:
image_processor = AutoImageProcessor.from_pretrained(repo_name)

Downloading (…)rocessor_config.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

In [25]:
encoding = image_processor(image.convert("RGB"), return_tensors="pt")

NameError: name 'image' is not defined

In [26]:
with torch.no_grad():
    outputs = inference_model(**encoding)
    logits = outputs.logits

predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", inference_model.config.id2label[predicted_class_idx])
"Predicted class: beignets"

NameError: name 'encoding' is not defined