# Install Packages


In [None]:
# Install the Hugging Face libraries used in this notebook; accelerate is pinned to 0.21.0 here.
! pip install datasets transformers transformers[torch] accelerate==0.21.0

In [None]:
# Upgrade transformers and accelerate to their latest releases.
# NOTE(review): this upgrade replaces the accelerate==0.21.0 pin from the previous cell — confirm which version is intended.
!pip install --upgrade transformers
!pip install --upgrade accelerate

# Import Dataset
We are going to fine-tune our model using "gjuggler/bird-data", a dataset hosted on the Hugging Face Hub.
1. import the dataset
2. create train and test sets

In [None]:
# 1. import the dataset

import datasets
from datasets import load_dataset

# 2. Create Training Set
# Fetch the "train" split of the bird dataset from the Hugging Face Hub.
# NOTE(review): newer `datasets` releases replace `ignore_verifications` with
# `verification_mode` — confirm against the installed version.
dataset_train = load_dataset(
    path="gjuggler/bird-data",
    ignore_verifications=False,
    split="train",
)



Downloading builder script:   0%|          | 0.00/9.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.04G [00:00<?, ?B/s]

Computing checksums:  50%|#####     | 1/2 [00:30<00:30, 30.55s/it]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/23912 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/24615 [00:00<?, ? examples/s]

In [None]:
# 2. Create Testing Set
# Fetch the "test" split of the same bird dataset.
# NOTE(review): newer `datasets` releases replace `ignore_verifications` with
# `verification_mode` — confirm against the installed version.
dataset_test = load_dataset(
    path="gjuggler/bird-data",
    ignore_verifications=False,
    split="test",
)



### Subset Data To Test Processes

We will create subsets of these to use during the model building and training process to make sure everything runs smoothly.

In [None]:
# 3. Subset our data to test out the model and training function.
# Keep at most the first SUBSET_SIZE rows of each split so a full train/eval
# cycle stays short while the pipeline is being validated.
SUBSET_SIZE = 9999

# min() keeps the requested index range inside the split, so this cell also
# works on a split that holds fewer than SUBSET_SIZE rows.
train_subset = dataset_train.select(range(min(SUBSET_SIZE, len(dataset_train))))
test_subset = dataset_test.select(range(min(SUBSET_SIZE, len(dataset_test))))

# The rest of the notebook reads the subsets through these two names.
dataset_training = train_subset
dataset_test = test_subset
print(dataset_training)
print(dataset_test)

Dataset({
    features: ['image_file_path', 'image', 'labels'],
    num_rows: 9999
})
Dataset({
    features: ['image_file_path', 'image', 'labels'],
    num_rows: 9999
})


# Pull an example image from our dataset

In [None]:
# Grab one record (index 400) from the training subset and display it.
example = dataset_training[400]
example

{'image_file_path': '/root/.cache/huggingface/datasets/downloads/extracted/0308feaebff84764ee0bc41fb3987ef39a113637d312df1e0a05174ef40ea8f0/American Goldfinch /0823aa7a15e94ad7b25164274efb7acc.jpg',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x872>,
 'labels': 8}

Since `labels` feature of this dataset is a `datasets.features.ClassLabel`, we can use it to look up the corresponding name for this example's label ID. First let's access the feature definition for the `labels`.

In [None]:
# Feature definition of the `labels` column; used in the next cell to map a label id to its name.
labels = dataset_training.features['labels']
labels

Now let's print out the class label for our example. We can do that with the `int2str` function of `ClassLabel`, which, as the name implies, lets us pass the integer representation of the class to look up the string label.

In [None]:
# Translate the example's integer label id into its class name.
labels.int2str(example['labels'])

'American Goldfinch'

# Loading the Image Processor

Now that we've gained insights into the appearance of our images and have a clearer understanding of the problem at hand, let's delve into the process of preparing these images for integration into our model. During the training of ViT models, specific transformations are applied to the input images. Applying inappropriate transformations can lead to the model misinterpreting the visual data. To ensure accurate transformations, we'll utilize the `ViTImageProcessor`, which is initialized with a configuration saved alongside the pretrained model we intend to employ. For our current scenario, we'll opt for the `'google/vit-base-patch16-224'` model and proceed by loading its image processor from the Hugging Face Hub.

In [None]:
from transformers import ViTImageProcessor

# Checkpoint identifier; reused later when the base model is loaded.
model_name_or_path = 'google/vit-base-patch16-224'
# NOTE(review): `convert_to_rgb=True` is forwarded as an extra keyword argument — confirm the installed ViTImageProcessor recognises it.
image_processor = ViTImageProcessor.from_pretrained(model_name_or_path, convert_to_rgb=True)


Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [None]:
# Print the image processor to inspect what was loaded.
print(image_processor)

Next, we'll proceed with the image processing step by submitting the image through the image processor's call function. This action will yield a dictionary comprising `pixel_values`, which signifies the numerical representation destined for the model input. To obtain the results as torch tensors, simply include the `return_tensors='pt'` argument.

In [None]:
# Observe image tensor after it passes through image_processor
# (return_tensors='pt' asks for torch tensors in the returned dictionary).
image = example['image']
image_processor(image, return_tensors='pt')

In [None]:
# Process an example image by using a function

def process_example(example):
  """Run a single dataset record through the image processor.

  Args:
    example: one dataset row; its 'image' and 'labels' entries are read.

  Returns:
    The image processor's output (requested as torch tensors) with the
    row's label stored under the 'labels' key.
  """
  picture = example['image']
  encoded = image_processor(picture, return_tensors='pt')
  encoded['labels'] = example['labels']
  return encoded

# Try the helper on the first training record.
process_example(dataset_training[0])

Revise the final function to accommodate a batch of data, enabling us to utilize `.with_transform` to apply transformations to our complete dataset.

In [None]:
import os
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset

In [None]:
def transform(example_batch):
    """Convert a batch of dataset rows into model-ready inputs.

    Args:
        example_batch: a batch from the dataset, with an 'image' entry
            (a list of PIL images) and a 'labels' entry (the matching
            label ids).

    Returns:
        The image processor's output (requested as torch tensors) with the
        batch's labels stored under the 'labels' key.
    """
    # Normalise every picture to 3-channel RGB before processing, so that
    # grayscale, palette or RGBA files do not reach the processor with a
    # channel count other than three.
    rgb_images = [img.convert('RGB') for img in example_batch['image']]

    # list of PIL images to pixel values
    inputs = image_processor(rgb_images, return_tensors='pt')

    # Including the labels
    inputs['labels'] = example_batch['labels']
    return inputs


In [None]:
# Create variables that point to the prepared training and testing sets.
  # "Prepared" means `transform` has been attached via with_transform, so rows
  # read from these objects come back transformed and ready for training.

prepared_train = dataset_training.with_transform(transform)
prepared_test = dataset_test.with_transform(transform)

In [None]:
# Inspect one transformed record (index 400) from the prepared test set.
prepared_test[400]

{'pixel_values': tensor([[[-0.4118, -0.3569, -0.2941,  ..., -0.0667, -0.0745, -0.1059],
          [-0.3804, -0.3333, -0.1294,  ..., -0.0745, -0.1216, -0.2157],
          [-0.3412, -0.2157,  0.2157,  ...,  0.0431, -0.0980, -0.1843],
          ...,
          [-0.1765, -0.1373, -0.1529,  ...,  0.6471,  0.7961,  0.7804],
          [-0.1608, -0.1529, -0.2471,  ...,  0.2471,  0.6863,  0.7882],
          [-0.1686, -0.1686, -0.2706,  ..., -0.0510,  0.4039,  0.6314]],
 
         [[-0.4824, -0.4275, -0.3647,  ..., -0.0431, -0.0510, -0.0824],
          [-0.4667, -0.4118, -0.1922,  ..., -0.0588, -0.0980, -0.1922],
          [-0.4431, -0.2941,  0.1686,  ...,  0.0588, -0.0824, -0.1608],
          ...,
          [-0.1294, -0.0824, -0.0980,  ...,  0.6314,  0.7961,  0.8039],
          [-0.1137, -0.0980, -0.1843,  ...,  0.2392,  0.6941,  0.8196],
          [-0.1137, -0.1059, -0.2078,  ..., -0.0510,  0.4118,  0.6627]],
 
         [[-0.5608, -0.5294, -0.5686,  ..., -0.1765, -0.2000, -0.2471],
          [-

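Because we indexed a single example, the resulting `pixel_values` tensor has shape (3, 224, 224). Slicing a batch of two examples instead (e.g. `prepared_test[0:2]`) would give shape (2, 3, 224, 224).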

# Set up Push to Hub API


In [None]:
# Install git-lfs through pip and log in to the Hugging Face Hub from the shell.
# NOTE(review): the following cell also logs in via notebook_login(); one of the two login steps is probably redundant — confirm.
! pip install git-lfs
! huggingface-cli login

In [None]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Training and Evaluation

Before we can effectively use the Hugging Face `Trainer`, there are several crucial tasks to address:

1. **Define a Collate Function:** This function's purpose is to organize and assemble the data appropriately.
2. **Establish an Evaluation Metric:** For training, the model's predictive accuracy needs to be evaluated. It's essential to create a `compute_metrics` function aligned with this requirement.
3. **Load the Base Model:** Load the base model that we will build upon via fine-tuning.
4. **Set Up Training Configuration:** Define the training configuration to guide the model's fine-tuning process.

Upon successfully fine-tuning the model, the next step involves evaluating its performance on the evaluation dataset. This process will confirm whether the model has successfully learned to accurately classify the images.


## 1. Define the Collator

Given that batches arrive in the form of lists of dictionaries, our approach involves unpacking and stacking these into batch tensors. Since the `collate_fn` returns a batch dictionary, we can conveniently use `**unpacking` for input to the model at a later stage.

In [None]:
# 1. Define a Collate Function

import torch

def collate_fn(batch):
    """Assemble a list of per-example dicts into one batch dict.

    Args:
        batch: list of dicts, each with a 'pixel_values' tensor and a
            'labels' value.

    Returns:
        Dict with 'pixel_values' (the per-example tensors stacked along a
        new leading dimension) and 'labels' (a tensor built from the
        per-example labels).
    """
    pixel_tensors = []
    label_ids = []
    for item in batch:
        pixel_tensors.append(item['pixel_values'])
        label_ids.append(item['labels'])

    stacked_pixels = torch.stack(pixel_tensors)
    label_tensor = torch.tensor(label_ids)
    return {'pixel_values': stacked_pixels, 'labels': label_tensor}


## 2. Define an Evaluation Metric

The accuracy metric from the datasets library can be seamlessly employed to compare predictions against labels. Below, you'll find an example of how to incorporate it into a `compute_metrics` function, which will subsequently be utilized by the Trainer.

In [None]:
# 2. Establish an Evaluation Metric

import numpy as np
from datasets import load_metric

# Accuracy metric object, loaded once and reused by compute_metrics.
# NOTE(review): recent `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package — confirm against the installed version.
metric = load_metric("accuracy")


def compute_metrics(p):
    """Score a prediction object with the accuracy metric.

    Args:
        p: object exposing `predictions` (per-class scores; the argmax is
            taken over axis 1) and `label_ids` (the reference labels).

    Returns:
        Whatever `metric.compute` returns for the predicted class ids
        against the reference labels.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return metric.compute(references=p.label_ids, predictions=predicted_ids)


## 3. Load the Base Model

Now, we'll proceed with loading the pretrained model. During initialization, we'll specify num_labels to ensure the model generates a classification head with the correct number of units. Additionally, we'll incorporate the `id2label` and `label2id` mappings. These mappings serve to provide human-readable labels within the Hub widget, offering enhanced clarity if you decide to push the model to the Hub.

In [None]:
# 3. Load the Base Model

from transformers import ViTForImageClassification

# Class names in label-id order; their count sizes the classification head.
labels = dataset_training.features['labels'].names

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    # Human-readable mappings between label ids (as strings) and class names.
    id2label=dict((str(idx), name) for idx, name in enumerate(labels)),
    label2id=dict((name, str(idx)) for idx, name in enumerate(labels)),
    # The checkpoint's classifier has a different number of outputs than
    # len(labels); allow the mismatched head to be freshly initialised.
    ignore_mismatched_sizes=True,
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([405, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([405]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Set Up Training and Saving Configuration

Define the training configuration to guide the model's fine-tuning process. This includes the training arguments to accommodate the trainer.

In [None]:
# Training Arguments

from transformers import TrainingArguments

training_args = TrainingArguments(
  # Local directory for checkpoints and logs. This is a filesystem path, not a
  # URL; when pushing to the Hub the repository name defaults to this
  # directory's name, so "avg_models" keeps the same repo name as before.
  output_dir = "avg_models",
  per_device_train_batch_size=16,
  # Evaluate and checkpoint on the same step schedule (every 50 steps), which
  # load_best_model_at_end needs in order to pick the best checkpoint.
  evaluation_strategy="steps",
  save_strategy = "steps",
  num_train_epochs=1,
  fp16=True,
  save_steps=50,
  eval_steps=50,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=100, # save 100 checkpoints if we need to recover the models state in case of interruptions
  # Keep the 'image' column: the with_transform function reads it on the fly.
  remove_unused_columns=False,
  push_to_hub=True,
  report_to='tensorboard',
  load_best_model_at_end=True,
)

Pass all instances to `trainer` defined below.

In [None]:
# Define Trainer function

from transformers import Trainer

# Wire the model, configuration, data and helpers together.
trainer = Trainer(
    args=training_args,
    model=model,
    # Datasets with the on-the-fly image transform attached.
    train_dataset=prepared_train,
    eval_dataset=prepared_test,
    # Batch assembly and accuracy scoring defined earlier in the notebook.
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    # The image processor is passed in the tokenizer slot.
    tokenizer=image_processor,
)


Push to hub criteria:

In [None]:
# Metadata passed along to the model card / Hub upload.
kwargs = dict(
    finetuned_from=model.config._name_or_path,
    tasks="image-classification",
    dataset='gjuggler/bird-data',
    tags=['image-classification'],
)

# NOTE(review): this cell sits before the trainer.train() cell, so running the
# notebook top to bottom uploads the model before it has been fine-tuned —
# confirm whether it should run after training instead.
if not training_args.push_to_hub:
    # Hub upload disabled: only write the model card.
    trainer.create_model_card(**kwargs)
else:
    trainer.push_to_hub('🍻 cheers', **kwargs)

# Train the Model

In [None]:
from PIL import ImageFile
# Allow PIL to load image files that are truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Run fine-tuning, then persist the model, the training metrics and the
# trainer state, in that order.
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()


Step,Training Loss,Validation Loss,Accuracy
50,3.8407,3.838331,0.326933
100,2.3382,2.42801,0.543754
150,1.982,1.816732,0.621662
200,1.4364,1.466297,0.669467
250,1.2607,1.227208,0.720172
300,1.1154,1.100597,0.738774
350,0.7948,0.979722,0.762776
400,0.821,0.882488,0.768477
450,0.7193,0.849153,0.779378
500,0.7381,0.777512,0.79828


# Evaluation

In [None]:
# Evaluate on the prepared test set, then log and save the resulting metrics
# under the "eval" name.
metrics = trainer.evaluate(prepared_test)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)