In [1]:
from datasets import load_dataset
# Read the class-per-subfolder image directory and keep its single "train" split;
# the held-out set is carved out of it in a later cell.
ds = load_dataset("imagefolder", data_dir='./foodimg')['train']

Resolving data files:   0%|          | 0/9171 [00:00<?, ?it/s]

In [2]:
# Hold out 20% of the images for evaluation. A fixed seed keeps the partition
# (and therefore every reported metric) reproducible across kernel restarts.
data = ds.train_test_split(test_size=0.2, seed=42)

In [3]:
# Show the DatasetDict summary for the two splits produced above.
data

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 7335
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1834
    })
})

In [4]:
from datasets import load_metric
# Accuracy metric object; compute_metrics calls metric.compute(...) on it during evaluation.
# NOTE(review): this call emits a warning when run and `load_metric` appears to be
# deprecated in recent `datasets` releases — confirm and plan a migration.
metric = load_metric("accuracy")

  metric = load_metric("accuracy")


In [5]:
# Class names come from the dataset's 'label' feature; build both lookup
# directions (index -> name, name -> index) for the model configuration.
labels = data['train'].features['label'].names
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [6]:
from transformers import AutoImageProcessor

# Checkpoint identifier; used here for the image processor and again when the model is loaded.
model_name_or_path = 'microsoft/resnet-18'
# Image processor for the checkpoint. Its `size`, `image_mean` and `image_std`
# attributes are read below to configure the torchvision transforms.
feature_extractor = AutoImageProcessor.from_pretrained(model_name_or_path)

2024-07-24 15:35:53.285643: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-24 15:35:53.577615: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-24 15:35:54.340971: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/shagun/miniconda3/envs/tf/lib/
2024-07-24 15:35:54.341074: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugi

In [7]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor
)

# The image processor describes its expected input resolution either as an
# explicit {"height", "width"} pair or as a single "shortest_edge" length.
# Derive the sizes handed to the torchvision transforms below so the inputs
# match what the checkpoint's processor is configured for.
size = feature_extractor.size
if "height" in size:
    crop_size = (size["height"], size["width"])
    resize_size = (size["height"], size["width"])
elif "shortest_edge" in size:
    # A single int is passed through as-is to the resize and crop transforms.
    crop_size = resize_size = size["shortest_edge"]
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        f"Unsupported image processor size specification: {size!r}; "
        "expected 'height'/'width' or 'shortest_edge'."
    )

# Normalise with the channel statistics stored on the image processor.
normalize = Normalize(
    mean=feature_extractor.image_mean,
    std=feature_extractor.image_std,
)

# Training pipeline: random crop + random horizontal flip as augmentation,
# followed by tensor conversion and normalisation.
train_transforms = Compose([
    RandomResizedCrop(crop_size),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize,
])

# Evaluation pipeline: no random augmentation, just resize + centre crop,
# followed by the same tensor conversion and normalisation.
val_transforms = Compose([
    Resize(resize_size),
    CenterCrop(crop_size),
    ToTensor(),
    normalize,
])

def preprocess_train(example_batch):
    """Attach training-time tensors to a batch of examples.

    Args:
        example_batch: mapping whose "image" entry is an iterable of images
            exposing ``.convert``.

    Returns:
        The same mapping, with a "pixel_values" list holding the result of
        ``train_transforms`` applied to each image after conversion to RGB.
    """
    tensors = []
    for img in example_batch["image"]:
        rgb = img.convert("RGB")
        tensors.append(train_transforms(rgb))
    example_batch["pixel_values"] = tensors
    return example_batch

def preprocess_val(example_batch):
    """Attach evaluation-time tensors to a batch of examples.

    Args:
        example_batch: mapping whose "image" entry is an iterable of images
            exposing ``.convert``.

    Returns:
        The same mapping, with a "pixel_values" list holding the result of
        ``val_transforms`` applied to each image after conversion to RGB.
    """
    tensors = []
    for img in example_batch["image"]:
        rgb = img.convert("RGB")
        tensors.append(val_transforms(rgb))
    example_batch["pixel_values"] = tensors
    return example_batch

In [8]:
# Short aliases for the two splits. The held-out 'test' split is what gets
# passed to the Trainer as its evaluation dataset.
train_ds = data['train']
val_ds = data['test']

In [9]:
# Register the per-batch preprocessing functions: augmenting transforms for
# training, plain resize/crop for validation.
# NOTE(review): set_transform is understood to apply these lazily on access
# rather than rewriting the stored data — confirm against the datasets docs.
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

In [10]:
# Sanity check: fetch one training example to confirm 'pixel_values' is produced.
train_ds[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=751x532>,
 'label': 16,
 'pixel_values': tensor([[[-0.9705, -1.1760, -1.5357,  ..., -1.6898, -1.8782, -1.8782],
          [-0.7993, -1.1589, -1.2617,  ..., -1.6727, -1.5528, -1.5528],
          [-0.5082, -0.6281, -0.6794,  ..., -1.4843, -1.4329, -1.2788],
          ...,
          [-1.6727, -1.7240, -1.6727,  ...,  0.9646,  0.9132,  0.8789],
          [-1.5870, -1.6555, -1.6384,  ...,  0.9646,  0.9474,  0.9132],
          [-1.5699, -1.5699, -1.5870,  ...,  0.9474,  0.9474,  0.9474]],
 
         [[-1.1078, -1.2479, -1.5280,  ..., -1.4930, -1.6856, -1.7556],
          [-0.9678, -1.2654, -1.3179,  ..., -1.5455, -1.4405, -1.4755],
          [-0.7227, -0.7752, -0.7577,  ..., -1.5280, -1.4755, -1.3179],
          ...,
          [-1.5630, -1.6155, -1.5630,  ...,  0.8179,  0.7654,  0.7304],
          [-1.4755, -1.5630, -1.5455,  ...,  0.8179,  0.8004,  0.7654],
          [-1.4930, -1.4930, -1.5105,  ...,  0.8004,  0.8004,  0.8004]

In [11]:
from transformers import ResNetForImageClassification
# Reuses model_name_or_path ('microsoft/resnet-18'), set where the image processor is loaded.

model = ResNetForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),  # size the classification head to this dataset's label set
    id2label=id2label,
    label2id=label2id,
    # The checkpoint's classifier has a different output size than len(labels);
    # allow loading anyway so the mismatched head weights are newly initialised.
    ignore_mismatched_sizes = True
)

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-18 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([28]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 512]) in the checkpoint and torch.Size([28, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    # Optimisation settings.
    num_train_epochs=10,
    per_device_train_batch_size=16,
    learning_rate=2e-4,
    fp16=True,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the best one when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Logging.
    logging_steps=10,
    report_to='tensorboard',
    # Keep every dataset column: the preprocessing transforms read the raw
    # 'image' column, so it must not be stripped before they run.
    remove_unused_columns=False,
)

In [13]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute top-1 accuracy for one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (integer class ids).

    Returns:
        dict with a single key ``"accuracy"``: the fraction of examples whose
        highest-scoring class equals the reference label, as a Python float.
    """
    predictions = np.argmax(eval_pred.predictions, axis=1)
    references = np.asarray(eval_pred.label_ids)
    # Computed directly with NumPy so this function does not depend on a
    # separately loaded global metric object; the returned mapping keeps the
    # same {"accuracy": float} shape.
    return {"accuracy": float(np.mean(predictions == references))}

In [14]:
import torch

def collate_fn(batch):
    """Merge a list of per-example dicts into one model-ready batch.

    Args:
        batch: sequence of dicts, each with a 'pixel_values' tensor and an
            integer 'label'.

    Returns:
        dict with 'pixel_values' (the example tensors stacked along a new
        leading dimension) and 'labels' (a tensor of the example labels).
    """
    images = torch.stack([example['pixel_values'] for example in batch])
    targets = torch.tensor([example['label'] for example in batch])
    return {'pixel_values': images, 'labels': targets}

In [15]:
from transformers import Trainer
# Wire the model, hyper-parameters, data splits and helper callables together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # The image processor is handed over through the `tokenizer` argument.
    tokenizer=feature_extractor,
)

In [16]:
# Run fine-tuning; the returned object carries the training metrics used below.
train_results = trainer.train()
# Persist the resulting model to a dedicated directory.
trainer.save_model('./food_classification')
# Print the training metrics and write them out via the trainer.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
# Save the trainer's own state as well.
trainer.save_state()

  return F.conv2d(input, weight, bias, self.stride,


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2984,0.953292,0.721374
2,1.0066,0.713048,0.78735
3,0.7182,0.638691,0.816794
4,0.7269,0.575935,0.846238
5,0.5371,0.513636,0.856052
6,0.5696,0.461464,0.868593
7,0.4594,0.442861,0.87132
8,0.3783,0.455999,0.863686
9,0.4272,0.400323,0.888768
10,0.3817,0.394745,0.88386


***** train metrics *****
  epoch                    =        10.0
  total_flos               = 690451016GF
  train_loss               =      0.7512
  train_runtime            =  0:16:09.26
  train_samples_per_second =      75.676
  train_steps_per_second   =       4.736


In [17]:
# Evaluate on the eval_dataset given to the Trainer (the held-out split),
# then print and save the resulting metrics under the "eval" name.
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.8839
  eval_loss               =     0.3947
  eval_runtime            = 0:00:18.73
  eval_samples_per_second =     97.867
  eval_steps_per_second   =     12.273


In [18]:
# kwargs = {
#     "finetuned_from": model.config._name_or_path,
#     "tasks": "image-classification",
#     "dataset": 'food_images',
#     "tags": ['image-classification'],
# }

# trainer.create_model_card(**kwargs)

In [19]:
# Display the module tree of the fine-tuned model for inspection.
model

ResNetForImageClassification(
  (resnet): ResNetModel(
    (embedder): ResNetEmbeddings(
      (embedder): ResNetConvLayer(
        (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): ReLU()
      )
      (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    )
    (encoder): ResNetEncoder(
      (stages): ModuleList(
        (0): ResNetStage(
          (layers): Sequential(
            (0): ResNetBasicLayer(
              (shortcut): Identity()
              (layer): Sequential(
                (0): ResNetConvLayer(
                  (convolution): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
                  (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                  (activation): ReLU()
           