In [None]:
!pip install ultralytics

In [12]:
import os
import random
import shutil
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO
import torch
from sklearn.metrics import confusion_matrix
import yaml
import seaborn as sns
from collections import Counter

In [None]:
!unzip recipe_ingredients.zip -d recipe_ingredients

In [13]:
random.seed(42)

dataset_path = 'recipe_ingredients/'
data_yaml_path = os.path.join(dataset_path, 'data.yaml')

In [14]:
# A NEW FUNCTION I'M ADDING TO ANALYZE OUR DATASET AND CREATE A MORE BALANCED ONE AS OUR YOLOv8 MODELS
# HAVE BEEN POTENTIALLY TRAINING ON AN IMBALANCED DATASET

def analyze_dataset(dataset_path):
    with open(os.path.join(dataset_path, 'data.yaml'), 'r') as f:
        data_yaml = yaml.safe_load(f)
        class_names = data_yaml['names']

    train_dir = os.path.join(dataset_path, 'train', 'labels')
    class_counts = Counter()

    for filename in os.listdir(train_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(train_dir, filename), 'r') as f:
                for line in f:
                    if line.strip():
                        class_id = int(line.strip().split()[0])
                        class_counts[class_id] += 1

    plt.figure(figsize=(15, 8))
    counts = [class_counts.get(i, 0) for i in range(len(class_names))]
    class_labels = [f"{class_names[i]} ({counts[i]})" for i in range(len(class_names))]

    sorted_indices = np.argsort(counts)
    sorted_classes = [class_labels[i] for i in sorted_indices]
    sorted_counts = [counts[i] for i in sorted_indices]

    plt.barh(sorted_classes, sorted_counts)
    plt.xlabel('Number of instances')
    plt.title('Class Distribution in Training Data')
    plt.tight_layout()
    plt.savefig('class_distribution.png')

    print("Class distribution:")
    for i, name in enumerate(class_names):
        print(f"{name}: {class_counts.get(i, 0)} instances")

    few_samples = [class_names[i] for i in range(len(class_names)) if class_counts.get(i, 0) < 100]
    many_samples = [class_names[i] for i in range(len(class_names)) if class_counts.get(i, 0) > 1000]

    return class_counts, few_samples, many_samples, class_names


In [None]:
def create_balanced_dataset(dataset_path, max_per_class=300, min_per_class=50):

    with open(os.path.join(dataset_path, 'data.yaml'), 'r') as f:
        data_yaml = yaml.safe_load(f)
        class_names = data_yaml['names']

    balanced_dir = os.path.join(dataset_path, 'train_balanced')
    os.makedirs(os.path.join(balanced_dir, 'images'), exist_ok=True)
    os.makedirs(os.path.join(balanced_dir, 'labels'), exist_ok=True)

    class_counts, _, _, _ = analyze_dataset(dataset_path)

    train_dir = os.path.join(dataset_path, 'train')
    class_to_images = {i: [] for i in range(len(class_names))}

    print("Mapping images to classes...")
    for label_file in os.listdir(os.path.join(train_dir, 'labels')):
        if not label_file.endswith('.txt'):
            continue

        image_name = os.path.splitext(label_file)[0]
        img_path = None

        for ext in ['.jpg']:
            test_path = os.path.join(train_dir, 'images', image_name + ext)
            if os.path.exists(test_path):
                img_path = test_path
                break

        if img_path is None:
            continue

        with open(os.path.join(train_dir, 'labels', label_file), 'r') as f:
            image_classes = set()
            for line in f:
                if line.strip():
                    class_id = int(line.strip().split()[0])
                    image_classes.add(class_id)

        for class_id in image_classes:
            class_to_images[class_id].append((label_file, img_path))

    selected_samples = set()

    for class_id, images in class_to_images.items():
        num_samples = len(images)
        target_samples = max(min(num_samples, max_per_class), min_per_class)

        if target_samples > num_samples:
            samples_to_select = images
        else:
            samples_to_select = random.sample(images, target_samples)

        for sample in samples_to_select:
            selected_samples.add(sample)

    print(f"Creating balanced dataset with {len(selected_samples)} images")

    for label_file, img_path in selected_samples:
        src_label = os.path.join(train_dir, 'labels', label_file)
        dst_label = os.path.join(balanced_dir, 'labels', label_file)
        shutil.copy(src_label, dst_label)

        img_filename = os.path.basename(img_path)
        dst_img = os.path.join(balanced_dir, 'images', img_filename)
        shutil.copy(img_path, dst_img)

    with open(os.path.join(dataset_path, 'data.yaml'), 'r') as f:
        yaml_content = f.read()

    abs_dataset_path = os.path.abspath(dataset_path)
    abs_balanced_dir = os.path.join(abs_dataset_path, 'train_balanced')

    yaml_content = f"""
# YOLO dataset configuration file
train: {abs_balanced_dir}/images
val: {abs_dataset_path}/valid/images 
test: {abs_dataset_path}/test/images 

nc: {len(class_names)} 
names: {data_yaml['names']}
"""

    balanced_yaml_path = os.path.join(dataset_path, 'data_balanced.yaml')
    with open(balanced_yaml_path, 'w') as f:
        f.write(yaml_content)

    return balanced_yaml_path

In [20]:
def fine_tune_model(model_path, yaml_path):
    model = YOLO(model_path)

    results = model.train(
        data=yaml_path,
        epochs=15,
        patience=7,
        batch=32,
        imgsz=640,

        augment=True,
        degrees=15.0,
        translate=0.2,
        scale=0.5,
        fliplr=0.5,
        mosaic=1.0,
        mixup=0.1,

        lr0=0.0005,
        lrf=0.01,

        freeze=[0, 1, 2, 3, 4, 5]
    )

    model.export(format="pt", name="finetuned_yolov8.pt")

    return model

In [17]:
def main():
    dataset_path = "recipe_ingredients/"
    existing_model_path = "best.pt"

    class_counts, few_samples, many_samples, class_names = analyze_dataset(dataset_path)
    print(f"Classes with few samples: {few_samples}")
    print(f"Classes with many samples: {many_samples}")

    balanced_yaml_path = create_balanced_dataset(dataset_path, max_per_class=300, min_per_class=50)

    model = fine_tune_model(existing_model_path, balanced_yaml_path)

    print("Fine-tuning and testing complete")

In [21]:
if __name__ == "__main__":
    main()

Class distribution:
0: 14 instances
1: 48 instances
2: 3360 instances
3: 316 instances
4: 1506 instances
5: 1178 instances
6: 8190 instances
7: 24 instances
8: 632 instances
9: 564 instances
10: 516 instances
11: 194 instances
12: 120 instances
13: 6894 instances
14: 6532 instances
15: 858 instances
16: 80 instances
17: 21898 instances
18: 2784 instances
19: 206 instances
20: 226 instances
21: 19148 instances
22: 9354 instances
23: 4978 instances
24: 178 instances
25: 126 instances
26: 76 instances
27: 108 instances
28: 876 instances
29: 422 instances
30: 13562 instances
31: 76 instances
Classes with few samples: ['bay_leaves', 'beef', 'chickpeas', 'green_onion', 'salt', 'turmeric']
Classes with many samples: ['bell_pepper', 'carrot', 'cauliflower', 'chicken', 'garlic', 'ginger', 'kumquat', 'lemon', 'onion', 'pork', 'potato', 'tomao']
Class distribution:
0: 14 instances
1: 48 instances
2: 3360 instances
3: 316 instances
4: 1506 instances
5: 1178 instances
6: 8190 instances
7: 24 instan

100%|██████████| 755k/755k [00:00<00:00, 143MB/s]


                   from  n    params  module                                       arguments                     
  0                  -1  1       928  ultralytics.nn.modules.conv.Conv             [3, 32, 3, 2]                 
  1                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  2                  -1  1     29056  ultralytics.nn.modules.block.C2f             [64, 64, 1, True]             
  3                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  4                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  5                  -1  1    295424  ultralytics.nn.modules.conv.Conv             [128, 256, 3, 2]              
  6                  -1  2    788480  ultralytics.nn.modules.block.C2f             [256, 256, 2, True]           
  7                  -1  1   1180672  ultralytics.nn.modules.conv.Conv             [256




Model summary: 129 layers, 11,147,984 parameters, 11,147,968 gradients, 28.7 GFLOPs

Transferred 355/355 items from pretrained weights
Freezing layer 'model.0.conv.weight'
Freezing layer 'model.0.bn.weight'
Freezing layer 'model.0.bn.bias'
Freezing layer 'model.1.conv.weight'
Freezing layer 'model.1.bn.weight'
Freezing layer 'model.1.bn.bias'
Freezing layer 'model.2.cv1.conv.weight'
Freezing layer 'model.2.cv1.bn.weight'
Freezing layer 'model.2.cv1.bn.bias'
Freezing layer 'model.2.cv2.conv.weight'
Freezing layer 'model.2.cv2.bn.weight'
Freezing layer 'model.2.cv2.bn.bias'
Freezing layer 'model.2.m.0.cv1.conv.weight'
Freezing layer 'model.2.m.0.cv1.bn.weight'
Freezing layer 'model.2.m.0.cv1.bn.bias'
Freezing layer 'model.2.m.0.cv2.conv.weight'
Freezing layer 'model.2.m.0.cv2.bn.weight'
Freezing layer 'model.2.m.0.cv2.bn.bias'
Freezing layer 'model.3.conv.weight'
Freezing layer 'model.3.bn.weight'
Freezing layer 'model.3.bn.bias'
Freezing layer 'model.4.cv1.conv.weight'
Freezing layer 'm

100%|██████████| 5.35M/5.35M [00:00<00:00, 301MB/s]


[34m[1mAMP: [0mchecks passed ✅
[34m[1mtrain: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1098.6±336.8 MB/s, size: 58.2 KB)


[34m[1mtrain: [0mScanning /content/recipe_ingredients/train_balanced/labels... 8005 images, 0 backgrounds, 1 corrupt: 100%|██████████| 8005/8005 [00:04<00:00, 1619.85it/s]

[34m[1mtrain: [0m/content/recipe_ingredients/train_balanced/images/3255_jpg.rf.80dc6984b2ee3cf6b235129c1cbbade7.jpg: ignoring corrupt image/label: non-normalized or out of bounds coordinates [     1.0103]





[34m[1mtrain: [0mNew cache created: /content/recipe_ingredients/train_balanced/labels.cache
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 434.9±233.0 MB/s, size: 43.0 KB)


[34m[1mval: [0mScanning /content/recipe_ingredients/valid/labels... 816 images, 0 backgrounds, 0 corrupt: 100%|██████████| 816/816 [00:01<00:00, 616.83it/s]


[34m[1mval: [0mNew cache created: /content/recipe_ingredients/valid/labels.cache
Plotting labels to runs/detect/train2/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.0005' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000278, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 2 dataloader workers
Logging results to [1mruns/detect/train2[0m
Starting training for 15 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/15       5.4G      1.245     0.8623      1.365         45        640: 100%|██████████| 251/251 [03:23<00:00,  1.24it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:11<00:00,  1.13it/s]

                   all        816      10259      0.901      0.846      0.873      0.523






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/15      6.95G       1.12     0.7855      1.292         56        640: 100%|██████████| 251/251 [03:20<00:00,  1.25it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:10<00:00,  1.19it/s]

                   all        816      10259      0.862      0.825      0.859      0.575






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/15      6.99G      1.105     0.7848      1.286         70        640: 100%|██████████| 251/251 [03:16<00:00,  1.28it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:10<00:00,  1.22it/s]

                   all        816      10259      0.865      0.831      0.826      0.529






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/15      7.01G      1.091     0.7803       1.28        103        640: 100%|██████████| 251/251 [03:17<00:00,  1.27it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:10<00:00,  1.25it/s]

                   all        816      10259      0.898      0.852      0.876      0.589






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/15      7.06G      1.072     0.7605       1.27         41        640: 100%|██████████| 251/251 [03:15<00:00,  1.28it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:10<00:00,  1.22it/s]

                   all        816      10259      0.933      0.812      0.853       0.57





Closing dataloader mosaic
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/15      7.08G     0.9543     0.5177      1.215         56        640: 100%|██████████| 251/251 [02:23<00:00,  1.75it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.35it/s]

                   all        816      10259      0.838      0.868      0.868      0.604






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/15      7.13G     0.9378     0.4961      1.201         27        640: 100%|██████████| 251/251 [02:16<00:00,  1.84it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:10<00:00,  1.30it/s]

                   all        816      10259      0.864      0.848      0.889      0.616






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/15      7.15G     0.9202     0.4783      1.187         20        640: 100%|██████████| 251/251 [02:18<00:00,  1.81it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.40it/s]

                   all        816      10259      0.872      0.866       0.91      0.629






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/15       7.2G     0.9021     0.4642      1.176         40        640: 100%|██████████| 251/251 [02:16<00:00,  1.84it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.34it/s]

                   all        816      10259       0.91       0.87       0.88      0.634






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/15      7.22G     0.8837     0.4507      1.164         48        640: 100%|██████████| 251/251 [02:18<00:00,  1.81it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.44it/s]

                   all        816      10259      0.827      0.902      0.912      0.655






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/15      7.26G     0.8728     0.4399      1.156         24        640: 100%|██████████| 251/251 [02:18<00:00,  1.82it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.33it/s]

                   all        816      10259      0.866      0.874        0.9      0.651






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/15      7.29G     0.8511     0.4278      1.142         22        640: 100%|██████████| 251/251 [02:19<00:00,  1.81it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:08<00:00,  1.46it/s]

                   all        816      10259      0.922      0.854      0.888      0.651






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/15      7.33G     0.8436     0.4222      1.138         19        640: 100%|██████████| 251/251 [02:19<00:00,  1.80it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.33it/s]

                   all        816      10259      0.901      0.873      0.907      0.661






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/15      7.36G     0.8244     0.4095      1.123         31        640: 100%|██████████| 251/251 [02:19<00:00,  1.79it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.38it/s]

                   all        816      10259       0.89      0.878      0.916      0.674






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/15       7.4G     0.8189     0.4009      1.118         51        640: 100%|██████████| 251/251 [02:18<00:00,  1.82it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:09<00:00,  1.35it/s]

                   all        816      10259      0.896      0.882      0.916      0.677






15 epochs completed in 0.706 hours.
Optimizer stripped from runs/detect/train2/weights/last.pt, 22.5MB
Optimizer stripped from runs/detect/train2/weights/best.pt, 22.5MB

Validating runs/detect/train2/weights/best.pt...
Ultralytics 8.3.116 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 72 layers, 11,137,968 parameters, 0 gradients, 28.5 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:21<00:00,  1.65s/it]


                   all        816      10259      0.916      0.875      0.925      0.674
                  beef          2          4          1          0      0.537      0.404
           bell_pepper        112        115      0.909      0.965      0.947      0.681
               cabbage          1          1       0.65          1      0.995      0.895
                carrot         83         92      0.932      0.946      0.957      0.661
               chicken        121        435      0.959      0.957      0.972      0.771
              cucumber         31         31      0.966      0.916      0.989      0.824
              eggplant          7          7          1      0.953      0.995      0.808
                garlic        302        510       0.95      0.976      0.982      0.684
                ginger        417        765      0.964      0.992      0.972      0.834
    green_chili_pepper         58         58      0.933      0.968      0.978      0.667
           green_onio

SyntaxError: '[31m[1mfilename[0m' is not a valid YOLO argument. Similar arguments are i.e. ['name'].

    Arguments received: ['yolo', '-f', '/root/.local/share/jupyter/runtime/kernel-06cb9951-e7d2-4877-ad09-0efd0f8473c2.json']. Ultralytics 'yolo' commands use the following syntax:

        yolo TASK MODE ARGS

        Where   TASK (optional) is one of frozenset({'obb', 'classify', 'segment', 'pose', 'detect'})
                MODE (required) is one of frozenset({'predict', 'export', 'benchmark', 'track', 'train', 'val'})
                ARGS (optional) are any number of custom 'arg=value' pairs like 'imgsz=320' that override defaults.
                    See all ARGS at https://docs.ultralytics.com/usage/cfg or with 'yolo cfg'

    1. Train a detection model for 10 epochs with an initial learning_rate of 0.01
        yolo train data=coco8.yaml model=yolo11n.pt epochs=10 lr0=0.01

    2. Predict a YouTube video using a pretrained segmentation model at image size 320:
        yolo predict model=yolo11n-seg.pt source='https://youtu.be/LNwODJXcvt4' imgsz=320

    3. Val a pretrained detection model at batch-size 1 and image size 640:
        yolo val model=yolo11n.pt data=coco8.yaml batch=1 imgsz=640

    4. Export a YOLO11n classification model to ONNX format at image size 224 by 128 (no TASK required)
        yolo export model=yolo11n-cls.pt format=onnx imgsz=224,128

    5. Ultralytics solutions usage
        yolo solutions count or in ['crop', 'blur', 'workout', 'heatmap', 'isegment', 'visioneye', 'speed', 'queue', 'analytics', 'inference', 'trackzone'] source="path/to/video.mp4"

    6. Run special commands:
        yolo help
        yolo checks
        yolo version
        yolo settings
        yolo copy-cfg
        yolo cfg
        yolo solutions help

    Docs: https://docs.ultralytics.com
    Solutions: https://docs.ultralytics.com/solutions/
    Community: https://community.ultralytics.com
    GitHub: https://github.com/ultralytics/ultralytics
     (<string>)

We chose the YOLOv8 model because this particular dataset:

https://universe.roboflow.com/food-w4zm1/recipe-ingredients-cn

for the purposes of our app--identifying individual ingredients from an image of a group of ingredients--had exactly what we needed. An image of groups of ingredients, labeled with what ingredient they were and at what coordinates (boxed boundaries) they were located in the images.

We chose the particular YOLOv8 model, because, it seemed to be the latest version that supported this particular dataset for our particular task (object detection).

We spent a lot of time attempting to train our models on our own systems--and we managed to get it to recognize and "use" our GPUs, but we seemed to have run into bottleneck issues of sorts that failed to diagnose. Eventually, we used Google Colab, which, thankfully, does give us some limited usage with their Tesla 4 GPU, which trained our models quite fast.

We'd only trained/validated/tested on a subset of the dataset:

3957/15829 total images for training
244/816 total images for validation
221/1109 total images for testing

Because of the limited GPU usage provided by Google Colab.

We used image sizes of 640x640, because we didn't want them too small--as the model may have failed to properly recognize the images, and we didn't want them too large--as training may have taken too long.

Ultralytic's .val() metrics returned 4 values:

Box(P) -- Precision of bounding boxes; this measures the "accuracy" of the predicted boxes for the ingredients within an image = (0.878 final or 87.8%)

R -- Recall; measures what % of objects are detected (0.879 final or 87.9%)

mAP50 -- Mean Average Precision (at IoU threshold 0.5); quantifies the overall performance of our model (0.907 final or 90.7%)

mAP50-95 -- Mean Average Precision avged over multiple IoU thresholds; more stringent/severe quantification of our model's performance (0.645 final or 64.5%)

Class Performance:

* High-performing classes:
    * Cucumber: 0.995 mAP50 or 99.5%
    * Tomato: 0.994 mAP50 or 99.4%
    * Kumquat: 0.993 mAP50 or 99.3%
    * Potato: 0.987 mAP50 or 98.7%
* Low-performing classes:
    * Green Onion: 0.376 mAP50 (only 3 instances in validation)

We could train some more to address issues like green onion, but again, Google Colab only provides us a few uses and its "reset schedule" is unpredictable as it is undocumented.