# 1. Introduction
This notebook prepares the images for a new class called "unknown". This class will be built using images from the [iFood-2019 dataset](https://www.kaggle.com/competitions/ifood-2019-fgvc6/data), which contains 251 food types. This dataset can be considered an extension of our target Food-101 dataset, so some of these 251 types already exist in Food-101.

# 2. Unknown Dataset Generation

Importing libraries and creating target directories.

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import shutil
import nltk
import random

from torchvision.transforms import v2
from pathlib import Path
from modules.dataloaders import create_dataloaders
from bing_image_downloader import downloader

# Run-wide constants
NUM_WORKERS = os.cpu_count()
BATCH_SIZE = 64
AMOUNT_TO_GET = 1.0
SEED = 42

# Directory names: the Food-101 source folder embeds the dataset percentage,
# and the "unknown" variant appends a suffix to that same name.
target_dir_food101_name = "../data/food-101_{}_percent".format(int(AMOUNT_TO_GET * 100))
target_dir_food101_name_unknown = target_dir_food101_name + "_unknown_2"

# Food-101 train/test folders (read only in this notebook)
target_dir_food101 = Path(target_dir_food101_name)
train_dir_food101, test_dir_food101 = (
    target_dir_food101 / split for split in ("train", "test")
)

# Folders that receive the images of the new "unknown" class
target_dir_food101_unknown = Path(target_dir_food101_name_unknown)
train_dir_food101_unknown = target_dir_food101_unknown / "train" / "unknown"
test_dir_food101_unknown = target_dir_food101_unknown / "test" / "unknown"
for _unknown_dir in (
    target_dir_food101_unknown,
    train_dir_food101_unknown,
    test_dir_food101_unknown,
):
    _unknown_dir.mkdir(parents=True, exist_ok=True)

# Target model directory (only the path is defined here; nothing is created)
model_dir = Path("../models")

Getting the class names for the Food101 dataset.

In [2]:
# Side length (in pixels) of the square images produced by the transforms
IMG_SIZE = 384

# Per-channel statistics handed to the Normalize step of both ViT-Base pipelines
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def _vitb_resize_normalize_steps():
    """Return fresh resize -> center-crop -> image -> float32 -> normalize steps."""
    return [
        v2.Resize((IMG_SIZE, IMG_SIZE)),
        v2.CenterCrop((IMG_SIZE, IMG_SIZE)),
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]


# Plain transform: random crop to IMG_SIZE and conversion to a scaled float tensor
manual_transforms = v2.Compose([
    v2.RandomCrop((IMG_SIZE, IMG_SIZE)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
])

# ViT-Base transforms
# Training pipeline: TrivialAugmentWide followed by the shared steps
manual_transforms_aug_norm_train_vitb = v2.Compose(
    [v2.TrivialAugmentWide(), *_vitb_resize_normalize_steps()]
)

# Test pipeline: the shared steps only, no augmentation
manual_transforms_aug_norm_test_vitb = v2.Compose(_vitb_resize_normalize_steps())

# Only the third return value (the class names) of create_dataloaders is kept
_train_loader, _test_loader, classes_food101_list = create_dataloaders(
    train_dir=train_dir_food101,
    test_dir=test_dir_food101,
    train_transform=manual_transforms_aug_norm_train_vitb,
    test_transform=manual_transforms_aug_norm_test_vitb,
    batch_size=BATCH_SIZE,
)

In [3]:
def _count_jpgs_per_class(root_dir, class_names):
    """Count, for each class name, the ``*.jpg`` files matched by ``**/<class>/*.jpg`` under ``root_dir``."""
    return [
        len(list(Path(root_dir).glob(f"**/{class_name}/*.jpg")))
        for class_name in class_names
    ]


n_samples_train = _count_jpgs_per_class(train_dir_food101, classes_food101_list)
n_samples_test = _count_jpgs_per_class(test_dir_food101, classes_food101_list)

# One count per Food-101 class in each split
len(n_samples_train), len(n_samples_test)

(101, 101)

Processing the ifood-2019 dataset.

In [4]:
# Root folder of the iFood-2019 data and its image sub-folders
target_dir_food251_name = "../data/ifood-2019-fgvc6"
target_dir_food251 = Path(target_dir_food251_name)
train_dir_food251 = target_dir_food251 / "train_set"
val_dir_food251 = target_dir_food251 / "val_set"

# Metadata files: the class list and the per-image label tables
class_file_path = target_dir_food251 / "class_list.txt"
train_labels_path = target_dir_food251 / "train_labels.csv"
val_labels_path = target_dir_food251 / "val_labels.csv"

# Maps the integer class index to the class name
classes_food251_dict = {}

# Each usable line has the form "<index> <name>"; lines without a space are ignored
with open(class_file_path, "r") as file:
    for line in file:
        index_text, separator, class_name = line.strip().partition(" ")
        if separator:
            classes_food251_dict[int(index_text)] = class_name

# Print the result
print(classes_food251_dict)

{0: 'macaron', 1: 'beignet', 2: 'cruller', 3: 'cockle_food', 4: 'samosa', 5: 'tiramisu', 6: 'tostada', 7: 'moussaka', 8: 'dumpling', 9: 'sashimi', 10: 'knish', 11: 'croquette', 12: 'couscous', 13: 'porridge', 14: 'stuffed_cabbage', 15: 'seaweed_salad', 16: 'chow_mein', 17: 'rigatoni', 18: 'beef_tartare', 19: 'cannoli', 20: 'foie_gras', 21: 'cupcake', 22: 'osso_buco', 23: 'pad_thai', 24: 'poutine', 25: 'ramen', 26: 'pulled_pork_sandwich', 27: 'bibimbap', 28: 'chicken_kiev', 29: 'apple_pie', 30: 'risotto', 31: 'fruitcake', 32: 'chop_suey', 33: 'haggis', 34: 'scrambled_eggs', 35: 'frittata', 36: 'scampi', 37: 'sushi', 38: 'orzo', 39: 'fritter', 40: 'nacho', 41: 'beef_stroganoff', 42: 'beef_wellington', 43: 'spring_roll', 44: 'savarin', 45: 'crayfish_food', 46: 'souffle', 47: 'adobo', 48: 'streusel', 49: 'deviled_egg', 50: 'escargot', 51: 'club_sandwich', 52: 'carrot_cake', 53: 'falafel', 54: 'farfalle', 55: 'terrine', 56: 'poached_egg', 57: 'gnocchi', 58: 'bubble_and_squeak', 59: 'egg_rol

Identifying the remaining classes—those in iFood-2019 that do not belong to Food-101—also involves cleaning the class names, such as through lemmatization, to achieve a good match.

In [5]:
# Lemmatize the class names of both datasets with the same WordNet lemmatizer
lemmatizer = nltk.WordNetLemmatizer()
classes_food101_lem_list = list(map(lemmatizer.lemmatize, classes_food101_list))
classes_food251_lem_dict = {
    label: lemmatizer.lemmatize(class_name)
    for label, class_name in classes_food251_dict.items()
}

In [6]:
# Normalize the Food-101 class names: lemmatize, then strip a single trailing "s"
# that survived lemmatization. The list is rebuilt from the raw class names on
# every run (instead of overwriting the lemmatized list with a non-lemmatized
# one), so the lemmatization is actually used and the cell stays idempotent.
_food101_singular = [
    lemma[:-1] if lemma.endswith("s") else lemma
    for lemma in (lemmatizer.lemmatize(name) for name in classes_food101_list)
]

# Print the names that still end with "s" (only one trailing "s" is removed above)
for c in _food101_singular:
    if c.endswith("s"):
        print(c)

# Drop the underscores so the names can be compared with the iFood-2019 names
classes_food101_lem_list = [c.replace("_", "") for c in _food101_singular]

In [7]:
# Class names (already normalized: no underscores) that are excluded outright
# because they are related classes
_RELATED_NAMES = {
    'entrecote', 'tenderloin', 'brisket', 'biryani', 'meatball',
    'reuben', 'schnitzel', 'sukiyaki', 'chowmein',
}
# Any class whose normalized name contains one of these fragments is excluded too
_RELATED_FRAGMENTS = ('steak', 'sandwich', 'salad', 'rib', 'chicken', 'beef')

# Normalize the iFood-2019 class names: lemmatize, then strip a single trailing
# "s" that survived lemmatization. The dictionary is rebuilt from the raw names
# on every run (instead of discarding the lemmatized names), so the
# lemmatization is actually used and the cell stays idempotent.
_food251_singular = {}
for key, name in classes_food251_dict.items():
    lemma = lemmatizer.lemmatize(name)
    _food251_singular[key] = lemma[:-1] if lemma.endswith("s") else lemma

# Print the names that still end with "s" (only one trailing "s" is removed above)
for c in _food251_singular.values():
    if c.endswith("s"):
        print(c)

# Drop the underscores so the names can be compared with the Food-101 names
classes_food251_lem_dict = {key: c.replace("_", "") for key, c in _food251_singular.items()}

# Keep a class only if it is not a related class and its name is not contained
# in any Food-101 class name (substring match, e.g. a name that is part of a
# longer Food-101 name is dropped as well)
classes_food251_lem_dict = {
    key: val
    for key, val in classes_food251_lem_dict.items()
    if val not in _RELATED_NAMES
    and not any(fragment in val for fragment in _RELATED_FRAGMENTS)
    and not any(val in food for food in classes_food101_lem_list)
}

Specify how many images from each remaining class the new unknown class should contain.

In [8]:
# Specify how many images each remaining class contributes to the new unknown class
remaining_classes = set(classes_food251_lem_dict.values()) - set(classes_food101_lem_list)
remaining_classes_dict = {key: value for key, value in classes_food251_lem_dict.items() if value in remaining_classes}

# The images are sampled per label later on, so the budget (the mean number of
# images of a Food-101 class) is divided by the number of remaining labels.
# int() truncates, so the unknown class ends up slightly smaller than the mean.
n_remaining_labels = len(remaining_classes_dict)
n_samples_per_remaining_class_train = int(np.mean(n_samples_train) / n_remaining_labels)
n_samples_per_remaining_class_test = int(np.mean(n_samples_test) / n_remaining_labels)

print(f"Number of remaining classes: {n_remaining_labels}")
print(f"Number of samples per remaining class for training: {n_samples_per_remaining_class_train}")
print(f"Number of samples per remaining class for testing: {n_samples_per_remaining_class_test}")

Number of remaining classes: 128
Number of samples per remaining class for training: 5
Number of samples per remaining class for training: 1


Create a data frame that contains the image name, label, and class name of the remaining image dataset.

In [9]:
# Label tables of the iFood-2019 train and validation sets
df_train_labels, df_val_labels = (
    pd.read_csv(labels_path) for labels_path in (train_labels_path, val_labels_path)
)

# Two-column lookup table (label, class) of the remaining classes
df_remaining_classes = pd.DataFrame(
    list(remaining_classes_dict.items()),
    columns=['label', 'class'],
)

In [10]:
# Attach the class name to the image labels. The right join on 'label' keeps
# every row of df_remaining_classes and only the images carrying those labels.
_merge_options = dict(how='right', on='label')
df_remaining_train_labels = df_train_labels.merge(df_remaining_classes, **_merge_options)
df_remaining_val_labels = df_val_labels.merge(df_remaining_classes, **_merge_options)

In [11]:
# Show the first ten rows of the merged training table
df_remaining_train_labels.head(n=10)

Unnamed: 0,img_name,label,class
0,train_062355.jpg,2,cruller
1,train_062356.jpg,2,cruller
2,train_062357.jpg,2,cruller
3,train_062358.jpg,2,cruller
4,train_062359.jpg,2,cruller
5,train_062360.jpg,2,cruller
6,train_062361.jpg,2,cruller
7,train_062362.jpg,2,cruller
8,train_062363.jpg,2,cruller
9,train_062364.jpg,2,cruller


Copies a random selection of image files from the source directory to the destination directory based on labels provided in a DataFrame.

In [12]:
def copy_random_samples(df, source_dir, destination_dir, n_samples_per_class, seed):
    """
    Copy random samples from the source directory to the destination directory.

    The rows of ``df`` are grouped by the ``label`` column. From every group up to
    ``n_samples_per_class`` image names are drawn at random and the matching files
    are copied. A class that offers fewer images than requested contributes all
    of its images instead of aborting the whole run, and rows without an image
    name (for example produced by an outer/right merge) are ignored.

    Args:
        df (pandas.DataFrame): DataFrame with the columns ``label`` and ``img_name``.
        source_dir (str): Path to the source directory.
        destination_dir (str): Path to the destination directory (created if missing).
        n_samples_per_class (int): Maximum number of samples to copy per class.
        seed (int): Random seed for reproducibility.

    Returns:
        None

    Raises:
        ValueError: If ``n_samples_per_class`` is negative.
    """
    if n_samples_per_class < 0:
        raise ValueError("n_samples_per_class must be non-negative")

    # Ensure the destination directory exists
    os.makedirs(destination_dir, exist_ok=True)

    # Loop over each label
    for _, group in df.groupby('label'):
        # Rows without an image name cannot be copied
        candidates = group.dropna(subset=['img_name'])

        # DataFrame.sample raises when asked for more rows than available,
        # so never request more than the class provides
        n_to_copy = min(n_samples_per_class, len(candidates))
        if n_to_copy == 0:
            continue

        # Sample the group and take the image names
        selected_files = candidates.sample(n=n_to_copy, random_state=seed)['img_name'].tolist()

        # Copy the selected files into the destination directory
        for file in selected_files:
            source_path = os.path.join(source_dir, file)
            destination_path = os.path.join(destination_dir, file)
            shutil.copy(source_path, destination_path)

# Fill the unknown train folder from the iFood-2019 training images
copy_random_samples(
    df=df_remaining_train_labels,
    source_dir=train_dir_food251,
    destination_dir=train_dir_food101_unknown,
    n_samples_per_class=n_samples_per_remaining_class_train,
    seed=SEED,
)
# Fill the unknown test folder from the iFood-2019 validation images
copy_random_samples(
    df=df_remaining_val_labels,
    source_dir=val_dir_food251,
    destination_dir=test_dir_food101_unknown,
    n_samples_per_class=n_samples_per_remaining_class_test,
    seed=SEED,
)

# 3. Extending the Unknown Class with New Images

In [13]:
# Search terms for other typical food types to be added to the unknown category
other_images = [
    'capuccino cup',
    'coffee',
    'banana',
    'obst',
    'apfel',
    'orange fruit',
    'fruit basket',
    'smoothie',
    'dorade',
    'kabeljau',
]
# The download itself is left disabled:
#for item in other_images:
#    downloader.download(item, limit=25, output_dir='images', adult_filter_off=True, force_replace=False, timeout=60, filter="photo, clipart", verbose=True)

In [14]:
# Number of downloaded images per category copied into each split
N_OTHER_TRAIN = 6
N_OTHER_TEST = 2

# Create "train" and "test" directories if they don't exist
other_image_folder = Path('images')
other_image_folder_train = other_image_folder / 'train'
other_image_folder_test = other_image_folder / 'test'
other_image_folder_train.mkdir(parents=True, exist_ok=True)
other_image_folder_test.mkdir(parents=True, exist_ok=True)

# Dedicated generator seeded once with SEED: the selection is the same on every
# run and the global random state is left untouched
_other_rng = random.Random(SEED)

# Loop through each category in "other_images"
for category in other_images:
    # The downloaded images of a category live in a folder named after it
    category_folder = other_image_folder / category

    # Check if the category folder exists
    if not category_folder.is_dir():
        print(f"Category folder '{category}' not found. Skipping...")
        continue

    # Collect the JPEG files; sorted because the directory listing order is arbitrary
    images = sorted(
        path for path in category_folder.iterdir()
        if path.is_file() and path.name.lower().endswith(('.jpg', '.jpeg'))
    )
    _other_rng.shuffle(images)

    # Disjoint selections: the first images go to "train", the next ones to "test"
    train_images = images[:N_OTHER_TRAIN]
    test_images = images[N_OTHER_TRAIN:N_OTHER_TRAIN + N_OTHER_TEST]

    category_tag = category.replace(' ', '_')
    for selected, destination in (
        (train_images, train_dir_food101_unknown),
        (test_images, test_dir_food101_unknown),
    ):
        for src in selected:
            # Target name "<stem>_<category><suffix>" is derived from the file that
            # is actually copied; only the extension dot is touched
            renamed = f"{src.stem.replace(' ', '_')}_{category_tag}{src.suffix}"
            shutil.copy(src, os.path.join(destination, renamed))

    # Report what was really copied (a category may hold fewer images than requested)
    print(f"Copied {len(train_images)} images of {category} to 'train' and {len(test_images)} images to 'test'.")

Moved 6 images of capuccino cup to 'train' and 2 images to 'test'.
Moved 6 images of coffee to 'train' and 2 images to 'test'.
Moved 6 images of banana to 'train' and 2 images to 'test'.
Moved 6 images of obst to 'train' and 2 images to 'test'.
Moved 6 images of apfel to 'train' and 2 images to 'test'.
Moved 6 images of orange fruit to 'train' and 2 images to 'test'.
Moved 6 images of fruit basket to 'train' and 2 images to 'test'.
Moved 6 images of smoothie to 'train' and 2 images to 'test'.
Moved 6 images of dorade to 'train' and 2 images to 'test'.
Moved 6 images of kabeljau to 'train' and 2 images to 'test'.


In [15]:
# Number of images sampled from each source folder
N_ADE_TRAIN = 45
N_ADE_TEST = 8

# Paths to your source and destination directories
source_dir_train = '../data/ADEChallengeData2016/images/training'
source_dir_test = '../data/ADEChallengeData2016/images/validation'
destination_dir = 'path/to/destination/directory'  # placeholder, not used below

# Get a list of all files in the source directories (sorted: the listing order is arbitrary)
all_files_train = sorted(f for f in os.listdir(source_dir_train) if os.path.isfile(os.path.join(source_dir_train, f)))
all_files_test = sorted(f for f in os.listdir(source_dir_test) if os.path.isfile(os.path.join(source_dir_test, f)))

# Filter the list to include only image files (you can adjust the extensions based on your images)
image_files_train = [f for f in all_files_train if f.lower().endswith(('.jpg', '.jpeg'))]
image_files_test = [f for f in all_files_test if f.lower().endswith(('.jpg', '.jpeg'))]

# Randomly select N_ADE_TRAIN training and N_ADE_TEST test images. A dedicated
# generator seeded with SEED makes the selection reproducible; the sample size is
# clamped because random.sample raises when asked for more items than available.
_ade_rng = random.Random(SEED)
selected_images_train = _ade_rng.sample(image_files_train, min(N_ADE_TRAIN, len(image_files_train)))
selected_images_test = _ade_rng.sample(image_files_test, min(N_ADE_TEST, len(image_files_test)))

# Copy each selected training image to the unknown train directory
for image in selected_images_train:
    shutil.copy(os.path.join(source_dir_train, image), train_dir_food101_unknown)

print(f"Copied {len(selected_images_train)} images to {train_dir_food101_unknown}")

# Copy each selected test image to the unknown test directory
for image in selected_images_test:
    shutil.copy(os.path.join(source_dir_test, image), test_dir_food101_unknown)

print(f"Copied {len(selected_images_test)} images to {test_dir_food101_unknown}")



Copied 45 images to ..\data\food-101_100_percent_unknown_2\train\unknown
Copied 8 images to ..\data\food-101_100_percent_unknown_2\test\unknown
