# 1. Introduction
This notebook prepares the images for a new class called "unknown". This class will be built using images from the [iFood-2019 dataset](https://www.kaggle.com/competitions/ifood-2019-fgvc6/data), which contains 251 food types. This dataset can be considered an extension of our target Food-101 dataset, so some of these 251 types already exist in the Food-101 dataset.

# 2. Unknown Dataset Generation

Importing libraries and creating target directories.

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import shutil
import nltk
import random

from torchvision.transforms import v2
from pathlib import Path
from modules.helper_functions import create_dataloaders
from bing_image_downloader import downloader

# Tunable constants for this notebook
NUM_WORKERS = os.cpu_count()
BATCH_SIZE = 64
AMOUNT_TO_GET = 1.0
SEED = 42

# Folder names: the existing Food-101 subset and the folder that receives the "unknown" class
target_dir_food101_name = f"../data/food-101_{int(AMOUNT_TO_GET * 100)}_percent"
target_dir_food101_name_unknown = f"{target_dir_food101_name}_unknown"

# Train/test splits of the existing Food-101 subset
target_dir_food101 = Path(target_dir_food101_name)
train_dir_food101 = target_dir_food101.joinpath("train")
test_dir_food101 = target_dir_food101.joinpath("test")

# Folder layout for the "unknown" class; create whatever is still missing
target_dir_food101_unknown = Path(target_dir_food101_name_unknown)
train_dir_food101_unknown = target_dir_food101_unknown.joinpath("train", "unknown")
test_dir_food101_unknown = target_dir_food101_unknown.joinpath("test", "unknown")
for unknown_dir in (target_dir_food101_unknown, train_dir_food101_unknown, test_dir_food101_unknown):
    unknown_dir.mkdir(parents=True, exist_ok=True)

# Path of the model directory (only the path is defined here, nothing is created)
model_dir = Path("../models")

Getting the class names for the Food101 dataset.

In [2]:
# Side length of the square images produced by every pipeline below
IMG_SIZE = 384

# Plain pipeline: random square crop, then conversion to a float32 image scaled by ToDtype
manual_transforms = v2.Compose([
    v2.RandomCrop(size=(IMG_SIZE, IMG_SIZE)),
    v2.ToImage(),
    v2.ToDtype(dtype=torch.float32, scale=True),
])

# ViT-Base pipelines
# Training: augmentation first, then resize/crop, tensor conversion and normalization
manual_transforms_aug_norm_train_vitb = v2.Compose([
    v2.TrivialAugmentWide(),
    v2.Resize(size=(IMG_SIZE, IMG_SIZE)),
    v2.CenterCrop(size=(IMG_SIZE, IMG_SIZE)),
    v2.ToImage(),
    v2.ToDtype(dtype=torch.float32, scale=True),
    v2.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Test: same steps as training but without the augmentation
manual_transforms_aug_norm_test_vitb = v2.Compose([
    v2.Resize(size=(IMG_SIZE, IMG_SIZE)),
    v2.CenterCrop(size=(IMG_SIZE, IMG_SIZE)),
    v2.ToImage(),
    v2.ToDtype(dtype=torch.float32, scale=True),
    v2.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Only the third return value (the Food-101 class names) is needed here;
# the first two return values are discarded
_, _, classes_food101_list = create_dataloaders(
    train_dir=train_dir_food101,
    test_dir=test_dir_food101,
    train_transform=manual_transforms_aug_norm_train_vitb,
    test_transform=manual_transforms_aug_norm_test_vitb,
    batch_size=BATCH_SIZE,
)

In [None]:
# Count the .jpg files found for every Food-101 class in the train and test splits
n_samples_train = [
    sum(1 for _ in Path(train_dir_food101).glob(f"**/{class_name}/*.jpg"))
    for class_name in classes_food101_list
]
n_samples_test = [
    sum(1 for _ in Path(test_dir_food101).glob(f"**/{class_name}/*.jpg"))
    for class_name in classes_food101_list
]

# One count per class in each list
len(n_samples_train), len(n_samples_test)

Processing the ifood-2019 dataset.

In [None]:
# Root folder of the iFood-2019 data and its image sub-folders
target_dir_food251_name = "../data/ifood-2019-fgvc6"
target_dir_food251 = Path(target_dir_food251_name)
train_dir_food251 = target_dir_food251.joinpath("train_set")
val_dir_food251 = target_dir_food251.joinpath("val_set")

# Class list and label files located in the same root folder
class_file_path = target_dir_food251.joinpath("class_list.txt")
train_labels_path = target_dir_food251.joinpath("train_labels.csv")
val_labels_path = target_dir_food251.joinpath("val_labels.csv")

# Map every numeric label to its class name; usable lines read "<label> <name>"
classes_food251_dict = {}
with open(class_file_path, "r") as file:
    for line in file:
        label, separator, class_name = line.strip().partition(" ")
        # Lines without a space carry no class name and are skipped
        if separator:
            classes_food251_dict[int(label)] = class_name

# Print the result
print(classes_food251_dict)

Identifying the remaining classes—those in iFood-2019 that do not belong to Food-101—also involves cleaning the class names, such as through lemmatization, to achieve a good match.

In [5]:
# Lemmatize every class name of both datasets with the WordNet lemmatizer
lemmatizer = nltk.WordNetLemmatizer()
classes_food101_lem_list = list(map(lemmatizer.lemmatize, classes_food101_list))
classes_food251_lem_dict = {}
for label, class_name in classes_food251_dict.items():
    classes_food251_lem_dict[label] = lemmatizer.lemmatize(class_name)

In [7]:
# Lemmatize the Food-101 names, then drop a trailing "s" from those that still end with one.
# Everything is recomputed from the raw class list so that the lemmatization is actually
# used and the cell gives the same result when it is run more than once.
classes_food101_lem_list = [lemmatizer.lemmatize(c) for c in classes_food101_list]
classes_food101_lem_list = [c[:-1] if c.endswith("s") else c for c in classes_food101_lem_list]

# Print the names that end with "s" even after the strip so they can be checked by eye
for c in classes_food101_lem_list:
    if c.endswith("s"):
        print(c)

# Remove the underscores from the cleaned names
classes_food101_lem_list = [c.replace("_", "") for c in classes_food101_lem_list]

In [8]:
# Lemmatize the iFood-2019 names, then drop a trailing "s" from those that still end with one.
# Everything is recomputed from the raw class dict so that the lemmatization is actually
# used and the cell gives the same result when it is run more than once.
classes_food251_lem_dict = {key: lemmatizer.lemmatize(c) for key, c in classes_food251_dict.items()}
classes_food251_lem_dict = {key: c[:-1] if c.endswith("s") else c for key, c in classes_food251_lem_dict.items()}

# Print the names that end with "s" even after the strip so they can be checked by eye
for c in classes_food251_lem_dict.values():
    if c.endswith("s"):
        print(c)

# Remove the underscores from the cleaned names
classes_food251_lem_dict = {key: c.replace("_", "") for key, c in classes_food251_lem_dict.items()}

# Keep a class only if
#   - it is not steak related ('entrecote' or any name containing 'steak'), because the
#     "steak" class in food101 is too generic, and
#   - its name is not contained in any cleaned Food-101 class name
classes_food251_lem_dict = {
    key: val
    for key, val in classes_food251_lem_dict.items()
    if val != 'entrecote'
    and 'steak' not in val
    and not any(val in food for food in classes_food101_lem_list)
}

Specify how many images from each remaining class the new unknown class should contain.

In [None]:
# iFood-2019 classes whose cleaned name is not a cleaned Food-101 class name
remaining_classes = set(classes_food251_lem_dict.values()) - set(classes_food101_lem_list)
remaining_classes_dict = {key: value for key, value in classes_food251_lem_dict.items() if value in remaining_classes}
if not remaining_classes_dict:
    raise ValueError("No remaining iFood-2019 classes found; cannot build the unknown class.")

# Images are sampled per label later on (copy_random_samples groups by 'label'), so the
# average Food-101 class size is divided by the number of remaining labels. The quotient
# is rounded up; round(x + 0.5) is not a ceiling because Python rounds halves to even.
n_remaining_labels = len(remaining_classes_dict)
n_samples_per_remaining_class_train = int(np.ceil(np.mean(n_samples_train) / n_remaining_labels))
n_samples_per_remaining_class_test = int(np.ceil(np.mean(n_samples_test) / n_remaining_labels))

print(f"Number of remaining classes: {n_remaining_labels}")
print(f"Number of samples per remaining class for training: {n_samples_per_remaining_class_train}")
print(f"Number of samples per remaining class for testing: {n_samples_per_remaining_class_test}")

Create a data frame that contains the image name, label, and class name of the remaining image dataset.

In [10]:
# Label files of the iFood-2019 train and validation sets
df_train_labels = pd.read_csv(train_labels_path)
df_val_labels = pd.read_csv(val_labels_path)

# One row per remaining class: numeric label and cleaned class name
df_remaining_classes = pd.DataFrame(
    list(remaining_classes_dict.items()),
    columns=['label', 'class'],
)

In [11]:
# Attach the class name to every labelled image; the right join keeps every remaining class
df_remaining_train_labels = pd.merge(df_train_labels, df_remaining_classes, how='right', on='label')
df_remaining_val_labels = pd.merge(df_val_labels, df_remaining_classes, how='right', on='label')

In [None]:
# Show the first ten rows of the merged training labels
df_remaining_train_labels.head(n=10)

Copies a random selection of image files from the source directory to the destination directory based on labels provided in a DataFrame.

In [13]:
def copy_random_samples(df, source_dir, destination_dir, n_samples_per_class, seed):
    """
    Copy a random selection of images per label from one directory to another.

    The rows of ``df`` are grouped by the 'label' column. From every group up to
    ``n_samples_per_class`` file names are drawn from the 'img_name' column and the
    corresponding files are copied from ``source_dir`` to ``destination_dir``.
    A label with fewer images than requested contributes all of its images; rows
    without an image name (NaN) are ignored.

    Args:
        df (pandas.DataFrame): DataFrame with the columns 'label' and 'img_name'.
        source_dir (str or path-like): Directory that contains the image files.
        destination_dir (str or path-like): Directory that receives the copies;
            created if it does not exist.
        n_samples_per_class (int): Maximum number of samples to copy per label.
        seed (int): Random seed passed to the sampling of every group.

    Returns:
        None

    Raises:
        ValueError: If ``n_samples_per_class`` is negative.
    """
    if n_samples_per_class < 0:
        raise ValueError("n_samples_per_class must be non-negative.")

    # Ensure the destination directory exists
    os.makedirs(destination_dir, exist_ok=True)

    # Loop over each label
    for _, group in df.groupby('label'):
        # Rows without an image name cannot be copied
        candidates = group.dropna(subset=['img_name'])

        # Never request more rows than the label provides (sampling would raise otherwise)
        n_take = min(n_samples_per_class, len(candidates))
        if n_take == 0:
            continue

        # Sample the group and take the image names
        selected_files = candidates.sample(n=n_take, random_state=seed)['img_name'].tolist()

        # Copy the selected files into the destination directory
        for file in selected_files:
            source_path = os.path.join(source_dir, file)
            destination_path = os.path.join(destination_dir, file)
            shutil.copy(source_path, destination_path)

# Fill the "unknown" training folder from the iFood-2019 training images
copy_random_samples(
    df=df_remaining_train_labels,
    source_dir=train_dir_food251,
    destination_dir=train_dir_food101_unknown,
    n_samples_per_class=n_samples_per_remaining_class_train,
    seed=SEED,
)

# Fill the "unknown" test folder from the iFood-2019 validation images
copy_random_samples(
    df=df_remaining_val_labels,
    source_dir=val_dir_food251,
    destination_dir=test_dir_food101_unknown,
    n_samples_per_class=n_samples_per_remaining_class_test,
    seed=SEED,
)

# 3. Extending the Unknown Class with New Images

In [14]:
# Search terms for other typical food types to be added to the unknown category
other_images = [
    'capuccino cup',
    'coffee',
    'banana',
    'obst',
    'apfel',
    'orange fruit',
    'fruit basket',
    'smoothie',
    'dorade',
    'kabeljau',
]

# The download itself is commented out; uncomment the loop to fetch the images
#for item in other_images:
#    downloader.download(item, limit=25, output_dir='images', adult_filter_off=True, force_replace=False, timeout=60, filter="photo, clipart", verbose=True)

In [None]:
# Number of downloaded images per category copied into each split of the unknown class
n_other_train = 6
n_other_test = 2

# Create "train" and "test" directories if they don't exist
other_image_folder = Path('images')
other_image_folder_train = other_image_folder / 'train'
other_image_folder_test = other_image_folder / 'test'
other_image_folder_train.mkdir(parents=True, exist_ok=True)
other_image_folder_test.mkdir(parents=True, exist_ok=True)

# Dedicated generator seeded with SEED only, so every run selects the same images
rng = random.Random(SEED)

# Loop through each category in "other_images"
for category in other_images:
    # The images of a category are expected in a sub-folder named after the category
    category_folder = other_image_folder / category

    # Check if the category folder exists
    if not category_folder.is_dir():
        print(f"Category folder '{category}' not found. Skipping...")
        continue

    # Collect the JPEG files; sort first because the directory listing order is not guaranteed
    images = sorted(
        path for path in category_folder.iterdir()
        if path.name.lower().endswith(('.jpg', '.jpeg'))
    )
    rng.shuffle(images)

    # Disjoint selections: the first images go to train, the following ones to test
    train_images = images[:n_other_train]
    test_images = images[n_other_train:n_other_train + n_other_test]

    for destination_dir, selection in (
        (train_dir_food101_unknown, train_images),
        (test_dir_food101_unknown, test_images),
    ):
        for src in selection:
            # Append the category to the file name in front of the extension and replace
            # spaces, e.g. "photo.jpg" of "fruit basket" becomes "photo_fruit_basket.jpg".
            # The new name is derived from the file that is copied, so name and content match.
            new_name = f"{src.stem}_{category}{src.suffix}".replace(' ', '_')
            shutil.copy(src, Path(destination_dir) / new_name)

    print(f"Copied {len(train_images)} images of {category} to 'train' and {len(test_images)} images to 'test'.")