# Data Creation from Food101 Database and SceneParse150

This notebook constructs a custom PyTorch dataset by combining food images from the Food-101 dataset (class `1`) and non-food images from the SceneParse150 dataset (class `0`). The resulting dataset is designed for training a binary classification model that predicts whether a given image depicts food (1) or not (0).

# 1. Importing Libraries

In [1]:
import os
import shutil
import torch
import random
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

# Seed every random number generator this notebook imports so that
# Restart Kernel -> Run All reproduces the same dataset.
SEED = 42
random.seed(SEED)  # stdlib RNG (the notebook imports and uses `random`)
torch.manual_seed(SEED)  # kept last so the cell still displays the torch Generator

<torch._C.Generator at 0x22f4e290350>

# 2. Creating Folder Structure

In [2]:
# Base directory (not referenced by the other cells shown in this notebook)
ROOT_DATASET_DIR = "./"

# Source datasets: ADE / SceneParse150 images (class 0) and Food-101 (class 1)
ORIG_IMG_DIR_0 = os.path.join("..", "data", "ADEChallengeData2016", "images")
ORIG_IMG_DIR_1 = os.path.join("..", "data", "food-101_100_percent")

# Destination tree: <ROOT_IMAGE_DIR>/{train,test}/{0,1}
ROOT_IMAGE_DIR = os.path.join("..", "data", "classif_food_nofood_")
ROOT_TRAIN_DIR_0 = os.path.join(ROOT_IMAGE_DIR, "train", "0")
ROOT_TRAIN_DIR_1 = os.path.join(ROOT_IMAGE_DIR, "train", "1")
ROOT_TEST_DIR_0 = os.path.join(ROOT_IMAGE_DIR, "test", "0")
ROOT_TEST_DIR_1 = os.path.join(ROOT_IMAGE_DIR, "test", "1")

# Create the destination directories if not already present
for output_dir in (ROOT_TRAIN_DIR_0, ROOT_TRAIN_DIR_1, ROOT_TEST_DIR_0, ROOT_TEST_DIR_1):
    os.makedirs(output_dir, exist_ok=True)

# Fraction passed as `test_size` when the class-0 images are split
TRAIN_TEST_SPLIT = 0.25

# Original split folders of each source dataset
CLASS0_TRAIN_DIR = Path(ORIG_IMG_DIR_0) / "training"
CLASS0_TEST_DIR = Path(ORIG_IMG_DIR_0) / "validation"

CLASS1_TRAIN_DIR = Path(ORIG_IMG_DIR_1) / "train"
CLASS1_TEST_DIR = Path(ORIG_IMG_DIR_1) / "test"

# glob() yields entries in filesystem order, which is not guaranteed to be the
# same across machines. Sort the listings so that seeded splitting/sampling of
# these lists selects the same files everywhere.

# Class 0: images sit directly inside the split folder
paths_train_0 = sorted(CLASS0_TRAIN_DIR.glob("*"))
paths_test_0 = sorted(CLASS0_TEST_DIR.glob("*"))

# Class 1: images sit one level down, inside one sub-folder per entry
paths_train_1 = sorted(CLASS1_TRAIN_DIR.glob("*/*"))
paths_test_1 = sorted(CLASS1_TEST_DIR.glob("*/*"))

# 3. Generating Class 0 (No Food)

In [3]:
# Train-test split for class 0 (no food): each original folder is split with
# the same ratio and seed, then the matching halves are stacked together.
split_options = dict(test_size=TRAIN_TEST_SPLIT, random_state=SEED)

df_train = pd.DataFrame({"path": paths_train_0})
df_test = pd.DataFrame({"path": paths_test_0})

df_train_train, df_training_test = train_test_split(df_train, **split_options)
df_test_train, df_test_test = train_test_split(df_test, **split_options)

# Final train set = train halves of both folders; final test set = test halves
df_train = pd.concat([df_train_train, df_test_train])
df_test = pd.concat([df_training_test, df_test_test])

def copy_images(df, target_dir):
    """
    Copies images from their source paths into a single flat target directory.

    Each file is normally stored under its own base name. When two different
    source files in the same call share a base name (e.g. they come from
    different sub-folders), the later one is stored as
    ``<parent folder>_<base name>`` instead of silently overwriting the first,
    so the number of files written matches the number of distinct sources.
    Files already present in ``target_dir`` from a previous run are still
    overwritten, which keeps the function safe to re-run.

    Args:
        df (pd.DataFrame): DataFrame with a "path" column holding the source
            paths (str or path-like) of the images to copy.
        target_dir (str): Directory where images will be copied. It is created
            if it does not exist.

    Returns:
        None. Paths that are not existing files are reported with a warning
        and skipped.
    """
    os.makedirs(target_dir, exist_ok=True)  # Ensure the target directory exists

    # Normalized destination name -> source path that produced it (this call only)
    claimed = {}

    for src_path in tqdm(df["path"], desc=f"Copying images to {target_dir}"):
        if not os.path.isfile(src_path):
            print(f"[WARNING] File not found: {src_path}")
            continue

        src_key = os.fspath(src_path)
        base_name = os.path.basename(src_key)
        parent_name = os.path.basename(os.path.dirname(src_key))

        # Pick a destination name no *other* source has taken in this call.
        # normcase makes the check case-insensitive where the OS is.
        dst_name = base_name
        attempt = 0
        while claimed.get(os.path.normcase(dst_name), src_key) != src_key:
            attempt += 1
            if attempt == 1:
                dst_name = f"{parent_name}_{base_name}"
            else:
                dst_name = f"{parent_name}_{attempt}_{base_name}"
        claimed[os.path.normcase(dst_name)] = src_key

        # copy2 preserves file metadata (timestamps) while copying
        shutil.copy2(src_key, os.path.join(target_dir, dst_name))

# Write the class-0 train and test splits into their destination folders
for split_df, split_dir in ((df_train, ROOT_TRAIN_DIR_0), (df_test, ROOT_TEST_DIR_0)):
    copy_images(split_df, split_dir)

Copying images to ..\data\classif_food_nofood_\train\0:   0%|          | 0/16657 [00:00<?, ?it/s]

Copying images to ..\data\classif_food_nofood_\test\0:   0%|          | 0/5553 [00:00<?, ?it/s]

# 4. Generating Class 1 (Food)

In [4]:
# Subsample class 1 (food) to make the dataset completely balanced:
# draw exactly as many food images as there are class-0 images per split.
num_train_samples_0 = df_train.shape[0]
num_test_samples_0 = df_test.shape[0]

# Use a dedicated generator seeded with SEED and sample from a sorted listing,
# so the selection does not depend on the global `random` state, on the order
# in which the files were listed, or on how often this cell is re-run.
subsample_rng = random.Random(SEED)
paths_train_1 = subsample_rng.sample(sorted(paths_train_1), num_train_samples_0)
paths_test_1 = subsample_rng.sample(sorted(paths_test_1), num_test_samples_0)

In [5]:
# Report the per-class sizes of both splits (they should match pairwise)
size_report = (
    f"Length train samples for class 0 (no food): {num_train_samples_0}",
    f"Length train samples for class 1 (food): {len(paths_train_1)}",
    f"Length test samples for class 0 (no food): {num_test_samples_0}",
    f"Length test samples for class 1 (food): {len(paths_test_1)}",
)
print(*size_report, sep="\n")

Length train samples for class 0 (no food): 16657
Length train samples for class 1 (food): 16657
Length test samples for class 0 (no food): 5553
Length test samples for class 1 (food): 5553


In [6]:
# Wrap the sampled class-1 paths in DataFrames and write them to the
# class-1 train and test destination folders
df_train = pd.DataFrame({"path": paths_train_1})
df_test = pd.DataFrame({"path": paths_test_1})
for split_df, split_dir in ((df_train, ROOT_TRAIN_DIR_1), (df_test, ROOT_TEST_DIR_1)):
    copy_images(split_df, split_dir)

Copying images to ..\data\classif_food_nofood_\train\1:   0%|          | 0/16657 [00:00<?, ?it/s]

Copying images to ..\data\classif_food_nofood_\test\1:   0%|          | 0/5553 [00:00<?, ?it/s]