# Data Creation from Food101 Database

This notebook creates an image-classification dataset of different food types from the Food101 database, which is now available directly in PyTorch through `torchvision.datasets`.

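The original target was three classes: Pizza, Steak, Sushi. Note that the code below currently sets `target_classes` to every Food101 class, so the same folder layout is produced for all classes rather than only these three.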

The target dataset folder structure is as follows:

```
pizza_steak_sushi/
    train/
        pizza/
            image01.jpeg
            image02.jpeg
            ...
        steak/
            image04.jpeg
            image05.jpeg
            ...
        sushi/
            image07.jpeg
            ...
    test/
        pizza/
            image101.jpeg
            image102.jpeg
            ...
        steak/
            image104.jpeg
            image105.jpeg
            ...
        sushi/
            image107.jpeg
            ...
```

In [None]:
import torchvision.datasets as datasets

## Download data

Get the Food101 dataset from PyTorch.
* Food101 in `torchvision.datasets` - https://pytorch.org/vision/stable/generated/torchvision.datasets.Food101.html
* Original Food101 dataset - https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/ 

In [None]:
# Setup data directory
from pathlib import Path

data_dir = Path("../data")

# Arguments shared by both splits. No transform is passed: the images are only
# needed as files on disk for the copy step later in this notebook.
food101_kwargs = {"root": data_dir, "download": True}

# Get training data
train_data = datasets.Food101(split="train", **food101_kwargs)

# Get testing data
test_data = datasets.Food101(split="test", **food101_kwargs)

In [None]:
# Display the dataset object's summary to confirm the training split loaded
train_data

In [None]:
# Class names exposed by the dataset object; reused below as the target classes
class_names = train_data.classes
class_names

## Extract the Subset of Target Classes

A list of image filenames for the target classes (for example `pizza`, `steak`, `sushi`) is created, and the corresponding images are then copied to separate folders.

In [None]:
# Get 100% of the images listed for each split (train and test)
import random

# Setup data paths: the images live under <data_dir>/food-101/images
data_path = data_dir / "food-101" / "images"
# Use every Food101 class; swap in a short list such as the one in the trailing
# comment to build a smaller subset instead
target_classes = class_names #["pizza", "steak", "sushi"]

# Fraction of each split's matching images to sample (1.0 = all of them)
amount_to_get = 1.0

# Create function to separate a random amount of data
def get_subset(image_path=data_path,
               data_splits=["train", "test"],
               target_classes=["pizza", "steak", "sushi", "hamburger", "apple_pie"],
               amount=0.1,
               seed=42):
    """Randomly sample image paths of the target classes for each data split.

    For every split, the label file ``<data_dir>/food-101/meta/<split>.txt`` is
    read, the entries whose class (the text before the first "/") is in
    ``target_classes`` are kept, and a random fraction of them is turned into
    full ``.jpg`` paths under ``image_path``.

    Note: the label files are located through the notebook-level ``data_dir``
    variable, not through ``image_path``, so ``data_dir`` must be defined
    before this function is called.

    Args:
        image_path (Path): directory that contains one sub-folder per class.
        data_splits (list[str]): split names; each must have a matching
            ``<split>.txt`` label file.
        target_classes (iterable[str]): class names to keep.
        amount (float): fraction of the matching labels to sample per split.
        seed (int): seed applied to the global ``random`` module state.

    Returns:
        dict[str, list[Path]]: split name -> sampled image paths.

    Raises:
        FileNotFoundError: if a split's label file does not exist.
        ValueError: if ``amount`` yields a sample size that is negative or
            larger than the number of matching labels (from ``random.sample``).
    """
    # Honour the caller's seed; it was previously hard-coded to 42, which made
    # the `seed` argument a no-op.
    random.seed(seed)

    # Set membership keeps the per-line class check cheap for long class lists.
    # The list defaults above are only read, never mutated.
    wanted_classes = set(target_classes)
    label_splits = {}

    # Get labels
    for data_split in data_splits:
        print(f"[INFO] Creating image split for: {data_split}...")
        label_path = data_dir / "food-101" / "meta" / f"{data_split}.txt"
        with open(label_path, "r") as f:
            # Each line looks like "<class>/<image id>"; keep the target classes only
            labels = [line.strip("\n") for line in f if line.split("/")[0] in wanted_classes]

        # Get random subset of target classes image ID's
        number_to_sample = round(amount * len(labels))
        print(f"[INFO] Getting random subset of {number_to_sample} images for {data_split}...")
        sampled_images = random.sample(labels, k=number_to_sample)

        # Apply full paths (the label files omit the ".jpg" extension)
        image_paths = [Path(str(image_path / sample_image) + ".jpg") for sample_image in sampled_images]
        label_splits[data_split] = image_paths
    return label_splits
        
# Build the per-split image path lists using the settings defined above
label_splits = get_subset(
    image_path=data_path,
    data_splits=["train", "test"],
    target_classes=target_classes,
    amount=amount_to_get)
# Preview the first ten training image paths
label_splits["train"][:10]

## Move training and testing images to dedicated folders

In [None]:
# Create target directory path
# round() rather than int(): a product such as 0.29 * 100 evaluates to
# 28.999999999999996, which int() would truncate to 28 and mislabel the folder.
# (For a hand-picked class subset, a name listing the classes, e.g.
# "pizza_steak_sushi_applepie_hamburger_<N>_percent", is more descriptive.)
target_dir_name = f"../data/food-101_{round(amount_to_get * 100)}_percent"
print(f"Creating directory: '{target_dir_name}'")

# Setup the directories
target_dir = Path(target_dir_name)

# Make the directories (no error if they already exist)
target_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import shutil

# Copy every sampled image to <target_dir>/<split>/<class>/<file name>
for image_split, split_image_paths in label_splits.items():
    for image_path in split_image_paths:
        # The class folder name comes from the source image's parent directory
        dest_dir = target_dir / image_split / image_path.parent.stem / image_path.name
        # exist_ok=True makes this a no-op once the class folder exists
        dest_dir.parent.mkdir(parents=True, exist_ok=True)
        print(f"[INFO] Copying {image_path} to {dest_dir}...")
        shutil.copy2(image_path, dest_dir)

In [None]:
# Check lengths of directories
def walk_through_dir(dir_path):
    """Print a one-line summary for every directory found under ``dir_path``.

    The tree is traversed with ``os.walk``; for each directory visited
    (including ``dir_path`` itself) one line is printed giving the number of
    immediate sub-directories, the number of files it holds, and its path.

    Args:
        dir_path (str or path-like): root directory to inspect.

    Returns:
        None. The summary is written to standard output only.
    """
    import os

    tree = os.walk(dir_path)
    for entry in tree:
        # Each entry is (directory path, sub-directory names, file names)
        dirpath, dirnames, filenames = entry
        print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")
    
# Summarise how many images ended up in each split/class folder
walk_through_dir(target_dir)

There is a total of 750 images per class for training and 250 images per class for testing.

In [None]:
!pip install imagenet-downloader pandas requests

In [None]:
import random
import os
import requests
from imagenet_downloader import download

# Set your ImageNet credentials
# They are read from the environment so real credentials never have to be saved
# in the notebook; the placeholder strings are only used when the variables
# IMAGENET_USERNAME / IMAGENET_ACCESS_TOKEN are not set.
IMAGENET_USERNAME = os.environ.get("IMAGENET_USERNAME", "your_username")
IMAGENET_ACCESS_TOKEN = os.environ.get("IMAGENET_ACCESS_TOKEN", "your_access_token")

# Path to save images
DOWNLOAD_DIR = "./imagenet_unknown"

# Function to download images for a given synset
def download_images(synset_id, num_images, split, output_dir):
    """Download images of one synset into ``<output_dir>/<split>/<synset_id>``.

    The destination folder is created if needed, then the download is
    delegated to ``imagenet_downloader.download`` using the notebook-level
    ``IMAGENET_USERNAME`` and ``IMAGENET_ACCESS_TOKEN`` values.

    NOTE(review): the keyword names handed to ``download`` are reproduced from
    the existing call; confirm them against the installed package's API.

    Args:
        synset_id (str): WordNet ID of the synset, e.g. "n01440764".
        num_images (int): number of images requested.
        split (str): split sub-folder name, e.g. "train" or "test".
        output_dir (str): root folder that holds the split folders.
    """
    target_dir = os.path.join(output_dir, split, synset_id)
    os.makedirs(target_dir, exist_ok=True)
    print(f"Downloading {num_images} images for {synset_id} into {target_dir}...")

    request = {
        "username": IMAGENET_USERNAME,
        "accesskey": IMAGENET_ACCESS_TOKEN,
        "wnid": synset_id,
        "output_dir": target_dir,
        "num_images": num_images,
    }
    download(**request)

    print(f"Completed downloading for {synset_id}.")

# Define random synsets (non-food related)
synsets = [
    "n01440764",  # Tench
    "n01530575",  # Brambling
    "n01629819",  # European fire salamander
    "n01770393",  # Scorpion
    "n01843383",  # Toucan
    "n02102040",  # English springer
    "n02951358",  # Canoe
    "n03272010",  # Electric guitar
    "n03876231",  # Paintbrush
    "n04591713",  # Wine bottle
]

# Randomly select 5 synsets for training and 5 for testing.
# A dedicated, seeded generator makes the split identical after every kernel
# restart, and sampling into a new list (instead of shuffling in place) leaves
# `synsets` in its declared order, so re-running this cell gives the same split.
SYNSET_SPLIT_SEED = 42
shuffled_synsets = random.Random(SYNSET_SPLIT_SEED).sample(synsets, k=len(synsets))
train_synsets = shuffled_synsets[:5]
test_synsets = shuffled_synsets[5:]

# Download plan, one row per split: (split name, synsets to fetch, images per synset)
download_plan = [
    ("train", train_synsets, 100),
    ("test", test_synsets, 50),
]

# Download training images first, then test images
for split_name, split_synsets, images_per_synset in download_plan:
    for synset in split_synsets:
        download_images(
            synset_id=synset,
            num_images=images_per_synset,
            split=split_name,
            output_dir=DOWNLOAD_DIR,
        )

print("Download completed!")
