## Custom Data Creation (From Food101)


In [2]:
import torch
import torchvision

import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Root folder that holds the Food101 download and every subset derived from it
import pathlib
data_dir = pathlib.Path('../data')

# Record the library versions this notebook was executed with
for library in (torch, torchvision):
    print(f'{library.__version__}')

2.8.0
0.23.0


## Download Data

- Food101 in torchvision.datasets - https://pytorch.org/vision/stable/generated/torchvision.datasets.Food101.html

- Original Food101 dataset - https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/

In [3]:
# Get training and testing data (downloaded into data_dir on first run)
def _load_split(split):
    """Build the Food101 dataset for one split, converting every image to a tensor."""
    return datasets.Food101(root=data_dir,
                            split=split,
                            transform=transforms.ToTensor(),
                            download=True)

train_data, test_data = (_load_split(split) for split in ('train', 'test'))

# Display both dataset summaries as the cell output
train_data, test_data

100%|██████████| 5.00G/5.00G [06:00<00:00, 13.9MB/s]   


(Dataset Food101
     Number of datapoints: 75750
     Root location: ../data
     split=train
     StandardTransform
 Transform: ToTensor(),
 Dataset Food101
     Number of datapoints: 25250
     Root location: ../data
     split=test
     StandardTransform
 Transform: ToTensor())

In [6]:
# Keep the dataset's class list so integer labels can be mapped back to names
class_names = train_data.classes
# Number of classes (shown as the cell output)
len(class_names)

101

In [7]:
# View the first sample (a tensor, because the dataset was built with ToTensor())
first_image, first_label = train_data[0]
print(class_names[first_label])
first_image

churros


tensor([[[0.9725, 0.9686, 0.9608,  ..., 0.3216, 0.3490, 0.3686],
         [0.9725, 0.9686, 0.9608,  ..., 0.3412, 0.3608, 0.3725],
         [0.9765, 0.9686, 0.9647,  ..., 0.3647, 0.3765, 0.3804],
         ...,
         [0.5961, 0.5922, 0.5882,  ..., 0.5804, 0.5922, 0.6078],
         [0.5882, 0.5843, 0.5765,  ..., 0.5843, 0.5922, 0.6039],
         [0.5843, 0.5765, 0.5647,  ..., 0.5922, 0.5961, 0.6039]],

        [[0.9569, 0.9529, 0.9451,  ..., 0.0941, 0.1098, 0.1294],
         [0.9569, 0.9529, 0.9451,  ..., 0.1137, 0.1216, 0.1333],
         [0.9608, 0.9529, 0.9490,  ..., 0.1373, 0.1373, 0.1412],
         ...,
         [0.2980, 0.2941, 0.2902,  ..., 0.5451, 0.5647, 0.5804],
         [0.2902, 0.2863, 0.2784,  ..., 0.5490, 0.5647, 0.5765],
         [0.2863, 0.2784, 0.2667,  ..., 0.5569, 0.5686, 0.5765]],

        [[0.9608, 0.9569, 0.9490,  ..., 0.0157, 0.0275, 0.0471],
         [0.9608, 0.9569, 0.9490,  ..., 0.0353, 0.0471, 0.0510],
         [0.9647, 0.9569, 0.9529,  ..., 0.0588, 0.0627, 0.

### Find subset of appropriate classes

current path: `../data/food-101/images/CLASS_NAME/IMAGES.jpg`

Next, collect the image filenames for the target classes (`pizza`, `steak`, `sushi`) and then copy those images into separate folders.

In [10]:
# Get a random fraction (set by amount_to_get below) of the target-class images
import random

# Setup data paths
data_path = data_dir / "food-101" / "images"
# Class folder names to keep; they must match the Food101 folder spelling exactly
target_classes = ['pizza', 'steak', 'sushi']

# Change amount of data to get (eg: 0.1 -> random 10% | 0.2 -> random 20%)
amount_to_get = 0.2

# Create function to separate a random amount of data
def get_subset(image_path = data_path,
               data_splits = ['train', 'test'],
               target_classes = ['pizza', 'steak', 'sushi'],
               amount = 0.1,
               seed = 42):
    """
    Randomly sample a fraction of the Food101 images that belong to target_classes.

    For each split, the label file `meta/<split>.txt` (located next to the images
    folder) is read, entries whose class folder is in `target_classes` are kept,
    and `round(amount * number_of_kept_entries)` of them are drawn without
    replacement.

    Args:
        image_path (pathlib.Path or str): the `images` directory, laid out as
            `images/CLASS_NAME/IMAGE.jpg`.
        data_splits (list[str]): split names; each must have a `<split>.txt` label file.
        target_classes (list[str]): class folder names to keep.
        amount (float): fraction of the kept entries to sample (0.1 -> 10%).
        seed (int): value used to seed Python's global `random` module, so the
            same arguments always give the same sample.

    Returns:
        dict: maps each split name to a list of `pathlib.Path` objects pointing
        at the sampled `.jpg` files.

    Raises:
        FileNotFoundError: if a split's label file does not exist.
        ValueError: raised by `random.sample` if the number to sample is negative
            or larger than the number of available entries.
    """
    # The default list arguments are only read, never mutated, so sharing them is safe.
    image_root = pathlib.Path(image_path)
    # Look for labels beside the images folder so a custom image_path stays self-consistent
    meta_dir = image_root.parent / 'meta'

    # Honour the caller's seed (seeded once, so the splits draw from one sequence)
    random.seed(seed)
    label_splits = {}

    # Get labels
    for data_split in data_splits:
        print(f'[INFO] Creating image split for: {data_split}...')
        label_path = meta_dir / f'{data_split}.txt'
        with open(label_path, 'r') as f:
            # Each line looks like "CLASS_NAME/IMAGE_ID"; keep only the target classes
            labels = [line.strip('\n') for line in f if line.split('/')[0] in target_classes]

        # Get random subset of target classes image IDs
        number_to_sample = round(amount * len(labels))
        print(f'[INFO] Getting random subset of {number_to_sample} images for {data_split}...')
        sampled_images = random.sample(labels, k=number_to_sample)

        # Apply full paths
        label_splits[data_split] = [image_root / f'{sample_image}.jpg' for sample_image in sampled_images]

    return label_splits

# Sample the configured fraction of target-class images for each split
label_splits = get_subset(amount = amount_to_get)
# Peek at the first ten sampled training image paths
label_splits['train'][:10]

[INFO] Creating image split for: train...
[INFO] Getting random subset of 450 images for train...
[INFO] Creating image split for: test...
[INFO] Getting random subset of 150 images for test...


[PosixPath('../data/food-101/images/pizza/3269634.jpg'),
 PosixPath('../data/food-101/images/pizza/1524655.jpg'),
 PosixPath('../data/food-101/images/steak/2825100.jpg'),
 PosixPath('../data/food-101/images/steak/225990.jpg'),
 PosixPath('../data/food-101/images/steak/1839481.jpg'),
 PosixPath('../data/food-101/images/pizza/38349.jpg'),
 PosixPath('../data/food-101/images/pizza/3018077.jpg'),
 PosixPath('../data/food-101/images/sushi/93139.jpg'),
 PosixPath('../data/food-101/images/pizza/2702825.jpg'),
 PosixPath('../data/food-101/images/sushi/200025.jpg')]

### Copy training and testing images to dedicated folders


In [11]:
# Create target directory path (built from data_dir so it follows the data location)
# round() instead of int(): int() truncates float products, e.g. int(0.29 * 100) == 28
percent_to_get = round(amount_to_get * 100)
target_dir = data_dir / f'pizza_steak_sushi_{percent_to_get}_percent'
target_dir_name = str(target_dir)
print(f"Creating directory: '{target_dir_name}'")

# Make the directories (no error if they already exist, so the cell can be re-run)
target_dir.mkdir(parents=True, exist_ok=True)


Creating directory: '../data/pizza_steak_sushi_20_percent'


In [13]:
import shutil

for split_name, sampled_paths in label_splits.items():
    for image_path in sampled_paths:
        # Mirror the <split>/<class>/<file> layout inside the target directory
        destination = target_dir / split_name / image_path.parent.stem / image_path.name

        # exist_ok=True makes this a no-op once the class folder has been created
        destination.parent.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Copying {image_path} to {destination}...")
        shutil.copy2(image_path, destination)

[INFO] Copying ../data/food-101/images/pizza/3269634.jpg to ../data/pizza_steak_sushi_20_percent/train/pizza/3269634.jpg...
[INFO] Copying ../data/food-101/images/pizza/1524655.jpg to ../data/pizza_steak_sushi_20_percent/train/pizza/1524655.jpg...
[INFO] Copying ../data/food-101/images/steak/2825100.jpg to ../data/pizza_steak_sushi_20_percent/train/steak/2825100.jpg...
[INFO] Copying ../data/food-101/images/steak/225990.jpg to ../data/pizza_steak_sushi_20_percent/train/steak/225990.jpg...
[INFO] Copying ../data/food-101/images/steak/1839481.jpg to ../data/pizza_steak_sushi_20_percent/train/steak/1839481.jpg...
[INFO] Copying ../data/food-101/images/pizza/38349.jpg to ../data/pizza_steak_sushi_20_percent/train/pizza/38349.jpg...
[INFO] Copying ../data/food-101/images/pizza/3018077.jpg to ../data/pizza_steak_sushi_20_percent/train/pizza/3018077.jpg...
[INFO] Copying ../data/food-101/images/sushi/93139.jpg to ../data/pizza_steak_sushi_20_percent/train/sushi/93139.jpg...
[INFO] Copying ../

In [15]:
# Check lengths of the directories
def walk_through_dir(dir_path):
    """
    Print a one-line summary for dir_path and every directory beneath it.

    Args:
        dir_path (str or path-like): directory to start walking from

    Returns:
        None. For each directory visited by os.walk, prints:
            - how many immediate subdirectories it contains
            - how many files it contains (reported as "images")
            - the path of that directory
    """
    import os

    for current_dir, subdir_names, file_names in os.walk(dir_path):
        subdir_count = len(subdir_names)
        file_count = len(file_names)
        print(f"There are {subdir_count} directories and {file_count} images in '{current_dir}'")

# Print the directory/file counts for the subset folder and its subfolders
walk_through_dir(target_dir)
    

There are 2 directories and 0 images in '../data/pizza_steak_sushi_20_percent'
There are 3 directories and 0 images in '../data/pizza_steak_sushi_20_percent/test'
There are 0 directories and 58 images in '../data/pizza_steak_sushi_20_percent/test/steak'
There are 0 directories and 46 images in '../data/pizza_steak_sushi_20_percent/test/sushi'
There are 0 directories and 46 images in '../data/pizza_steak_sushi_20_percent/test/pizza'
There are 3 directories and 0 images in '../data/pizza_steak_sushi_20_percent/train'
There are 0 directories and 146 images in '../data/pizza_steak_sushi_20_percent/train/steak'
There are 0 directories and 150 images in '../data/pizza_steak_sushi_20_percent/train/sushi'
There are 0 directories and 154 images in '../data/pizza_steak_sushi_20_percent/train/pizza'


In [16]:
# Zip up images folder to be more easily transported

# Zip pizza, steak and sushi images
# NOTE(review): the archive name spells "shushi" while the image folder uses "sushi";
# it is left unchanged here in case later cells rely on the existing file name - confirm, then rename.
# round() instead of int(): int() truncates float products, e.g. int(0.29 * 100) == 28
zip_file_name = data_dir / f"pizza_steak_shushi_{round(amount_to_get * 100)}_percent"

# make_archive appends ".zip" to the base name and returns the archive's path
shutil.make_archive(zip_file_name, format='zip', root_dir=target_dir)

'/Users/qbit-glitch/Desktop/coding-projects/pytorch_tutorials_from_official_docs/ztm_pytorch_course/data/pizza_steak_shushi_20_percent.zip'

In [17]:
# List everything in ../data/ (long format, including hidden entries)
!ls -la ../data/

total 9819872
drwxr-xr-x  6 qbit-glitch  staff         192 10 Nov 09:45 [34m.[m[m
drwxr-xr-x  8 qbit-glitch  staff         256  8 Nov 11:13 [34m..[m[m
drwxr-xr-x  6 qbit-glitch  staff         192  9 Jul  2014 [34mfood-101[m[m
-rw-r--r--  1 qbit-glitch  staff  4996278331  8 Nov 11:19 food-101.tar.gz
-rw-r--r--  1 qbit-glitch  staff    31491084 10 Nov 09:45 pizza_steak_shushi_20_percent.zip
drwxr-xr-x  4 qbit-glitch  staff         128 10 Nov 09:38 [34mpizza_steak_sushi_20_percent[m[m


In [None]:
!mkdir -p pizza_steak_shushi
!unzip ../dr