### Data preparation

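The code below creates a new directory holding a smaller, organised subsample of the original dataset: for each category, 1,000 images go to training, 500 to validation and 500 to test, and each of these subsets gets one subfolder per category.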

In [36]:
import os, shutil

In [37]:
# Directory containing the complete, original dataset: the images used below are read
# directly from this one folder, with no train/validation/test split and no per-category subfolders.
original_dataset_dir = '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data'

# Root directory for the smaller dataset built by this notebook. It is divided into
# train/validation/test directories, each of which is further divided into category directories.
# NOTE(review): both paths are absolute and machine-specific - adjust them before running elsewhere.
base_dir = '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small'

In [38]:
# Every entry found directly inside the original dataset directory is counted as an image.
dataset_entries = os.listdir(original_dataset_dir)
print('Total number of images:', len(dataset_entries))

Total number of images: 25000


In [39]:
# Target layout: <base_dir>/<subset>/<category>, where the subset is one of
# train / validation / test and the category is one of cats / dogs.
categories = ['cats', 'dogs']
str_train_val_test = ['train', 'validation', 'test']


def _ensure_dir(path, announce=True):
    """Create ``path`` with ``os.mkdir`` unless it already exists.

    When the directory is actually created and ``announce`` is true, a
    'Created directory' message is printed; nothing is printed otherwise.
    """
    if os.path.exists(path):
        return
    os.mkdir(path)
    if announce:
        print('Created directory: ', path)


_ensure_dir(base_dir)

for dir_type in str_train_val_test:
    train_test_val_dir = os.path.join(base_dir, dir_type)
    # Subset-level directories are created silently; only the root and the
    # category-level directories are reported.
    _ensure_dir(train_test_val_dir, announce=False)

    for category in categories:
        dir_type_category = os.path.join(train_test_val_dir, category)
        _ensure_dir(dir_type_category)

In [40]:
# Image index ranges assigned to each subset, in the same order as
# str_train_val_test: 0-999 -> train, 1000-1499 -> validation, 1500-1999 -> test.
# The same ranges are applied to every category.
subset_ranges = [range(1000), range(1000, 1500), range(1500, 2000)]

# Maps '<subset>_dir' and '<subset>_<category>_dir' to the corresponding directory path.
directories_dict = {}

for index_range, dir_type in zip(subset_ranges, str_train_val_test):
    subset_dir = os.path.join(base_dir, dir_type)
    # Registered unconditionally, so the key exists even when there is nothing to copy.
    directories_dict[dir_type + "_dir"] = subset_dir

    for category in categories:
        category_dir = os.path.join(subset_dir, category)
        directories_dict[dir_type + "_" + category + "_dir"] = category_dir

        # File names are built as '<category without its last character>.<index>.jpg',
        # e.g. 'cats' and index 0 give 'cat.0.jpg'.
        file_prefix = category[:-1]

        for image_index in index_range:
            fname = file_prefix + "." + str(image_index) + ".jpg"
            destination = os.path.join(category_dir, fname)

            # Skip files that are already in place so re-running the cell does not copy them again.
            if not os.path.isfile(destination):
                source = os.path.join(original_dataset_dir, fname)
                shutil.copyfile(source, destination)

In [41]:
# Display the directory paths collected for every subset and subset/category pair.
directories_dict

{'test_cats_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/test/cats',
 'test_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/test',
 'test_dogs_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/test/dogs',
 'train_cats_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/train/cats',
 'train_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/train',
 'train_dogs_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/train/dogs',
 'validation_cats_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/validation/cats',
 'validation_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/validation',
 'validation_dogs_dir': '/mnt/ml-team/homes/rafal.jakubanis/blogpost2-resnet50/data_small/validation/dogs'}

In [42]:
# Report how many files each subset/category directory contains.
# Each pair is (subset name used in the directories_dict keys, word used in the printed label).
subsets_to_report = [('train', 'training'), ('test', 'test'), ('validation', 'validation')]

for position, (subset, label) in enumerate(subsets_to_report):
    # A separator line is printed between subsets, but not before the first one.
    if position > 0:
        print("-"*32)

    for animal in ('cat', 'dog'):
        # Each count is read from that animal's own directory ('<subset>_cats_dir' / '<subset>_dogs_dir').
        animal_dir = directories_dict[subset + '_' + animal + 's_dir']
        print('Total ' + label + ' ' + animal + ' images:', len(os.listdir(animal_dir)))


Total training cat images: 1000
Total training dog images: 1000
--------------------------------
Total test cat images: 500
Total test dog images: 500
--------------------------------
Total validation cat images: 500
Total validation dog images: 500
