### 0. Make Imports and Set Up Device

In [11]:
from shutil import copy
from os import listdir, makedirs
from os.path import isdir, join, splitext

import torch
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision import transforms as tr

In [13]:
def setup_device() -> torch.device:
    """Choose the training device and report the choice on stdout.

    Returns:
        The first CUDA GPU (``cuda:0``) when PyTorch can see at least one
        CUDA device, otherwise the CPU device.
    """
    if torch.cuda.is_available():
        # Always take the first GPU, even when several are present.
        device_id = 0
        count = torch.cuda.device_count()
        name = torch.cuda.get_device_name(device_id)
        capability = torch.cuda.get_device_capability(device_id)
        print(f"{count} CUDA GPUs available. Using {name} with CUDA {capability[0]}.{capability[1]} capability.")
        return torch.device(f"cuda:{device_id}")

    print("No CUDA GPUs found. CPU selected as training device.")
    return torch.device("cpu")


# Resolve the compute device once: the first CUDA GPU if available, otherwise the CPU.
device = setup_device()

1 CUDA GPUs available. Using GeForce RTX 2080 SUPER with CUDA 7.5 capability.


### 1. Divide the Dataset into Categories
The dataset used here doesn't contain any explicit category labels. However, the source images are sorted by category, with each of the 17 categories having exactly 80 images, so we can label the images simply by counting them. The code assumes the raw images are placed in the _data/jpg_ directory. It splits the data into training, validation and testing subsets, then creates a subdirectory for each subset in the _data_ directory. In each of those, another set of subdirectories is created -- one per category, named after the category index (0 through 16) -- and the flower images from that category are copied into it. The number of files copied per category for each subset is defined in the _subset_splits_ dict.

In [2]:
def is_file_jpg(file_path: str) -> bool:
    """Tell whether ``file_path`` has a ``.jpg`` extension, ignoring letter case."""
    return splitext(file_path)[1].lower() == ".jpg"

In [3]:
raw_data_path = "data/jpg"
category_count = 17
images_per_category = 80

# How many images in each category should fall into a data subset
subset_splits = {"training": 56, "validation": 16, "testing": 8}
assert sum(subset_splits.values()) == images_per_category

# os.listdir() returns entries in arbitrary order, yet the category labels are
# derived purely from each image's position in this list. Sort explicitly so
# the labelling does not depend on the file system. The file names are
# zero-padded (image_0001.jpg, ...), so lexicographic order matches the
# numeric order of the images.
image_list = sorted(file for file in listdir(raw_data_path) if is_file_jpg(file))
assert len(image_list) == category_count * images_per_category
image_list[:10]

['image_0001.jpg',
 'image_0002.jpg',
 'image_0003.jpg',
 'image_0004.jpg',
 'image_0005.jpg',
 'image_0006.jpg',
 'image_0007.jpg',
 'image_0008.jpg',
 'image_0009.jpg',
 'image_0010.jpg']

In [4]:
def assign_images_to_categories(category_count: int, images_per_category: int, images: list) -> dict:
    """Label images by their position in ``images``.

    The list is cut into ``category_count`` consecutive chunks of
    ``images_per_category`` items; chunk number ``i`` becomes category ``i``.

    Args:
        category_count: Number of categories (keys) to produce.
        images_per_category: Size of each consecutive chunk.
        images: Images ordered so that each category is contiguous.

    Returns:
        A dict mapping category index (0-based) to the list of its images.
        Chunks that reach past the end of ``images`` are shorter or empty.
    """
    return {
        label: images[label * images_per_category:(label + 1) * images_per_category]
        for label in range(category_count)
    }
        

# Label every image by its position in the listing.
category_dict = assign_images_to_categories(category_count, images_per_category, image_list)

# Sanity checks: one entry per category, and the last category is fully populated.
assert len(category_dict) == category_count
assert len(category_dict[category_count - 1]) == images_per_category

In [5]:
def split_data_into_subsets(category_dict: dict, subset_splits: dict) -> (dict, dict, dict):
    """Split each category's image list into training, validation and testing parts.

    Args:
        category_dict: Mapping of category index to that category's images.
        subset_splits: Per-category image counts; the ``"training"`` and
            ``"validation"`` entries are read here.

    Returns:
        Three dicts ``(train, valid, test)`` keyed like ``category_dict``.
        Training takes the first ``subset_splits["training"]`` images,
        validation the next ``subset_splits["validation"]``, and testing
        everything that remains.
    """
    valid_start = subset_splits["training"]
    test_start = valid_start + subset_splits["validation"]

    train = {label: files[:valid_start] for label, files in category_dict.items()}
    valid = {label: files[valid_start:test_start] for label, files in category_dict.items()}
    test = {label: files[test_start:] for label, files in category_dict.items()}
    return train, valid, test
    
    
training_images, validation_images, testing_images = split_data_into_subsets(category_dict, subset_splits)

# Every subset must contain an entry for each category.
assert all(
    len(subset) == category_count
    for subset in (training_images, validation_images, testing_images)
)

# Spot-check one category per subset against the configured split sizes.
assert len(training_images[10]) == subset_splits["training"]
assert len(validation_images[4]) == subset_splits["validation"]
assert len(testing_images[16]) == subset_splits["testing"]

In [6]:
def create_directory(path: str) -> None:
    """Create ``path`` (including missing parent directories) if it does not exist.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing isdir() first.
    makedirs(path, exist_ok=True)
        

def create_subset_data_directories(subset_dict: dict, subset_name: str, raw_data_path: str,
                                   output_root: str = "data") -> None:
    """Copy one subset's images into a ``<output_root>/<subset_name>/<category>/`` tree.

    Args:
        subset_dict: Mapping of category index to the image file names that
            belong to this subset.
        subset_name: Name of the subset directory (e.g. ``"training"``).
        raw_data_path: Directory that holds the source image files.
        output_root: Directory under which the subset directory is created.
            Defaults to ``"data"``, the location used previously.
    """
    subset_dir_path = join(output_root, subset_name)
    # Created up front so the subset directory exists even for an empty subset.
    create_directory(subset_dir_path)

    for category_index, category_images in subset_dict.items():
        # One subdirectory per category, named after the category index.
        category_path = join(subset_dir_path, str(category_index))
        create_directory(category_path)

        for image in category_images:
            source_path = join(raw_data_path, image)
            destination = join(category_path, image)
            copy(source_path, destination)
        

# Materialize each subset on disk by copying its images out of the raw data directory.
create_subset_data_directories(training_images, "training", raw_data_path)
create_subset_data_directories(validation_images, "validation", raw_data_path)
create_subset_data_directories(testing_images, "testing", raw_data_path)

### 2. Pre-process Data and Load into Data Loaders

In [23]:
# The network that will be used in transfer learning has been pre-trained on
# normalized data. The same transformation must therefore be applied to new
# data for the training to be effective. Below are the values used for the
# original normalization, one per image channel.
normalization_means = [0.485, 0.456, 0.406]
normalization_stds = [0.229, 0.224, 0.225]

# Side length, in pixels, of the square images produced by the transforms.
final_image_size = 224

In [24]:
# The training transformations should help the network learn translation,
# rotation and size invariance, to reduce over-training. Both pipelines also
# normalize the input so it is statistically closer to the data the network
# was pre-trained on.
training_transforms = [
    tr.RandomRotation(degrees=10, expand=True),
    tr.RandomResizedCrop(size=final_image_size, scale=[0.75, 1.0]),
    tr.ToTensor(),
    tr.Normalize(mean=normalization_means, std=normalization_stds),
]

# Deterministic pipeline: resize to slightly above the target size, then
# center-crop down to it.
testing_transforms = [
    tr.Resize(size=final_image_size + 8),
    tr.CenterCrop(size=final_image_size),
    tr.ToTensor(),
    tr.Normalize(mean=normalization_means, std=normalization_stds),
]

In [29]:
# Number of images per batch, shared by all three data loaders.
batch_size = 64


def make_data_loader(data_path: str, transforms: list, batch_size: int, shuffle: bool = True) -> DataLoader:
    """Build a ``DataLoader`` over an image directory tree.

    Args:
        data_path: Root directory passed to ``ImageFolder``; it is expected to
            contain one subdirectory per category.
        transforms: Transformations applied, in order, to every loaded image.
        batch_size: Number of images per batch.
        shuffle: Whether the loader reshuffles the samples every epoch.
            Defaults to ``True`` (the previous hard-coded behaviour); pass
            ``False`` for a fixed evaluation order.

    Returns:
        A loader yielding batches of transformed images with their labels.
    """
    image_transformation = tr.Compose(transforms)
    data_set = ImageFolder(root=data_path, transform=image_transformation)
    # Pinned host memory is only requested when a CUDA device is available.
    should_pin_memory = torch.cuda.is_available()
    loader = DataLoader(dataset=data_set, shuffle=shuffle, pin_memory=should_pin_memory, batch_size=batch_size)
    return loader
    
    
# Only the training data gets the random augmentations. Validation uses the
# same deterministic pipeline as testing, so that validation metrics are
# repeatable between epochs and comparable with the test results.
training_loader = make_data_loader("data/training", training_transforms, batch_size)
validation_loader = make_data_loader("data/validation", testing_transforms, batch_size)
testing_loader = make_data_loader("data/testing", testing_transforms, batch_size)