# Split Dataset

Creates the three dataset splits (train, validation and test). Every class is divided separately, so each split receives the same fraction of every class.

In [1]:
import os

import numpy as np

import torchvision.transforms as transforms
import utils

## Definition of the percentage of the data used for each split

In [2]:
# Fraction of every class that goes to each split.
# NOTE: the split cell only reads valid_size and test_size; the training split
# receives whatever remains, so train_size is honoured only if the three
# fractions add up to 1.
train_size = 0.65
valid_size = 0.25
test_size  = 0.10

# Fail fast on a typo instead of silently producing a different split.
assert abs(train_size + valid_size + test_size - 1.0) < 1e-9, \
    "train_size + valid_size + test_size must be equal to 1"

## Loading the data from the dataset folder

In [3]:
# Root folder of the dataset; the split folders are later written inside it.
dataset_path = "dataset/"

# utils.load_data returns an iterable of (image, label) pairs together with
# the list of class names; it is pointed at the sub-folder holding every image.
loader, classes = utils.load_data(f"{dataset_path}all/")

## Creating a matrix with the images grouped by class

Each row of the array represents one class and holds all the images of that class. Example below:

images_class_matrix[0] -> img1, img2, img3  
images_class_matrix[1] -> img1  
images_class_matrix[2] -> img1, img2  
.  
.  
images_class_matrix[6] -> img1, img2, img4, img6
  
   
Where the indexes represent the following classes:  
0 -> bicycle  
1 -> bus  
2 -> car  
.  
.  
6 -> van

In [4]:
%%time

# One bucket per class for the full dataset and for each split.
# The split buckets are created here and filled by the next cell.
number_of_classes = len(classes)
images_class_matrix = [[] for _ in range(number_of_classes)]
train_images = [[] for _ in range(number_of_classes)]
test_images  = [[] for _ in range(number_of_classes)]
valid_images = [[] for _ in range(number_of_classes)]

for image, label in loader:
    # Drop the singleton dimensions and move the first remaining axis to the end.
    # Assumes the loader yields one channel-first image per item, so the result
    # is (height, width, channels) - TODO confirm against utils.load_data.
    image = np.array(image).squeeze().transpose(1, 2, 0)
    label = int(np.array(label).squeeze())
    images_class_matrix[label].append(image)

# images_class_matrix intentionally stays a plain list of lists. The classes
# may hold different numbers of images, and NumPy >= 1.24 raises a ValueError
# when asked to build an array from such ragged input. The following cells only
# iterate over it, take len() and index it, which lists support as well.



CPU times: user 1min 2s, sys: 5.26 s, total: 1min 7s
Wall time: 23.6 s


## Add the images to the split sets using *images_class_matrix*

In [5]:
# Start from empty splits so that re-running this cell never appends the same
# images twice.
train_images = []
valid_images = []
test_images = []

for class_images in images_class_matrix:
    len_class_images = len(class_images)
    split1 = int(np.floor(valid_size * len_class_images)) # end validation
    split2 = int(np.floor((valid_size * len_class_images) + (test_size * len_class_images))) # end test

    # Images are taken in the order they were loaded:
    # [0, split1) -> validation, [split1, split2) -> test, [split2, end) -> train.
    # The training split is the remainder; train_size is not read here.
    valid_images.append(list(class_images[:split1]))
    test_images.append(list(class_images[split1:split2]))
    train_images.append(list(class_images[split2:]))

# The splits stay plain lists of lists (one inner list per class). Classes may
# end up with different numbers of images, and NumPy >= 1.24 raises a ValueError
# when building an array from ragged input. The following cells only need
# len() and iteration.

## Count how many images we have in each split

In [6]:
# Number of images per class, for each split.
counter_train = np.array([len(class_images) for class_images in train_images])
counter_valid = np.array([len(class_images) for class_images in valid_images])
counter_test = np.array([len(class_images) for class_images in test_images])

# Grand total over the three splits.
counter_total = sum(counter.sum() for counter in (counter_train, counter_valid, counter_test))

# Summary: one line per split, then the overall total.
for template, total in (
    ("%-4s images for training", counter_train.sum()),
    ("%-4s images for validation", counter_valid.sum()),
    ("%-4s images for test", counter_test.sum()),
):
    print(template % str(total))
print("Total of %s images" % str(counter_total))

1086 images for training
414  images for validation
164  images for test
Total of 1664 images


## Save the images to their folders inside the dataset folder

In [7]:
# Each split is written to <dataset_path>/<split name>/<class name>/.
splits = [train_images, valid_images, test_images]
split_names = ["train", "valid", "test"]

for split_name, split in zip(split_names, splits):
    directory = f"{dataset_path}{split_name}/"
    for class_index, class_images in enumerate(split):
        class_name = classes[class_index]
        directory_class = f"{directory}{class_name}/"
        utils.create_directory(directory_class)

        # Files are named <class name>_<running number>, numbered from 0 within
        # each class folder. No extension is appended here - presumably
        # utils.save_image adds one; verify against its implementation.
        for image_number, image in enumerate(class_images):
            image_path = f"{directory_class}{class_name}_{image_number}"
            utils.save_image(image_path, image)