In [38]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import os
from glob import glob
import shutil
import random

In [52]:
# Expected dataset structure, relative to `path`:
#   test/test
#   train/c0/ ... train/c9/  (one folder per class)
# There is no validation set, so we need to make one out of this.
path = './'

### Helper routines

In [50]:
def move(files, to):
    '''Move files into a dir.

    Args:
        files: List of file fullpaths.
        to: Fullpath to destination folder. Created if it does not exist.
    '''
    # shutil.move only places a file *inside* `to` when `to` is an existing
    # directory. Otherwise the file is renamed to `to` itself, and every
    # following file would overwrite the one moved before it.
    if not os.path.isdir(to):
        os.makedirs(to)
    for filepath in files:
        shutil.move(filepath, to)


def copy(files, to):
    '''Copy files into a dir.

    Args:
        files: List of file fullpaths.
        to: Fullpath to destination folder. Created if it does not exist.
    '''
    # shutil.copy only places a file *inside* `to` when `to` is an existing
    # directory. Otherwise `to` is treated as the target file name, and every
    # following copy would overwrite the previous one.
    if not os.path.isdir(to):
        os.makedirs(to)
    for filepath in files:
        shutil.copy(filepath, to)


def get_files(origin, count=None, randomize=False):
    '''Get a list of file names from a directory.

    Args:
        origin: Fullpath to the dir containing the files.
        count: How many files to retrieve.
            None returns every file. A non-negative int returns at most
            that many files. A float strictly between 0 and 1 is the
            fraction of all files in the dir to return (the resulting
            number of files is rounded with round()).
        randomize: Whether to retrieve files in a random order.
    Returns:
        A list of the names (not fullpaths) of the entries in the folder.
    Raises:
        ValueError: If count is not None, an int, or a float in (0, 1).
    '''
    # os.listdir returns entries in arbitrary order. Sorting first makes the
    # non-random result deterministic and lets a seeded shuffle be repeated.
    all_files = sorted(os.listdir(origin))
    if randomize:
        random.shuffle(all_files)

    if count is None:
        # Return all files
        return all_files
    # bool is a subclass of int; it is rejected so that True/False are not
    # silently read as 1/0 files.
    if isinstance(count, int) and not isinstance(count, bool):
        # Get this many files only
        return all_files[:count]
    if isinstance(count, float) and 0 < count < 1:
        # Return a percentage of all files
        num_files_to_return = int(round(len(all_files) * count))
        return all_files[:num_files_to_return]
    # Anything else used to fall through and return None, which only failed
    # later in the caller with an unrelated-looking TypeError.
    raise ValueError(
        "count must be None, an int, or a float between 0 and 1 "
        "(exclusive), got {!r}.".format(count))


def transfer_files(origin, destination, copy_only=True, randomize=False, count=None):
    '''Copy or move a selection of files from one dir to another.

    Args:
        origin: Fullpath to the dir the files are taken from.
        destination: Fullpath to the dir receiving the files.
            Created if it does not exist.
        copy_only: If true the files are copied, otherwise they are moved.
        randomize: Forwarded to get_files.
        count: Forwarded to get_files.
    Raises:
        ValueError: If origin is not a directory.
    '''
    origin_is_dir = os.path.isdir(origin)
    if not origin_is_dir:
        raise ValueError("{} is not a directory.".format(origin))

    destination_is_dir = os.path.isdir(destination)
    if not destination_is_dir:
        os.makedirs(destination)

    # get_files yields bare names; the transfer helpers need paths.
    selected_names = get_files(origin, randomize=randomize, count=count)
    selected_paths = []
    for name in selected_names:
        selected_paths.append(os.path.join(origin, name))

    transfer = copy if copy_only else move
    transfer(selected_paths, destination)

### Organize the dataset

In [51]:
# Sizes of the sets built for every class folder.
SAMPLE_TRAIN_COUNT = 110  # files copied from train/<class> to sample/train/<class>
SAMPLE_VALID_COUNT = 10   # of those, files moved on to sample/valid/<class>
VALID_FRACTION = 0.1      # share of train/<class> moved to valid/<class>
RANDOM_SEED = 42

# Seed the RNG so the random selection can be repeated on a fresh copy of the
# dataset (for the same directory listing order).
random.seed(RANDOM_SEED)

# Get all class folders. Stray non-directory entries under train/ are dropped
# because transfer_files raises ValueError for them; sorting fixes the order
# in which the classes are processed.
class_paths = sorted(
    class_path
    for class_path in glob(os.path.join(path, 'train/*'))
    if os.path.isdir(class_path))

for class_path in class_paths:
    class_name = os.path.basename(class_path)
    valid_destination = os.path.join(path, 'valid', class_name)

    # Re-running this cell would move a further share of train/ into valid/
    # each time, so classes that already have a validation set are skipped.
    if os.path.isdir(valid_destination) and os.listdir(valid_destination):
        print('Skipping {}: {} is already populated.'.format(
            class_name, valid_destination))
        continue

    # Create a sample train set
    sample_train_destination = os.path.join(path, 'sample/train', class_name)
    transfer_files(origin=class_path,
                   destination=sample_train_destination,
                   copy_only=True,
                   randomize=True,
                   count=SAMPLE_TRAIN_COUNT)

    # Create a sample validation set
    # Use the sample train destination to get the sample validation set.
    sample_valid_destination = os.path.join(path, 'sample/valid', class_name)
    transfer_files(origin=sample_train_destination,
                   destination=sample_valid_destination,
                   copy_only=False,
                   randomize=True,
                   count=SAMPLE_VALID_COUNT)

    # Create a validation set
    transfer_files(origin=class_path,
                   destination=valid_destination,
                   copy_only=False,
                   randomize=True,
                   count=VALID_FRACTION)