In [1]:
import os
import os.path as osp
import shutil

import numpy as np

In [2]:
# notebook counterpart of an `import` in a .py file: runs the other notebooks so the names they define can be used here
%run utils.ipynb
%run DatasetManager.ipynb

In [3]:
class ValidationManager():
    """
    Defines the way the files are split into train and test groups.

    Abstract base class: it stores the common split configuration and provides
    the helpers shared by the concrete managers. Subclasses must override split().
    """
    def __init__(self, train_size = 0.9, test_size = 0.1, shuffle = True, random_state = None):
        """
        Description:
        Builds the ValidationManager.
        
        Parameters:
        train_size (float, 0.9): relative size of the train set.
        test_size (float, 0.1): relative size of the test set.
        shuffle (boolean, True): if the data must be shuffled before splitting.
        random_state (int, None): random seed for results reproducibility.
        
        Returns:
        vm (ValidationManager): the built ValidationManager.
        """
        self.train_size_ = train_size
        self.test_size_ = test_size
        self.shuffle_ = shuffle
        self.random_state_ = random_state
    
    @staticmethod
    def __normalize__(*values):
        """
        Description:
        Normalizes a list of values so that they sum to 1.
        
        Parameters:
        values (float, float, ..., float): the values to normalize.
        
        Returns:
        values (list[float]): a list with the normalized values.
        
        Raises:
        ValueError: if the values do not add up to a positive number.
        """
        # nothing to normalize
        if not values:
            return []
        
        total = sum(values)
        if total <= 0:
            raise ValueError(f"The split sizes must add up to a positive number, got {total}.")
        
        # always divides (a no-op when total is already 1) so the result is always a list
        return [value / total for value in values]
    
    @staticmethod
    def __get_indexes__(total_len, *splits):
        """
        Description:
        Gets the index of each split.
        
        Parameters:
        total_len (int): the total len of the data.
        splits (float, float, ..., float): the data percentage for each split.
        
        Returns:
        indexes (list(int)): a list with the final index for each split.
        """
        indexes = []
        cumulative = 0
        
        # every split but the last one ends at the truncated cumulative percentage
        for split in splits[:-1]:
            cumulative += split
            indexes.append(int(total_len * cumulative))
        
        # the last split always ends at total_len, so no element is left out
        return indexes + [total_len]
    
    @staticmethod
    def __get_slices__(*indexes):
        """
        Description:
        Gets the slices making intervals between consecutive indexes.
        
        Parameters:
        indexes (int, int, ..., int): the indexes to make the union.
        
        Returns:
        slices (list[slice]): a list with all the slices (one less than the number of indexes).
        """
        
        return [slice(start, end) for start, end in zip(indexes[:-1], indexes[1:])]
    
    @staticmethod
    def __make_validation_structure__(splits_dir, replications = 1):
        """
        Description:
        Makes the structure for the validation.
        Any existing splits directory is deleted first.
        
        Parameters:
        splits_dir (str): path like. Where the splits are going to be done.
        replications (int): the number of training groups.
        
        Returns:
        None.
        """
        # creates an empty split root directory
        if osp.isdir(splits_dir):
            shutil.rmtree(splits_dir)
        
        os.mkdir(splits_dir)
        
        # creates the new splits directories, one for each replication: f_1, f_2, ...
        for index in range(1, replications + 1):
            os.mkdir(osp.join(splits_dir, f"f_{index}"))
            
    @staticmethod
    def __get_splits_dict__(splits_names):
        """
        Description:
        Gets the structure's dict of the splits directory.
        
        Parameters:
        splits_names (list[list[str]]): the list with the splits directories.
        A two elements entry is a [train, validation] pair, any other entry is the test one.
        
        Returns:
        d (dict): the structure's dict, {"f_1": {"train": ..., "val": ...}, ..., "test": ...}.
        """
        results = {}
        for index, names in enumerate(splits_names):
            if len(names) == 2:
                # the folds are numbered from 1 following their position in the list
                results[f"f_{index+1}"] = {"train": names[0], "val": names[1]}
            else:
                results["test"] = names[0]
        
        return results

    # NOTE(review): AOP.excepter is defined outside this file (presumably in utils.ipynb);
    # how it handles the NotImplementedError is not visible here.
    @AOP.excepter(NotImplementedError)
    def split(self, dataset):
        """
        Description:
        Makes the splits. Must be overridden by the subclasses.
        
        Parameters:
        dataset (DatasetManager): the dataset where the data is.

        Returns:
        None.
        
        Raises:
        NotImplementedError: always, this class is abstract.
        """
        raise NotImplementedError("You can not use this abstract class to make the split.")

In [4]:
class ValidationManagerTrainValTest(ValidationManager):
    """
    Defines the way the files are split into train, validation and test groups.
    """
    def __init__(self, train_size = 0.75, val_size = 0.15, test_size = 0.1,
                 shuffle = True, random_state = None):
        """
        Description:
        Builds the ValidationManagerTrainValTest.
        
        Parameters:
        train_size (int | float, 0.75): relative size of the train set.
        val_size (int | float, 0.15): relative size of the validation set.
        test_size (int | float, 0.1): relative size of the test set.
        shuffle (boolean, True): if the data must be shuffled before splitting.
        random_state (int, None): random seed for results reproducibility.
        
        The three sizes are normalized to sum 1 when split() is called.
        
        Returns:
        vm (ValidationManagerTrainValTest): The built ValidationManagerTrainValTest.
        """
        super().__init__(train_size, test_size, shuffle, random_state)
        self.val_size_ = val_size

    def split(self, dataset):
        """
        Description:
        Makes 3 splits in the dataset: train, validation and test.
        Writes "f_1/train.txt", "f_1/val.txt" and "test.txt" inside the "splits"
        directory of the dataset root, one image name (without extension) per line.
        
        Parameters:
        dataset (DatasetManager): the dataset where the data is.
        It must provide get_images() and the root_dir_ attribute.
        
        Returns:
        split_result (dict): the split structure,
        {"f_1": {"train": "f_1/train.txt", "val": "f_1/val.txt"}, "test": "test.txt"}.
        """
        # normalizes the sizes (and keeps the normalized values in the manager)
        normalized_sizes = ValidationManager.__normalize__(self.train_size_, self.val_size_, self.test_size_)
        self.train_size_, self.val_size_, self.test_size_ = normalized_sizes

        # gets the final index of each split
        ds_images = dataset.get_images()
        train_index, val_index, test_index = ValidationManager.__get_indexes__(len(ds_images), *normalized_sizes)
        
        # shuffles it if requested
        # NOTE(review): the shuffle is done in place on whatever get_images() returned
        if self.shuffle_:
            # "is not None" so a seed of 0 is not ignored
            if self.random_state_ is not None:
                np.random.seed(self.random_state_)
            np.random.shuffle(ds_images)

        # the files to write and the slice of images that goes into each one
        splits_dir = osp.join(dataset.root_dir_, "splits")
        splits_names = ["f_1/train.txt", "f_1/val.txt", "test.txt"]
        splits_slices = ValidationManager.__get_slices__(0, train_index, val_index, test_index)

        # creates the splits structure (removes any previous one)
        ValidationManager.__make_validation_structure__(splits_dir, 1)

        # saves all the splits: one file for each slice
        for file_name, images_slice in zip(splits_names, splits_slices):
            with open(osp.join(splits_dir, file_name), "w") as file:
                # splitext only drops the extension, so names with several dots are kept whole
                file.writelines(osp.splitext(osp.basename(img))[0] + "\n"
                                for img in ds_images[images_slice])

        # returns a dictionary with the structure of the split directory
        return ValidationManager.__get_splits_dict__([splits_names[:-1], splits_names[-1:]])

In [5]:
class ValidationManagerKFold(ValidationManager):
    """
    Defines the way the files are split into k groups of train and validation sets,
    plus a common test set.
    """
    
    def __init__(self, train_size = 0.9, test_size = 0.1, n_splits = 5,
                 shuffle = True, random_state = None):
        """
        Description:
        Builds the ValidationManagerKFold.
        
        Parameters:
        train_size (int | float, 0.9): relative size of the train set (the part divided in folds).
        test_size (int | float, 0.1): relative size of the test set.
        n_splits (int, 5): the number of folds used in KFold validation. Values under 2 fall back to 5.
        shuffle (boolean, True): if the data must be shuffled before splitting.
        random_state (int, None): random seed for results reproducibility.
        
        The two sizes are normalized to sum 1 when split() is called.
        
        Returns:
        vm (ValidationManagerKFold): The built ValidationManagerKFold.
        """
        super().__init__(train_size, test_size, shuffle, random_state)
        self.n_splits_ = n_splits if n_splits >= 2 else 5
 
    def split(self, dataset):
        """
        Description:
        Makes KFold validation splits.
        Writes "f_<k>/train.txt" and "f_<k>/val.txt" for each fold and "test.txt" inside
        the "splits" directory of the dataset root, one image name (without extension) per line.

        Parameters:
        dataset (DatasetManager): the dataset where the data is.
        It must provide get_images() and the root_dir_ attribute.
        
        Returns:
        split_result (dict): the split structure,
        {"f_1": {"train": ..., "val": ...}, ..., "test": "test.txt"}.
        """
        # normalizes the sizes (and keeps the normalized values in the manager)
        normalized_sizes = ValidationManager.__normalize__(self.train_size_, self.test_size_)
        self.train_size_, self.test_size_ = normalized_sizes
        
        # gets all the images
        ds_images = dataset.get_images()

        # gets the final index of the train part and of the test part
        train_index, test_index = ValidationManager.__get_indexes__(len(ds_images), *normalized_sizes)
        
        # gets the final index of each fold: all the folds have the same size...
        fold_size = train_index // self.n_splits_
        folds_ends = [fold_size * fold for fold in range(1, self.n_splits_ + 1)]

        # ...but the last one, that can be a little bigger (it takes the remainder)
        folds_ends[-1] = train_index
        
        # shuffles it if requested
        # NOTE(review): the shuffle is done in place on whatever get_images() returned
        if self.shuffle_:
            # "is not None" so a seed of 0 is not ignored
            if self.random_state_ is not None:
                np.random.seed(self.random_state_)
            np.random.shuffle(ds_images)
        
        # gets the splits directory name
        splits_dir = osp.join(dataset.root_dir_, "splits")
        
        # gets all the folds files names
        splits_names = [[f"f_{index}/train.txt", f"f_{index}/val.txt"] for index in range(1, self.n_splits_ + 1)]
        splits_names.append(["test.txt"])
        
        # gets one slice for each fold (the test slice is not included)
        folds_slices = ValidationManager.__get_slices__(0, *folds_ends)
        
        # for each fold: its slice is the validation one and all the others are the train ones
        # splits_slices = [[[train_slice_1, train_slice_2, ..., train_slice_n], [validation_slice]], ...]
        splits_slices = [
            [[s for other, s in enumerate(folds_slices) if other != current], [val_slice]]
            for current, val_slice in enumerate(folds_slices)
        ]
        
        # appends the test slice to the result (same nesting with only one file)
        splits_slices.append([[slice(train_index, test_index)]])
        
        # creates the splits structure (removes any previous one)
        ValidationManager.__make_validation_structure__(splits_dir, self.n_splits_)

        # saves all the splits
        for file_names, split_slices in zip(splits_names, splits_slices):
            for file_name, slices in zip(file_names, split_slices):
                with open(osp.join(splits_dir, file_name), "w") as file:
                    for images_slice in slices:
                        # splitext only drops the extension, so names with several dots are kept whole
                        file.writelines(osp.splitext(osp.basename(img))[0] + "\n"
                                        for img in ds_images[images_slice])

        # returns the names dict
        return ValidationManager.__get_splits_dict__(splits_names)