**DATASET HANDLING**

In [None]:
# Load the pycodestyle_magic IPython extension (style-check magics).
%load_ext pycodestyle_magic

In [None]:
# Turn the pycodestyle check on (magic from the extension loaded above);
# presumably it then applies to every cell executed afterwards.
%pycodestyle_on

In [None]:
# Reproducibility: single seed value reused further down to seed Python,
# NumPy and PyTorch, as well as the DataLoader generator.
SEED = 42

***

***

# Split the dataset folder

## Ratio (imbalanced)

## Ratio to create reduced sample

## Fixed (balanced)

***

***

# Compute the mean and std of the dataset

## Libraries and modules

In [None]:
import os

import torch
import torchvision

import matplotlib.pyplot as plt
import numpy as np
import math
import random

## Reproducibility

In [None]:
# Seed the Python hash, torch, NumPy and the random module with SEED.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
# so setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Enable CUDNN deterministic mode and disable its auto-tuner
# (benchmark), which is allowed to pick different kernels between runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Restrict torch to deterministic algorithms. With the default
# warn_only=False an operation without a deterministic implementation
# raises a RuntimeError (it does not merely issue a warning).
torch.use_deterministic_algorithms(True)

In [None]:
# Worker initialisation hook for the DataLoaders.
def seed_worker(worker_id):
    """Re-seed NumPy and ``random`` from torch's current initial seed.

    The seed is reduced to 32 bits because ``np.random.seed`` only
    accepts values below 2**32.

    Args:
        worker_id: Supplied by the caller of the hook; not used here.
    """
    derived_seed = torch.initial_seed() & 0xFFFFFFFF
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)


# Dedicated torch generator, seeded with SEED, handed to the DataLoaders
# through their `generator` argument.
g = torch.Generator()
g.manual_seed(SEED)

In [None]:
# cuBLAS workspace setting needed for deterministic behavior when the
# computation runs on a CUDA GPU (see the documentation of
# torch.use_deterministic_algorithms). Documented values are ':4096:8'
# and ':16:8'; the magic `%env CUBLAS_WORKSPACE_CONFIG=:16:8` is an
# equivalent way to set it.
# NOTE(review): presumably this has to be in place before the first
# CUDA operation that uses cuBLAS -- confirm.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'

## Computation

In [None]:
from utils import listdir_fullpath
from utils import get_mean_std_dataloader

In [None]:
# Show the documentation of the helper before using it.
help(listdir_fullpath)

In [None]:
# Root folder that contains the dataset directories.
datasets_dir = 'datasets/'

# Get the subsets with full path (presumably one entry per item found
# in datasets_dir -- see help(listdir_fullpath) above).
data_dirs = listdir_fullpath(datasets_dir)
print(data_dirs)

In [None]:
# Show the documentation of the helper before using it.
help(get_mean_std_dataloader)

In [None]:
# Sub-folders expected inside every dataset directory.
splits = ['train', 'val', 'test']

# Walk the dataset directories, skipping the first two entries
# (raw and clothing).
# NOTE(review): the [2:] slice is positional, so it relies on the order
# returned by listdir_fullpath -- confirm that this order is stable.
for data_dir in data_dirs[2:]:

    # One ImageFolder per split; images are only converted to tensors.
    datasets = {}
    for split in splits:
        datasets[split] = torchvision.datasets.ImageFolder(
            os.path.join(data_dir, split),
            transform=torchvision.transforms.ToTensor()
        )

    # One DataLoader per split, all sharing the generator `g` and the
    # `seed_worker` initialisation hook.
    dataloaders = {}
    for split in splits:
        dataloaders[split] = torch.utils.data.DataLoader(
            datasets[split],
            batch_size=128,
            worker_init_fn=seed_worker,
            generator=g
        )

    # Print the mean and std of every split, in the order of `splits`.
    for split in splits:
        print(f'{data_dir}/{split}/')
        mean, std = get_mean_std_dataloader(dataloaders[split])
        print(mean)
        print(std)