**DATASET HANDLING**

In [1]:
%load_ext pycodestyle_magic

In [2]:
%pycodestyle_on

In [3]:
# Reproducibility: single seed value reused by the random number
# generators that are seeded further down in this notebook.
SEED = 42

***

***

# Split the dataset folder

## Full dataset (imbalanced)

## Reduced dataset (imbalanced)

## Full dataset (val and test balanced)

The validation and test datasets cannot be given more samples because the class with the fewest samples has only 353 (27).

***

***

# Compute the mean and std of the dataset

## Libraries and modules

In [4]:
import os

import torch
import torchvision

import matplotlib.pyplot as plt
import numpy as np
import math
import random

## Reproducibility

In [5]:
# Seed every random number generator in use with the same SEED:
# Python's `random`, NumPy and torch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so
# setting it here presumably only reaches processes spawned from this
# kernel, not the kernel itself -- confirm.
os.environ['PYTHONHASHSEED'] = f'{SEED}'

# cuDNN: turn off the benchmark mode and request deterministic mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Require deterministic algorithms. Called without `warn_only=True`,
# so torch raises an error (rather than issuing a warning) when an
# operation has no deterministic implementation.
torch.use_deterministic_algorithms(True)

In [6]:
# For dataloaders.
def seed_worker(worker_id):
    """Seed NumPy and Python's `random` from torch's initial seed.

    The value of `torch.initial_seed()` is reduced modulo 2**32 and
    used to seed both `np.random` and `random`.

    Args:
        worker_id: Identifier of the worker. It is accepted but not
            used by this function.

    Returns:
        None.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)


# Dedicated torch generator seeded with SEED; it is the object passed
# as `generator=` when the DataLoaders are created.
g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x7efc5e28c250>

In [7]:
# Enable deterministic behavior when using the external GPU by fixing
# the cuBLAS workspace configuration.
# Alternative value: ':4096:8'. The variable could also be set with the
# `%env CUBLAS_WORKSPACE_CONFIG=...` magic instead of `os.environ`.
# NOTE(review): presumably this has to be set before the first CUDA
# operation runs in order to take effect -- confirm.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'

## Computation

In [8]:
from utils import listdir_fullpath
from utils import get_mean_std_dataloader

In [9]:
# help(listdir_fullpath)

In [10]:
# Root folder that contains the datasets.
datasets_dir = 'datasets/'

# Get the entries of the root folder with their full path and show
# them, so the folders processed below can be checked.
data_dirs = listdir_fullpath(datasets_dir)
print(data_dirs)

['datasets/0_Raw', 'datasets/Clothing-dataset', 'datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42', 'datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42', 'datasets/Sentinel2GlobalLULC_full-ratio=(0.7, 0.1, 0.2)-seed=42']


In [11]:
# help(get_mean_std_dataloader)

In [12]:
# Initialization.
splits = ['train', 'val', 'test']

# Folders under the datasets root that are not split datasets and
# must be skipped (the raw data and the clothing dataset).
excluded_dirs = {'0_Raw', 'Clothing-dataset'}

# Select the datasets by name instead of by position (`data_dirs[2:]`),
# so the selection does not depend on the order of the directory
# listing nor on how many unrelated folders come first. The relative
# order of the remaining folders is preserved.
split_data_dirs = [
    d for d in data_dirs
    if os.path.basename(os.path.normpath(d)) not in excluded_dirs
]

# Loop over the datasets (except raw and clothing).
for data_dir in split_data_dirs:

    # Load each split as an ImageFolder, converting images to tensors
    # only (no normalization, since its statistics are computed here).
    datasets = {x: torchvision.datasets.ImageFolder(
        os.path.join(data_dir, x),
        transform=torchvision.transforms.ToTensor()
    ) for x in splits}

    # One dataloader per split, all sharing the seeded generator `g`.
    # NOTE(review): `num_workers` is left at its default, so
    # `seed_worker` presumably only matters if workers are enabled.
    dataloaders = {x: torch.utils.data.DataLoader(
        datasets[x],
        batch_size=128,
        worker_init_fn=seed_worker,
        generator=g
    ) for x in splits}

    # Report the statistics of the train, val, and test datasets.
    for x in splits:
        print(f'{data_dir}/{x}/')
        print(f'Samples processed: '
              f'{len(dataloaders[x].dataset)}')
        # Presumably per-channel mean and std tensors -- see the
        # helper in `utils` for the exact definition.
        mean, std = get_mean_std_dataloader(dataloaders[x])
        print(mean)
        print(std)

datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/train/
Samples processed in train dataset: 187627
tensor([0.3350, 0.3388, 0.3616])
tensor([0.2996, 0.2394, 0.2130])
datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/val/
Samples processed in val dataset: 2900
tensor([0.2768, 0.2996, 0.3267])
tensor([0.2352, 0.1857, 0.1670])
datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/test/
Samples processed in test dataset: 4350
tensor([0.2741, 0.2973, 0.3245])
tensor([0.2338, 0.1833, 0.1643])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/train/
Samples processed in train dataset: 1938
tensor([0.3341, 0.3395, 0.3636])
tensor([0.2904, 0.2328, 0.2091])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/val/
Samples processed in val dataset: 1938
tensor([0.3357, 0.3382, 0.3616])
tensor([0.2942, 0.2330, 0.2069])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/test/
Samples processed in test dataset: 191001
tensor([0.33

datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/train/
Samples processed in train dataset: 187627
tensor([0.3350, 0.3388, 0.3616])
tensor([0.2996, 0.2394, 0.2130])
datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/val/
Samples processed in val dataset: 2900
tensor([0.2768, 0.2996, 0.3267])
tensor([0.2352, 0.1857, 0.1670])
datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/test/
Samples processed in test dataset: 4350
tensor([0.2741, 0.2973, 0.3245])
tensor([0.2338, 0.1833, 0.1643])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/train/
Samples processed in train dataset: 1938
tensor([0.3341, 0.3395, 0.3636])
tensor([0.2904, 0.2328, 0.2091])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/val/
Samples processed in val dataset: 1938
tensor([0.3357, 0.3382, 0.3616])
tensor([0.2942, 0.2330, 0.2069])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/test/
Samples processed in test dataset: 191001
tensor([0.3327, 0.3372, 0.3603])
tensor([0.2976, 0.2376, 0.2115])
datasets/Sentinel2GlobalLULC_full-ratio=(0.7, 0.1, 0.2)-seed=42/train/
Samples processed in train dataset: 136403
tensor([0.3329, 0.3373, 0.3603])
tensor([0.2978, 0.2377, 0.2115])
datasets/Sentinel2GlobalLULC_full-ratio=(0.7, 0.1, 0.2)-seed=42/val/
Samples processed in val dataset: 19478
tensor([0.3318, 0.3363, 0.3597])
tensor([0.2962, 0.2364, 0.2104])
datasets/Sentinel2GlobalLULC_full-ratio=(0.7, 0.1, 0.2)-seed=42/test/
Samples processed in test dataset: 38996
tensor([0.3328, 0.3375, 0.3605])
tensor([0.2974, 0.2378, 0.2118])