**DATASET HANDLING**

In [None]:
# Load the pycodestyle_magic IPython extension (provides the
# %pycodestyle_on magic used in the next cell).
%load_ext pycodestyle_magic

In [None]:
# Turn on the pycodestyle checks provided by the extension loaded above
# (presumably applied to every cell executed from here on -- confirm in
# the pycodestyle_magic docs).
%pycodestyle_on

***

***

# Dataset stats

I use this code to check the resulting number of samples per class after splitting the target dataset.

## Modules

In [None]:
import torchvision
import numpy as np
import pandas as pd

## Analysis

In [None]:
# Target dataset: the raw images, read with ImageFolder (which expects
# one sub-folder per class).
initial_dir_dataset = ('datasets/0_Raw/'
                       'Sentinel2GlobalLULC_full_raw/'
                       'Sentinel2LULC_JPEG/')

# Loading the raw dataset (no transform needed: only labels are used).
data = torchvision.datasets.ImageFolder(initial_dir_dataset)

# Get classes and number of samples per class.
class_names = data.classes
samples_per_class = np.unique(data.targets, return_counts=True)[1]

# Fraction of the full dataset assigned to each split. The fractions
# must match the percentages in the column labels and add up to 1:
# 95% + 1.75% + 3.25% = 100%.
split_fractions = {
    'E1-T-95%': 0.95,
    'E1-V-1.75%': 0.0175,
    'E1-T-3.25%': 0.0325,
}

# Building the dataframe: one row per class, indexed by class name.
df = pd.DataFrame({'100%_samples': samples_per_class},
                  index=pd.Index(class_names, name='Class'))

# Expected number of samples per class in each split (truncated to int).
for column, fraction in split_fractions.items():
    df[column] = (df['100%_samples'] * fraction).astype(int)
df

***

***

# Compute the mean and std of the dataset

## Libraries and modules

In [None]:
import os

import torch
import torchvision

import matplotlib.pyplot as plt
import numpy as np
import math
import random

import utils

In [None]:
# Uncomment to read the helper's documentation (the module is imported
# as `utils`, so the name must be qualified).
# help(utils.listdir_fullpath)

In [None]:
# Uncomment to read the helper's documentation (the module is imported
# as `utils`, so the name must be qualified).
# help(utils.get_mean_std_dataloader)

## Reproducibility

In [None]:
# Create the project's Experiment helper and call its reproducibility()
# method (presumably seeds the random number generators -- confirm in
# utils.Experiment). The same object later supplies `seed_worker` and
# the generator `g` to the dataloaders.
exp = utils.Experiment()
exp.reproducibility()

## Computation

In [None]:
# Root folder containing every dataset subset.
datasets_dir = 'datasets/'

# Full paths of the subsets, dropping the first two entries, which are
# the unwanted ones.
# NOTE(review): this relies on the order returned by
# utils.listdir_fullpath -- confirm the listing is sorted.
data_dirs = utils.listdir_fullpath(datasets_dir)[2:]

# Show the subsets that will be processed.
for subset_dir in data_dirs:
    print(subset_dir)

In [None]:
# Initialization.
splits = ['train', 'val', 'test']
filename = 'dataset_mean_std.txt'

# Loop over the datasets (except raw and clothing), writing one
# `dataset_mean_std.txt` per dataset folder with the mean and std of
# its train, val, and test splits.
for data_dir in data_dirs:

    # Create path to the txt file.
    filepath = os.path.join(data_dir, filename)

    # Removing the old txt file if exists.
    if os.path.exists(filepath):
        os.remove(filepath)
        print('Old txt file removed and new one created.')
    else:
        print('New txt file created.')

    # Loading the datasets into a dic. Done before opening the output
    # file so that a missing split folder does not leave a handle open.
    datasets = {x: torchvision.datasets.ImageFolder(
        os.path.join(data_dir, x),
        transform=torchvision.transforms.ToTensor()
    ) for x in splits}

    # Creating the dataloaders into a dic. The worker init function and
    # the generator come from the experiment helper `exp`.
    dataloaders = {x: torch.utils.data.DataLoader(
        datasets[x],
        batch_size=128,
        worker_init_fn=exp.seed_worker,
        generator=exp.g
    ) for x in splits}

    # Creating/opening the file. The context manager guarantees that it
    # is flushed and closed even if the computation below raises.
    with open(filepath, 'w') as f:

        # Loop over the train, val, and test datasets.
        for x in splits:

            # Computation.
            print(f'{data_dir}/{x}/')
            print(f'Samples to be processed: '
                  f'{len(dataloaders[x].dataset)}')
            mean, std = utils.get_mean_std_dataloader(dataloaders[x])
            print(mean)
            print(std)

            # Write to file: split name, then mean, then std, one per
            # line.
            f.write(f'{x}\n')
            f.write(f'{mean}\n')
            f.write(f'{std}\n')

    # Print a space between datasets.
    print('')

***

***