**DATASET HANDLING**

In [1]:
%load_ext pycodestyle_magic

In [2]:
%pycodestyle_on

***

***

# Dataset stats

I use this code to check the resulting number of samples per class after splitting the target dataset.

## Modules

In [3]:
import torchvision
import numpy as np
import pandas as pd

  return torch._C._cuda_getDeviceCount() > 0


## Analysis

In [4]:
# Target dataset.
data_dir_target = 'datasets/0_Raw/' \
                  'Sentinel2GlobalLULC_full_raw/' \
                  'Sentinel2LULC_JPEG/'

# Loading the three datasets.
data = torchvision.datasets.ImageFolder(data_dir_target)

# Get classes and number of samples per class.
class_names = data.classes
samples_per_class = np.unique(data.targets, return_counts=True)[1]

# Building the dataframe.
df = pd.DataFrame(class_names, columns=['Class'])
df.set_index('Class', drop=True, inplace=True)
df['100%_samples'] = samples_per_class
df['E1-T-95%'] = (df['100%_samples']*0.95).astype(int)
df['E1-V-1.75%'] = (df['100%_samples']*0.175).astype(int)
df['E1-T-3.25%'] = (df['100%_samples']*0.325).astype(int)
df

Unnamed: 0_level_0,100%_samples,E1-T-95%,E1-V-1.75%,E1-T-3.25%
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01_BarrenLands___jpeg,14000,13300,2450,4550
02_MossAndLichen_jpeg,4656,4423,814,1513
03_Grasslands____jpeg,8869,8425,1552,2882
04_ShrublandOpen_jpeg,14000,13300,2450,4550
05_SrublandClose_jpeg,11937,11340,2088,3879
06_ForestsOpDeBr_jpeg,4437,4215,776,1442
07_ForestsClDeBr_jpeg,1348,1280,235,438
08_ForestsDeDeBr_jpeg,14000,13300,2450,4550
09_ForestsOpDeNe_jpeg,10438,9916,1826,3392
10_ForestsClDeNe_jpeg,6380,6061,1116,2073


***

***

# Split the dataset folder

## Full dataset (imbalanced)

## Reduced dataset (imbalanced)

## Full dataset (val and test balanced)

I cannot give more samples to the validation and test datasets since the class with fewer samples has only 353 (27).

***

***

# Compute the mean and std of the dataset

## Libraries and modules

In [3]:
import os

import torch
import torchvision

import matplotlib.pyplot as plt
import numpy as np
import math
import random

import utils

In [4]:
# help(listdir_fullpath)

In [5]:
# help(get_mean_std_dataloader)

## Reproducibility

In [6]:
exp = utils.Experiment()
exp.reproducibility()

## Computation

In [7]:
# List of trained models.
datasets_dir = 'datasets/'

# Get the subsets with full path.
data_dirs = utils.listdir_fullpath(datasets_dir)

# Leave out unwanted subsets.
data_dirs = data_dirs[2:]
for dirs in data_dirs:
    print(dirs)

datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42
datasets/Sentinel2GlobalLULC_full-ratio=(0.7, 0.1, 0.2)-seed=42


In [8]:
# Initialization.
splits = ['train', 'val', 'test']
filename = 'dataset_mean_std.txt'

# Loop over the datasets (except raw and clothing).
for data_dir in data_dirs:

    # Create path to the txt file.
    filepath = os.path.join(data_dir, filename)

    # Removing the old txt file if exists.
    if os.path.exists(filepath):
        os.remove(filepath)
        print('Old txt file removed and new one created.')
    else:
        print('New txt file created.')

    # Creating/opening the file.
    f = open(filepath, 'w')

    # Loading the datasets into a dic.
    datasets = {x: torchvision.datasets.ImageFolder(
        os.path.join(data_dir, x),
        transform=torchvision.transforms.ToTensor()
    ) for x in splits}

    # Creating the dataloaders into a dic.
    dataloaders = {x: torch.utils.data.DataLoader(
        datasets[x],
        batch_size=128,
        worker_init_fn=exp.seed_worker,
        generator=exp.g
    ) for x in splits}

    # Loop over the train, val, and test datasets.
    for x in splits:

        # Computation.
        print(f'{data_dir}/{x}/')
        print(f'Samples to be processed: '
              f'{len(dataloaders[x].dataset)}')
        mean, std = utils.get_mean_std_dataloader(dataloaders[x])
        print(mean)
        print(std)

        # Write to file.
        f.write(f'{x}\n')
        f.write(f'{mean}\n')
        f.write(f'{std}\n')

    # Close file and print a space.
    f.close()
    print('')

Old txt file removed and new one created.
datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/train/
Samples processed: 187627
tensor([0.3350, 0.3388, 0.3616])
tensor([0.2996, 0.2394, 0.2130])
datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/val/
Samples processed: 2900
tensor([0.2768, 0.2996, 0.3267])
tensor([0.2352, 0.1857, 0.1670])
datasets/Sentinel2GlobalLULC_full-fixed=(100, 150)-seed=42/test/
Samples processed: 4350
tensor([0.2741, 0.2973, 0.3245])
tensor([0.2338, 0.1833, 0.1643])

Old txt file removed and new one created.
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/train/
Samples processed: 1938
tensor([0.3341, 0.3395, 0.3636])
tensor([0.2904, 0.2328, 0.2091])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/val/
Samples processed: 1938
tensor([0.3357, 0.3382, 0.3616])
tensor([0.2942, 0.2330, 0.2069])
datasets/Sentinel2GlobalLULC_full-ratio=(0.01, 0.01, 0.98)-seed=42/test/
Samples processed: 191001
tensor([0.3327, 0.3372,

***

***