In [1]:
from tqdm import tqdm
from skimage.io import imread
import torch
from PIL import Image
from glob import glob
import pandas as pd
import numpy as np
import os
import torch.utils.data
import torchvision.transforms as trf

In [2]:
class DatasetCustom(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that serves (image, label) pairs from a DataFrame.

    Every row of ``df`` is one sample. The label comes from the
    ``cell_type_idx`` column. The image comes from the in-memory ``image``
    column when the frame has one and the row holds a NumPy array there;
    otherwise the file named in the ``path`` column is opened.

    Rows are addressed by position (``.iloc``), so ``df`` may carry any index
    (for example after filtering or a train/test split).
    """

    def __init__(self, df, transform=None):
        """Keep a reference to the sample table and the optional transform.

        Args:
            df: pandas DataFrame with a ``cell_type_idx`` column plus a
                ``path`` column and/or an ``image`` column.
            transform: optional callable applied to the PIL image before it
                is returned from ``__getitem__``.
        """
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of ``df``)."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair stored at row position ``index``.

        Args:
            index: zero-based row position, as produced by a DataLoader sampler.

        Returns:
            Tuple of the (optionally transformed) image and the label as a
            0-dim integer tensor.
        """
        # Prefer pixels that are already in memory: rows appended by an
        # augmentation step share the `path` of their source row, so re-reading
        # the file would silently discard the augmented pixels.
        pixels = None
        if 'image' in self.df.columns:
            pixels = self.df['image'].iloc[index]

        if isinstance(pixels, np.ndarray):
            img = Image.fromarray(pixels)
        else:
            img = Image.open(self.df['path'].iloc[index])

        # Positional lookup: `self.df['cell_type_idx'][index]` would be a label
        # lookup and fails (or picks the wrong row) on a non-default index.
        label = torch.tensor(int(self.df['cell_type_idx'].iloc[index]))

        if self.transform:
            img = self.transform(img)

        return img, label

In [3]:
# Short diagnosis code -> full lesion name.
lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis-like lesions',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

# Root folder of the dataset, relative to the working directory.
train_skin_dir = os.path.join('../../../', 'HAM10000')

# Image id (file name without extension) -> path, for every .jpg that sits
# exactly one folder below the dataset root.
jpg_pattern = os.path.join(train_skin_dir, '*', '*.jpg')
imageid_path_dict = dict(
    (os.path.splitext(os.path.basename(jpg_path))[0], jpg_path)
    for jpg_path in glob(jpg_pattern)
)

In [4]:
# Build the sample table from the metadata CSV:
#   cell_type     - full lesion name looked up from the `dx` code
#   cell_type_idx - integer code pd.Categorical assigns to each `cell_type`
#   path          - image file path looked up from `image_id`
# The raw metadata columns that are no longer needed are dropped at the end.
metadata_csv = os.path.join(train_skin_dir, 'metadata.csv')
unused_columns = ['dx', 'dx_type', 'age', 'sex', 'localization', 'lesion_id']

tile_df = (
    pd.read_csv(metadata_csv)
    .assign(
        cell_type=lambda frame: frame['dx'].map(lesion_type_dict.get),
        cell_type_idx=lambda frame: pd.Categorical(frame['cell_type']).codes,
        path=lambda frame: frame['image_id'].map(imageid_path_dict.get),
    )
    .drop(columns=unused_columns)
)

In [5]:
# Read every file listed in `path` and keep the result in a new `image` column
# (one entry per row). This holds the whole dataset in memory at once.
tile_df['image'] = tile_df['path'].apply(imread)

In [6]:
# Oversample the smaller classes: for every row whose class index is listed
# below (all classes except index 4, the nevus class), add three extra rows
# that carry the same metadata and the image rotated by 90, 180 and 270
# degrees counter-clockwise.
#
# The rotation is done directly on the arrays with np.rot90, which yields the
# same pixels as PIL's ROTATE_90 / ROTATE_180 / ROTATE_270 transposes for
# uint8 RGB input.
#
# NOTE: this cell is not idempotent - running it again augments the rows that
# were already added. Re-run the loading cells first if you need to repeat it.
augmented_class_idxs = [0, 1, 2, 3, 5, 6]
quarter_turns = (1, 2, 3)  # np.rot90 `k` values -> 90, 180, 270 degrees

rows_to_augment = tile_df[tile_df['cell_type_idx'].isin(augmented_class_idxs)]

if len(rows_to_augment):
    # Fill a 1-D object array slot by slot so that pandas stores each rotated
    # image as a single cell rather than trying to interpret its dimensions.
    rotated_images = np.empty(len(rows_to_augment) * len(quarter_turns), dtype=object)
    slot = 0
    for source_image in rows_to_augment['image']:
        for turns in quarter_turns:
            # rot90 returns a view; copy it so the new row owns its pixels.
            rotated_images[slot] = np.ascontiguousarray(np.rot90(source_image, k=turns))
            slot += 1

    # Repeat each source row once per rotation (by position, so the frame's
    # index does not matter), then swap in the rotated pixels. Copying whole
    # rows keeps the metadata aligned regardless of column order.
    repeated_positions = np.repeat(np.arange(len(rows_to_augment)), len(quarter_turns))
    rotated_rows = rows_to_augment.iloc[repeated_positions].reset_index(drop=True)
    rotated_rows['image'] = rotated_images

    # A single concat replaces thousands of one-row `.loc` enlargements, which
    # make pandas rebuild the frame on every append.
    tile_df = pd.concat([tile_df, rotated_rows], ignore_index=True)

In [7]:
#tile_df[['cell_type_idx', 'cell_type']].sort_values('cell_type_idx').drop_duplicates()

In [8]:
# Per-image preprocessing, applied in this order. The final Normalize uses
# mean 0 / std 1, so it leaves the ToTensor output unchanged - the statistics
# computed below are therefore those of the un-normalized pixels.
preprocessing_steps = [
    trf.ColorJitter(contrast=(1.4, 1.5)),
    trf.Resize(300),
    trf.CenterCrop(224),
    trf.ToTensor(),
    trf.Normalize(mean=[0, 0, 0], std=[1, 1, 1]),
]
composed = trf.Compose(preprocessing_steps)

# Wrap the sample table in the dataset and iterate it in shuffled batches of 16.
tile_set = DatasetCustom(tile_df, transform=composed)
tile_generator = torch.utils.data.DataLoader(
    tile_set,
    batch_size=16,
    shuffle=True,
)

In [9]:
####### COMPUTE MEAN / STD

# Running per-channel totals of the pixel values and of their squares.
psum = torch.zeros(3)
psum_sq = torch.zeros(3)

# Each batch is reduced over every dimension except the channel one
# (dimension 1), leaving one number per channel to add to the totals.
reduce_dims = (0, 2, 3)
for image, label in tile_generator:
    psum += image.sum(dim=reduce_dims)
    psum_sq += image.pow(2).sum(dim=reduce_dims)

In [10]:
####### FINAL CALCULATIONS

# Number of values summed per channel: one 224 x 224 crop for every row.
crop_side = 224
count = len(tile_df) * crop_side * crop_side

# mean = E[x]; variance = E[x^2] - E[x]^2; std = sqrt(variance)
total_mean = psum / count
mean_of_squares = psum_sq / count
total_var = mean_of_squares - total_mean ** 2
total_std = total_var.sqrt()

# Report the per-channel statistics.
print('mean: ', total_mean, sep='')
print('std:  ', total_std, sep='')

mean: tensor([0.7708, 0.4684, 0.4987])
std:  tensor([0.1907, 0.2218, 0.2520])
