In [16]:
from __future__ import absolute_import, division, print_function

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import shutil
# Record the TensorFlow build that produced the outputs in this notebook.
print(tf.__version__)

# Seed NumPy's global generator so that random draws are repeatable.
SEED = 123
np.random.seed(SEED)

1.13.1


In [6]:
# Load the ground-truth table; the class columns are used as 0/1 indicators
# further down in the notebook.
labels_file = "ISIC_2019_Training_GroundTruth.csv"
# The first CSV column becomes the index. It holds image identifiers (file
# names are later built from it), not timestamps, so no date parsing is
# requested: parse_dates=True would make pandas attempt to interpret every
# identifier as a date before falling back to plain strings.
labels = pd.read_csv(labels_file, index_col=0)

# Split Train & Val set such that label distributions match

In [7]:
# Stratified split: the rows of every class column are shuffled and divided
# independently, so train and validation end up with matching class proportions.
train_percent = 0.9

# Per-class pieces are gathered in lists and concatenated once at the end.
# DataFrame.append (used previously) re-copies the whole frame on every call
# and no longer exists in pandas >= 2.0.
train_parts = []
val_parts = []

for col in labels.columns:
    # Rows whose indicator for this class is set.
    curr_class = labels.loc[labels[col] == 1]
    N_curr = len(curr_class)
    # Re-seeding on every iteration makes each class's shuffle depend only on
    # its size, not on which classes were processed before it.
    np.random.seed(123)
    perm = np.random.permutation(N_curr)
    curr_class = curr_class.iloc[perm]

    # Single cut point: the first `train_percent` share of the shuffled rows
    # goes to train, the remainder to validation.
    n_train = int(train_percent * N_curr)
    curr_train = curr_class.iloc[:n_train]
    curr_val = curr_class.iloc[n_train:]

    train_parts.append(curr_train)
    val_parts.append(curr_val)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# label table has no columns at all.
train_labels = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_labels = pd.concat(val_parts) if val_parts else pd.DataFrame()

In [8]:
# Column-wise totals of the indicator columns for each split.
train_class_dist = train_labels.sum(axis=0, skipna=True)
val_class_dist = val_labels.sum(axis=0, skipna=True)

# Normalise the totals so the two splits can be compared as fractions.
train_class_frac = train_class_dist / sum(train_class_dist)
val_class_frac = val_class_dist / sum(val_class_dist)

print("Train class distribution\n", train_class_frac)
print("\nVal class distribution\n", val_class_frac)

Train class distribution
 MEL     0.178512
NV      0.508336
BCC     0.131175
AK      0.034220
BKL     0.103580
DF      0.009432
VASC    0.009959
SCC     0.024787
UNK     0.000000
dtype: float64

Val class distribution
 MEL     0.178557
NV      0.507686
BCC     0.131257
AK      0.034292
BKL     0.103666
DF      0.009460
VASC    0.010248
SCC     0.024832
UNK     0.000000
dtype: float64


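Success! The per-class fractions of the train and validation sets agree to within about 0.1 percentage points.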

# Split train and val data into directories

In [9]:
# Keep a handle on the index (image identifiers) of each split, in the
# splits' current row order.
train_names, val_names = train_labels.index, val_labels.index

In [10]:
# Order both splits by the "image" key so rows are processed in a stable,
# identifier-sorted sequence.
sort_key = "image"
train_labels = train_labels.sort_values(by=sort_key)
val_labels = val_labels.sort_values(by=sort_key)

In [14]:
# Create one sub-directory per class column under data/val and data/train.
# os.makedirs also creates missing parents ("data", "data/<split>"); os.mkdir
# raises when they do not exist yet, e.g. on a fresh checkout.
for split in ("val", "train"):
    for curr_class in labels.columns:
        path = os.path.join("data", split, curr_class)
        # Skip directories that are already there so the cell can be re-run.
        if not os.path.isdir(path):
            os.makedirs(path)

In [17]:
# Copy every training image into data/train/<class>/.
src = "ISIC_2019_Training_Input"
missing_train = []  # files named by the labels but absent from `src`

for image_id, label in train_labels.iterrows():
    # The first column flagged with 1 names the class of this image.
    curr_class = label[label == 1].index[0]
    dest = os.path.join("data", "train", curr_class)
    name = image_id + ".jpg"
    full_file_name = os.path.join(src, name)
    if not os.path.isfile(full_file_name):
        # Still best-effort, but remember what was skipped instead of
        # dropping it without a trace.
        missing_train.append(name)
        continue
    # Spell out the target file path: handing shutil.copy a directory that
    # does not exist would silently write a plain file named after the class,
    # overwritten by every subsequent copy.
    shutil.copy(full_file_name, os.path.join(dest, name))

if missing_train:
    print("Skipped {} train images not found in {}".format(len(missing_train), src))

In [18]:
# Copy every validation image into data/val/<class>/.
src = "ISIC_2019_Training_Input"
missing_val = []  # files named by the labels but absent from `src`

for image_id, label in val_labels.iterrows():
    # The first column flagged with 1 names the class of this image.
    curr_class = label[label == 1].index[0]
    dest = os.path.join("data", "val", curr_class)
    name = image_id + ".jpg"
    full_file_name = os.path.join(src, name)
    if not os.path.isfile(full_file_name):
        # Still best-effort, but remember what was skipped instead of
        # dropping it without a trace.
        missing_val.append(name)
        continue
    # Spell out the target file path: handing shutil.copy a directory that
    # does not exist would silently write a plain file named after the class,
    # overwritten by every subsequent copy.
    shutil.copy(full_file_name, os.path.join(dest, name))

if missing_val:
    print("Skipped {} val images not found in {}".format(len(missing_val), src))