In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Seed Python's `random` module once so the random sampling later in the
# notebook is repeatable on a fresh Restart & Run All.
random_state = 10
random.seed(random_state)

# Turn off pandas' chained-assignment (SettingWithCopy) check notebook-wide.
pd.options.mode.chained_assignment = None

In [2]:
# Load the two ISIC 2019 training tables: per-image ground-truth diagnosis
# columns and per-image metadata.
truth_df = pd.read_csv('../src/data/external/ISIC_2019_Training_GroundTruth.csv')
meta_df = pd.read_csv('../src/data/external/ISIC_2019_Training_Metadata.csv')

In [3]:
# Diagnosis columns of the ground-truth table: every column except the image id.
# 'label' is excluded by name too, because a later cell adds that column to
# truth_df; without this, re-running the cell would treat 'label' as a class.
labels = truth_df.columns.drop(['image', 'label'], errors='ignore')
print(labels)
# Integer class id -> diagnosis code, in column order.
label_mapping = dict(enumerate(labels))
print(label_mapping)

Index(['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK'], dtype='object')
{0: 'MEL', 1: 'NV', 2: 'BCC', 3: 'AK', 4: 'BKL', 5: 'DF', 6: 'VASC', 7: 'SCC', 8: 'UNK'}


In [4]:
# Indicator matrix of the diagnosis columns: one row per image, one column
# per entry of `labels`.
dense_labels = truth_df[labels].values
print('Dense Labels...')
display(dense_labels)

# Collapse every row to the position of its largest entry, which is the
# integer class id used in `label_mapping`.
train_labels = np.argmax(dense_labels, axis=-1)
print('Train Labels...')
display(train_labels)

Dense Labels...


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Train Labels...


array([1, 1, 0, ..., 0, 1, 4])

In [5]:
# Store the integer class id of every image next to its indicator columns.
label_values = train_labels.tolist()
truth_df['label'] = label_values
display(truth_df)

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [6]:
# Left-join the ground truth onto the metadata by image id, so every metadata
# row is kept and gains the diagnosis columns plus 'label'.
truth_by_image = truth_df.set_index('image')
merged_df = meta_df.join(truth_by_image, on='image', how='left')
display(merged_df)

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [7]:
# Keep only rows where every metadata field used downstream is present.
# The explicit copy makes clean_df an independent frame, so adding columns to
# it later does not write into a slice of merged_df.
clean_df = merged_df.dropna(
    subset=['lesion_id', 'anatom_site_general', 'sex', 'age_approx']
).copy()
clean_df

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
1459,ISIC_0012653_downsampled,50.0,posterior torso,MSK4_0011169,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1460,ISIC_0012654_downsampled,30.0,lower extremity,MSK4_0011170,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1461,ISIC_0012655_downsampled,35.0,upper extremity,MSK4_0011171,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1462,ISIC_0012656_downsampled,45.0,posterior torso,MSK4_0011172,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1463,ISIC_0012657_downsampled,20.0,upper extremity,MSK4_0011173,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [8]:
# Derive helper columns from the raw text fields. assign() returns a new frame
# instead of writing into the filtered slice, and re-running the cell simply
# recomputes the same three columns.
clean_df = clean_df.assign(
    # Last whitespace-separated word of the site, e.g. 'posterior torso' -> 'torso'.
    anatomy_site=clean_df['anatom_site_general'].str.split().str[-1],
    # Text before the first '_' of lesion_id, e.g. 'MSK4_0011169' -> 'MSK4'.
    lesion_label=clean_df['lesion_id'].str.split('_').str[0],
    # Text after the last '_' of lesion_id, e.g. 'MSK4_0011169' -> '0011169'.
    lesion_code=clean_df['lesion_id'].str.split('_').str[-1],
)
display(clean_df)
clean_df.describe()

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label,anatomy_site,lesion_label,lesion_code
1459,ISIC_0012653_downsampled,50.0,posterior torso,MSK4_0011169,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,MSK4,0011169
1460,ISIC_0012654_downsampled,30.0,lower extremity,MSK4_0011170,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,MSK4,0011170
1461,ISIC_0012655_downsampled,35.0,upper extremity,MSK4_0011171,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,MSK4,0011171
1462,ISIC_0012656_downsampled,45.0,posterior torso,MSK4_0011172,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,MSK4,0011172
1463,ISIC_0012657_downsampled,20.0,upper extremity,MSK4_0011173,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,MSK4,0011173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,head/neck,BCN,0003925
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,torso,BCN,0001819
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity,BCN,0001085
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,palms/soles,BCN,0002083


Unnamed: 0,age_approx,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
count,21311.0,21311.0,21311.0,21311.0,21311.0,21311.0,21311.0,21311.0,21311.0,21311.0,21311.0
mean,55.125991,0.192436,0.457323,0.152269,0.039651,0.107691,0.011027,0.010417,0.029187,0.0,1.633523
std,17.952681,0.394223,0.498187,0.35929,0.195142,0.309997,0.104432,0.101534,0.168334,0.0,1.599651
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,70.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,85.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,7.0


In [9]:
def get_sampled_indices(df, ratio):
  """Draw a class-balanced random sample of row index labels from ``df``.

  The per-class sample size is ``int(smallest class count * ratio)``; that
  many rows are drawn without replacement from every class present in
  ``df['label']``, using the global ``random`` module (seed it beforehand for
  repeatable results).

  Args:
    df: DataFrame with a ``label`` column holding the class id of each row.
    ratio: Multiplier applied to the smallest class count.

  Returns:
    A list of index labels of ``df``, grouped by class in ascending label
    order. Empty when ``df`` has no labelled rows.
  """
  class_counts = df['label'].value_counts()
  if class_counts.empty:
    # min() over an empty sequence would raise; there is nothing to sample.
    return []
  sample_size = int(class_counts.min() * ratio)

  indices = []
  # Visiting classes in ascending label order fixes the sequence of random
  # draws for a given seed.
  for label in sorted(class_counts.index):
    observations = df.index[df['label'] == label].tolist()
    # With ratio > 1 some classes can be smaller than sample_size; those are
    # skipped rather than oversampled.
    if len(observations) >= sample_size:
      indices.extend(random.sample(observations, sample_size))
  return indices

In [10]:
# Split the cleaned rows into class-balanced train / eval / test index lists.
# Each stage samples only from rows not taken by an earlier stage, so the
# three sampled lists are disjoint.
# NOTE(review): the split is per image row; if several images can share a
# lesion_id, the same lesion may end up in more than one split. Confirm
# whether a lesion-level (grouped) split is needed.
all_idx = set(clean_df.index.tolist())

train_idx = get_sampled_indices(clean_df, 0.6)

# Sorting makes the row order handed to the sampler explicit instead of
# relying on set iteration order.
unseen_idx = sorted(all_idx - set(train_idx))
eval_idx = get_sampled_indices(clean_df.loc[unseen_idx], 0.5)

seen_idx = train_idx + eval_idx
# Every row outside train and eval: the full, class-imbalanced hold-out pool.
all_test_idx = sorted(all_idx - set(seen_idx))
unseen_idx = list(all_test_idx)
# Class-balanced subset of the hold-out pool.
test_idx = get_sampled_indices(clean_df.loc[all_test_idx], 1)

train_idx.sort()
eval_idx.sort()
test_idx.sort()

In [11]:
# Materialise both hold-out views: the class-balanced test subset and the
# full pool of rows not used for train or eval.
test_df = clean_df.loc[test_idx]
all_test_df = clean_df.loc[all_test_idx]

display(test_df)
display(all_test_df)

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label,anatomy_site,lesion_label,lesion_code
1470,ISIC_0012664_downsampled,40.0,upper extremity,MSK4_0011178,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,MSK4,0011178
1521,ISIC_0012746_downsampled,55.0,posterior torso,MSK4_0011262,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,MSK4,0011262
1695,ISIC_0013159_downsampled,70.0,head/neck,MSK4_0011630,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,head/neck,MSK4,0011630
1701,ISIC_0013167_downsampled,55.0,anterior torso,MSK4_0010591,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,torso,MSK4,0010591
2081,ISIC_0014032_downsampled,60.0,head/neck,MSK4_0010292,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,head/neck,MSK4,0010292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25181,ISIC_0073022,85.0,lower extremity,BCN_0002592,male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7,extremity,BCN,0002592
25232,ISIC_0073104,30.0,lower extremity,BCN_0000569,female,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,extremity,BCN,0000569
25271,ISIC_0073159,45.0,lower extremity,BCN_0001263,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,BCN,0001263
25287,ISIC_0073193,35.0,anterior torso,BCN_0002147,male,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5,torso,BCN,0002147


Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label,anatomy_site,lesion_label,lesion_code
1459,ISIC_0012653_downsampled,50.0,posterior torso,MSK4_0011169,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,MSK4,0011169
1460,ISIC_0012654_downsampled,30.0,lower extremity,MSK4_0011170,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,MSK4,0011170
1461,ISIC_0012655_downsampled,35.0,upper extremity,MSK4_0011171,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,MSK4,0011171
1462,ISIC_0012656_downsampled,45.0,posterior torso,MSK4_0011172,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,MSK4,0011172
1463,ISIC_0012657_downsampled,20.0,upper extremity,MSK4_0011173,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,MSK4,0011173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,head/neck,BCN,0003925
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,torso,BCN,0001819
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity,BCN,0001085
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,palms/soles,BCN,0002083


In [12]:
# Naive baseline: predict MEL (label 0) for every image, chosen because it is
# the most dangerous type of skin lesion. Accuracy is therefore the share of
# MEL rows in each test set.
baseline_label = 0
test_accuracy = (test_df['label'] == baseline_label).mean()
print(f"Accuracy on test data: {test_accuracy * 100:.2f}%")
all_test_accuracy = (all_test_df['label'] == baseline_label).mean()
print(f"Accuracy on all-test data: {all_test_accuracy * 100:.2f}%")

Accuracy on test data: 12.5%
Accuracy on all-test data: 19.72%
