In [1]:
import random
import os

from collections import Counter
from PIL import Image

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

import tensorflow as tf

os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
    str(i) for i in range(len(tf.config.experimental.list_physical_devices('GPU'))))
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
for i in range(len(tf.config.experimental.list_physical_devices('GPU'))):
    tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices('GPU')[i], True)

keras = tf.keras
from keras.utils import array_to_img, img_to_array, load_img
from keras import models
from keras import layers

from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split

# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T

2022-11-20 18:28:47.748923: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-20 18:28:48.012320: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-20 18:28:48.598991: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-20 18:28:48.599144: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# Constants
DATA_PATH = '../src/data/external/'
IMAGE_PATH = f'{DATA_PATH}/ISIC_2019_Training_Input/'
IMAGE_PREFIX = '.jpg'
TARGET_SIZE = (32, 32)
CHANNELS = 3
CONTRAST_FACTOR = 3
DELTA = 0.3
RANDOM_STATE = 42
TEST_SIZE = 0.2

In [3]:
def reset_random_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [4]:
df_metadata = pd.read_csv(f'{DATA_PATH}/ISIC_2019_Training_Metadata.csv')
df_truth = pd.read_csv(f'{DATA_PATH}/ISIC_2019_Training_GroundTruth.csv')

In [5]:
labels = df_truth.columns
labels = labels[1:]
print(labels)
label_mapping = {i: label for i, label in enumerate(labels)}
print(label_mapping)

Index(['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK'], dtype='object')
{0: 'MEL', 1: 'NV', 2: 'BCC', 3: 'AK', 4: 'BKL', 5: 'DF', 6: 'VASC', 7: 'SCC', 8: 'UNK'}


In [6]:
dense_labels = df_truth[labels]
dense_labels = dense_labels.values
print('Dense Labels...')
display(dense_labels)
truth_labels = np.argmax(dense_labels, axis=-1)
print('Truth Labels...')
display(truth_labels)

Dense Labels...


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Truth Labels...


array([1, 1, 0, ..., 0, 1, 4])

In [7]:
df_truth['label'] = truth_labels.tolist()
display(df_truth)

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [8]:
df_joined = df_metadata.join(df_truth.set_index('image'), on='image', how='left')
display(df_joined)

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [9]:
# Site details add more information to model and so fill with unknown if not available
print(f"Count of null values in anatom_site_general column before null fill: {sum(df_joined['anatom_site_general'].isnull())}")
df_joined['anatom_site_general'] = df_joined['anatom_site_general'].fillna('unknown')
print(f"Count of null values in anatom_site_general column after null fill: {sum(df_joined['anatom_site_general'].isnull())}")

Count of null values in anatom_site_general column before null fill: 2631
Count of null values in anatom_site_general column after null fill: 0


In [10]:
# Sex details add more information to model and so fill with unknown if not available
print(f"Count of null values in sex column before null fill: {sum(df_joined['sex'].isnull())}")
df_joined['sex'] = df_joined['sex'].fillna('unknown')
print(f"Count of null values in sex column after null fill: {sum(df_joined['sex'].isnull())}")

Count of null values in sex column before null fill: 384
Count of null values in sex column after null fill: 0


In [11]:
# Age is important for the model and so we remove rows with na/inf/zero values
df_cleaned = df_joined[~df_joined['age_approx'].isnull()]
print(f"Observations in cleaned dataframe before na/inf/zero remove: {len(df_cleaned)}")
df_cleaned.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned.dropna(how="all", inplace=True)
df_cleaned = df_cleaned[df_cleaned['age_approx'] > 0]
print(f"Observations in cleaned dataframe after na/inf/zero remove: {len(df_cleaned)}")

Observations in cleaned dataframe before na/inf/zero remove: 24894
Observations in cleaned dataframe after na/inf/zero remove: 24840


In [12]:
# Convert age column to uint8 for resampling
# df_cleaned['age_approx'] = df_cleaned['age_approx'].astype(np.uint8)
# display(df_cleaned)

In [13]:
# decreasing the scope of anatomy site can provide more information and so getting a more generic information
df_cleaned['anatomy_site'] = df_cleaned['anatom_site_general'].apply(lambda anatomy: anatomy.split()[-1])
display(df_cleaned)

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label,anatomy_site
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,torso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,head/neck
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,torso
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,palms/soles


In [14]:
columns_pixels = [f"pixel_{i}" for i in range(TARGET_SIZE[0] * TARGET_SIZE[1] * CHANNELS)]
columns_dump = ['image', 'age_approx', 'anatom_site_general', 'sex', 'anatomy_site', *columns_pixels, 'label']
columns_cat = ['anatom_site_general', 'sex', 'anatomy_site']

In [15]:
def build_image_df(image_names):
    records = []
    for img_name in image_names:
        img = load_img(f"{IMAGE_PATH}/{img_name}{IMAGE_PREFIX}", target_size=TARGET_SIZE)
        img = img_to_array(img)
        records.append([img_name, *img.flatten()])
    return pd.DataFrame(records, columns=["image", *columns_pixels])

In [16]:
df_image = build_image_df(df_cleaned['image'])
display(df_image)

Unnamed: 0,image,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,...,pixel_3062,pixel_3063,pixel_3064,pixel_3065,pixel_3066,pixel_3067,pixel_3068,pixel_3069,pixel_3070,pixel_3071
0,ISIC_0000000,144.0,191.0,233.0,162.0,202.0,237.0,169.0,207.0,244.0,...,254.0,185.0,226.0,254.0,188.0,233.0,255.0,183.0,227.0,254.0
1,ISIC_0000001,135.0,134.0,142.0,150.0,148.0,151.0,159.0,158.0,163.0,...,188.0,177.0,177.0,187.0,177.0,177.0,187.0,174.0,174.0,186.0
2,ISIC_0000002,4.0,4.0,4.0,5.0,5.0,5.0,6.0,6.0,6.0,...,215.0,111.0,140.0,208.0,76.0,101.0,165.0,35.0,49.0,84.0
3,ISIC_0000003,222.0,221.0,226.0,219.0,219.0,221.0,217.0,217.0,219.0,...,219.0,225.0,225.0,227.0,225.0,225.0,227.0,225.0,225.0,227.0
4,ISIC_0000004,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24835,ISIC_0073247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
24836,ISIC_0073248,99.0,93.0,97.0,112.0,105.0,112.0,121.0,116.0,123.0,...,127.0,114.0,107.0,114.0,104.0,96.0,107.0,83.0,73.0,81.0
24837,ISIC_0073249,129.0,123.0,123.0,119.0,102.0,108.0,106.0,88.0,88.0,...,233.0,217.0,231.0,232.0,215.0,226.0,228.0,210.0,221.0,223.0
24838,ISIC_0073251,143.0,120.0,136.0,149.0,128.0,143.0,150.0,132.0,146.0,...,146.0,141.0,120.0,137.0,135.0,114.0,129.0,123.0,103.0,115.0


In [17]:
df_complete = df_cleaned.join(df_image.set_index('image'), on='image', how='left')
display(df_complete)

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,...,pixel_3062,pixel_3063,pixel_3064,pixel_3065,pixel_3066,pixel_3067,pixel_3068,pixel_3069,pixel_3070,pixel_3071
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,...,254.0,185.0,226.0,254.0,188.0,233.0,255.0,183.0,227.0,254.0
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,...,188.0,177.0,177.0,187.0,177.0,177.0,187.0,174.0,174.0,186.0
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,...,215.0,111.0,140.0,208.0,76.0,101.0,165.0,35.0,49.0,84.0
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,...,219.0,225.0,225.0,227.0,225.0,225.0,227.0,225.0,225.0,227.0
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,...,127.0,114.0,107.0,114.0,104.0,96.0,107.0,83.0,73.0,81.0
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,...,233.0,217.0,231.0,232.0,215.0,226.0,228.0,210.0,221.0,223.0
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,...,146.0,141.0,120.0,137.0,135.0,114.0,129.0,123.0,103.0,115.0


In [18]:
df_dump = df_complete.copy()
image_index = {name: i for i, name in enumerate(df_complete['image'])}
rev_image_index = {i: name for i, name in enumerate(df_complete['image'])}
df_dump['image'] = df_dump['image'].apply(lambda name: image_index[name])
df_dump = df_dump[columns_dump]

for cat in columns_cat:
    one_hot = pd.get_dummies(df_dump[cat], prefix=cat)
    df_dump = df_dump.drop(cat, axis=1)
    df_dump = df_dump.join(one_hot)

display(df_dump)
display(df_dump.describe())

Unnamed: 0,image,age_approx,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,...,anatom_site_general_unknown,anatom_site_general_upper extremity,sex_female,sex_male,anatomy_site_extremity,anatomy_site_head/neck,anatomy_site_oral/genital,anatomy_site_palms/soles,anatomy_site_torso,anatomy_site_unknown
0,0,55.0,144.0,191.0,233.0,162.0,202.0,237.0,169.0,207.0,...,0,0,1,0,0,0,0,0,1,0
1,1,30.0,135.0,134.0,142.0,150.0,148.0,151.0,159.0,158.0,...,0,0,1,0,0,0,0,0,1,0
2,2,60.0,4.0,4.0,4.0,5.0,5.0,5.0,6.0,6.0,...,0,1,1,0,1,0,0,0,0,0
3,3,30.0,222.0,221.0,226.0,219.0,219.0,221.0,217.0,217.0,...,0,1,0,1,1,0,0,0,0,0
4,4,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,24835,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,1,0,0,0,0
25327,24836,65.0,99.0,93.0,97.0,112.0,105.0,112.0,121.0,116.0,...,0,0,0,1,0,0,0,0,1,0
25328,24837,70.0,129.0,123.0,123.0,119.0,102.0,108.0,106.0,88.0,...,0,0,0,1,1,0,0,0,0,0
25329,24838,55.0,143.0,120.0,136.0,149.0,128.0,143.0,150.0,132.0,...,0,0,1,0,0,0,0,1,0,0


Unnamed: 0,image,age_approx,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,...,anatom_site_general_unknown,anatom_site_general_upper extremity,sex_female,sex_male,anatomy_site_extremity,anatomy_site_head/neck,anatomy_site_oral/genital,anatomy_site_palms/soles,anatomy_site_torso,anatomy_site_unknown
count,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,...,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0,24840.0
mean,12419.5,54.145934,123.411194,99.411636,102.223549,131.498428,106.42379,109.1409,137.425644,111.619522,...,0.097182,0.11558,0.46715,0.53285,0.314332,0.184018,0.002375,0.01566,0.386433,0.097182
std,7170.834679,17.974619,89.094986,72.416664,75.291954,86.965065,71.187691,74.116867,85.213074,70.088135,...,0.296211,0.319727,0.49893,0.49893,0.464259,0.387506,0.048679,0.12416,0.486942,0.296211
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6209.75,40.0,6.0,6.0,6.0,24.0,19.0,20.0,55.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12419.5,55.0,153.0,122.0,124.0,161.0,129.0,132.0,166.0,134.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,18629.25,70.0,201.0,158.0,166.0,203.0,161.0,169.0,205.0,164.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,24839.0,85.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# We use undersampling since we assume that the amount of data is more than sufficient and adding more synthetic data using oversampling is not that useful in our case
# Sample references to existing downsampling use-cases:
# https://www.quora.com/Is-it-a-good-idea-to-undersample-or-oversample-a-heavily-imbalanced-dataset-from-a-statistical-perspective
X_dump = df_dump.drop(columns=['label'])
display(X_dump)
y_dump = df_dump['label']
display(y_dump)

X_resampled, _, y_resampled, _ = train_test_split(X_dump, y_dump, test_size=TEST_SIZE, random_state=RANDOM_STATE)


# TODO change all cells from float to int
under_sampler = NearMiss()
X_resampled, y_resampled = under_sampler.fit_resample(X_resampled, y_resampled)
print(f'Resampled dataset shape {dict(Counter(y_resampled))}')

Unnamed: 0,image,age_approx,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,...,anatom_site_general_unknown,anatom_site_general_upper extremity,sex_female,sex_male,anatomy_site_extremity,anatomy_site_head/neck,anatomy_site_oral/genital,anatomy_site_palms/soles,anatomy_site_torso,anatomy_site_unknown
0,0,55.0,144.0,191.0,233.0,162.0,202.0,237.0,169.0,207.0,...,0,0,1,0,0,0,0,0,1,0
1,1,30.0,135.0,134.0,142.0,150.0,148.0,151.0,159.0,158.0,...,0,0,1,0,0,0,0,0,1,0
2,2,60.0,4.0,4.0,4.0,5.0,5.0,5.0,6.0,6.0,...,0,1,1,0,1,0,0,0,0,0
3,3,30.0,222.0,221.0,226.0,219.0,219.0,221.0,217.0,217.0,...,0,1,0,1,1,0,0,0,0,0
4,4,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,24835,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,1,0,0,0,0
25327,24836,65.0,99.0,93.0,97.0,112.0,105.0,112.0,121.0,116.0,...,0,0,0,1,0,0,0,0,1,0
25328,24837,70.0,129.0,123.0,123.0,119.0,102.0,108.0,106.0,88.0,...,0,0,0,1,1,0,0,0,0,0
25329,24838,55.0,143.0,120.0,136.0,149.0,128.0,143.0,150.0,132.0,...,0,0,1,0,0,0,0,1,0,0


0        1
1        1
2        0
3        1
4        0
        ..
25326    2
25327    4
25328    0
25329    1
25330    4
Name: label, Length: 24840, dtype: int64

Resampled dataset shape {0: 190, 1: 190, 2: 190, 3: 190, 4: 190, 5: 190, 6: 190, 7: 190}


In [20]:
images_seen = set(X_resampled['image'])
df_cleaned['is_seen'] = df_cleaned['image'].apply(lambda name: image_index[name] in images_seen)

df_seen = df_cleaned[df_cleaned['is_seen']].copy()
display(df_seen)
df_test = df_cleaned[df_cleaned['is_seen'] == False].copy()
display(df_test)

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label,anatomy_site,is_seen
2915,ISIC_0024318,65.0,lower extremity,HAM_0002450,female,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5,extremity,True
2967,ISIC_0024370,55.0,unknown,HAM_0001780,male,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,unknown,True
2983,ISIC_0024386,40.0,lower extremity,HAM_0005112,female,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5,extremity,True
3072,ISIC_0024475,35.0,head/neck,HAM_0003873,male,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,head/neck,True
3114,ISIC_0024517,65.0,posterior torso,HAM_0001894,male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7,torso,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25256,ISIC_0073141,45.0,anterior torso,BCN_0005520,female,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5,torso,True
25259,ISIC_0073144,75.0,anterior torso,BCN_0004091,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,True
25287,ISIC_0073193,35.0,anterior torso,BCN_0002147,male,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5,torso,True
25289,ISIC_0073195,70.0,lower extremity,BCN_0005492,male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7,extremity,True


Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,label,anatomy_site,is_seen
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,False
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,torso,False
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity,False
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,extremity,False
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,torso,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,head/neck,False
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,torso,False
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,extremity,False
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,palms/soles,False


In [23]:
images_seen = [rev_image_index[image_seen] for image_seen in images_seen]
with open("../src/data/state/images_seen.txt", "w") as seen_file:
    seen_file.write('\n'.join(images_seen))