University of Michigan

Master of Applied Data Science

SIADS699 - Capstone Project

Andre Onofre, Samantha Roska, Sawsan Allam

This Notebook: Dermatoscopic Images: Dataset and Images Preparation

In [None]:
# Mount Drive
# Colab-only step: attaches Google Drive at /content/drive for this runtime.
from google.colab import drive
# force_remount=True requests a fresh mount even if Drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Import Libraries
import os
import pandas as pd
from PIL import Image
import numpy as np
from tqdm import tqdm

In [None]:
# Directories
# Every path is relative to the current working directory and ends with '/',
# so cells below can form a file path by appending a file name.
# NOTE(review): Drive is mounted at /content/drive, but nothing visible here changes the
# working directory -- confirm the notebook is run from the project folder.
IMAGES_DIR = './Images/BCN20000-all-images-original/bcn_20k_train/'  # source images to resize
IMAGES_ARRAY_DIR = './Images_Arrays/'  # destination of the saved .npy arrays
META_DATA_DIR = './Metadata/'  # metadata CSV files (read and written below)
MODELS_DIR = './Models/'  # not referenced in the visible cells
TEST_IMAGES_DIR = './Images/Images_for_tests/'  # not referenced in the visible cells
TRAINING_RESULTS_DIR = './Training_Results/'  # not referenced in the visible cells

In [None]:
# Read the original BCN20000 metadata CSV into a DataFrame
metadata_csv_path = META_DATA_DIR + 'BCN20000_metadata_original.csv'
df = pd.read_csv(metadata_csv_path)

In [None]:
# One-hot label vector for every diagnosis code.
# A code's position in this tuple is the index of the 1 in its vector.
_diagnosis_codes = ('AK', 'BCC', 'BKL', 'DF', 'MEL', 'NV', 'SCC', 'VASC')
classes_dict = {
    code: [1 if column == position else 0 for column in range(len(_diagnosis_codes))]
    for position, code in enumerate(_diagnosis_codes)
}

In [None]:
# Add a one-hot 'class' column derived from 'diagnosis', then keep only the columns needed downstream
# Fail early and list every diagnosis value that has no one-hot encoding
# (str() so a missing value can be sorted alongside the string codes)
unmapped_diagnoses = set(df['diagnosis'].unique()) - set(classes_dict)
if unmapped_diagnoses:
  raise KeyError(f'Diagnosis values without a one-hot class: {sorted(map(str, unmapped_diagnoses))}')
# list(...) gives each row its own vector instead of sharing the list stored in classes_dict
df['class'] = df['diagnosis'].apply(lambda diagnostic: list(classes_dict[diagnostic]))
# .copy() makes the reduced frame independent of the full metadata frame
df = df[['bcn_filename', 'diagnosis', 'class']].copy()

In [None]:
# Write the metadata frame with its one-hot 'class' column to CSV
# NOTE(review): to_csv is called with defaults, so the DataFrame index is written as an
# extra leading column -- confirm downstream readers expect that.
one_hot_csv_path = META_DATA_DIR + 'BCN20000-Metadata-One-Hot.csv'
df.to_csv(one_hot_csv_path)

In [None]:
# Sanity check: number of entries in the images directory (12413 expected)
image_file_count = len(os.listdir(IMAGES_DIR))
print(image_file_count)

In [None]:
# Prepare Array with all images resized and stacked + labels array
IMAGE_SIZE = (112, 112)  # (width, height) every image is resized to

# os.listdir returns names in arbitrary order; sort so the saved arrays are reproducible
list_of_images = sorted(os.listdir(IMAGES_DIR))

# Build the filename -> one-hot label lookup once instead of filtering df for every image
if not df['bcn_filename'].is_unique:
  raise ValueError('Duplicate bcn_filename values in metadata: labels would be ambiguous')
label_by_filename = dict(zip(df['bcn_filename'], df['class']))

# Fail before the long loop if the directory holds files with no metadata row
files_without_label = [name for name in list_of_images if name not in label_by_filename]
if files_without_label:
  raise ValueError(
      f'{len(files_without_label)} file(s) in {IMAGES_DIR} have no metadata row, '
      f'e.g. {files_without_label[:5]}')

combined_images_list = []
y = []
for image in tqdm(list_of_images):
  filename = os.path.join(IMAGES_DIR, image)
  # Context manager releases the file handle once the pixels have been read
  with Image.open(filename) as img:
    # Force 3 channels so grayscale/RGBA/CMYK files cannot break the final stack
    img_resized = img.convert('RGB').resize(IMAGE_SIZE)
  combined_images_list.append(np.asarray(img_resized, dtype=np.float32))
  y.append(label_by_filename[image])
images_array = np.array(combined_images_list)
y_array = np.array(y)
# Images and labels must stay aligned row by row
assert images_array.shape[0] == y_array.shape[0]
print(images_array.shape)
print(y_array.shape)

In [None]:
# Persist the stacked images and their labels as .npy files
images_npy_path = IMAGES_ARRAY_DIR + 'BCN20000_images_array (112_112).npy'
labels_npy_path = IMAGES_ARRAY_DIR + 'BCN20000_labels_array (112_112).npy'
np.save(images_npy_path, images_array)
np.save(labels_npy_path, y_array)
print('Arrays Saved!')