University of Michigan

Master of Applied Data Science

SIADS699 - Capstone Project

Andre Onofre, Samantha Roska, Sawsan Allam

This Notebook: Smartphone Images – Dataset and Image Preparation

In [None]:
# Mount Drive
# Attaches Google Drive to the Colab runtime at /content/drive.
# force_remount=True asks Colab to mount again even if a previous mount is still attached.
# NOTE(review): every path used by the cells below is relative ('./...') and no visible cell
# changes the working directory after mounting -- confirm the notebook is run from the
# project folder, otherwise the relative paths will not resolve.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Import Libraries
import os
import pandas as pd
from PIL import Image
import numpy as np
from tqdm import tqdm

In [None]:
# Directories
# All paths are relative to the current working directory. Each one ends with '/'
# because the cells below build file names by plain string concatenation (DIR + 'name').
IMAGES_DIR = './Images/PAD-UFES-20-all-images-RGB/'  # images listed and opened when building the arrays
IMAGES_ARRAY_DIR = './Images_Arrays/'  # where the .npy image/label arrays are saved
META_DATA_DIR = './Metadata/'  # metadata CSVs are read from, and the combined CSV is written to, this folder
MODELS_DIR = './Models/'  # not referenced by the cells visible in this notebook
TRAINING_RESULTS_DIR = './Training_Results/'  # not referenced by the cells visible in this notebook
TEST_IMAGES_DIR = './Images/Images_for_tests/'  # not referenced by the cells visible in this notebook

In [None]:
# Load PAD dataset (2298 images)
# Keeps only the image id and the diagnosis, then derives the '.jpg' file name.
# Written as one chain so that no column is assigned on a subset of another frame
# (which can make pandas emit SettingWithCopyWarning) and so the cell produces the
# same result every time it is re-run.
# NOTE: the column name 'RBG_img_Id' (sic) is kept exactly as-is because the later
# cells and the saved CSV rely on that spelling.
df_PAD = (
    pd.read_csv(META_DATA_DIR + 'PAD-UFES-20-Metadata.csv')  # Read CSV file
    .loc[:, ['img_id', 'diagnostic']]  # Select only the useful columns
    .assign(
        # Swap the original extension for '.jpg'. os.path.splitext removes whatever
        # extension is present instead of blindly cutting the last four characters.
        RBG_img_Id=lambda frame: frame['img_id'].map(
            lambda img_id: os.path.splitext(img_id)[0] + '.jpg'
        )
    )
    .drop(columns='img_id')  # The raw id is no longer needed
)
print('PAD Records: ', len(df_PAD))

In [None]:
# Load Monkeypox dataset (102 images)
# Renames 'Label' to 'diagnostic', keeps only the rows labelled 'Monkeypox', derives the
# '.jpg' file name from 'ImageID' and relabels the rows as 'MPX'.
# Filtering happens before the new columns are created and everything is done in one
# chain, so nothing is assigned on a filtered slice (the original pattern can trigger
# SettingWithCopyWarning). Resulting columns, their order and the row index are unchanged.
# NOTE: the column name 'RBG_img_Id' (sic) is kept exactly as-is because the later
# cells and the saved CSV rely on that spelling.
df_MPX = (
    pd.read_csv(META_DATA_DIR + 'Monkeypox_metadata.csv')  # Read CSV file
    .rename(columns={'Label': 'diagnostic'})  # Same column name as the PAD frame
    .loc[lambda frame: frame['diagnostic'] == 'Monkeypox']  # Select only the MPX records
    .assign(
        # Create filename with extension
        RBG_img_Id=lambda frame: frame['ImageID'] + '.jpg',
        # Rename records from 'Monkeypox' to 'MPX'
        diagnostic=lambda frame: frame['diagnostic'].replace('Monkeypox', 'MPX'),
    )
    .drop(columns='ImageID')  # The raw id is no longer needed
)
print('MPX Records: ', len(df_MPX))

In [None]:
# Concat Datasets
# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it the PAD
# rows and the MPX rows keep the row labels of their source frames, so labels can repeat
# and the index column written when the frame is saved to CSV is ambiguous.
df = pd.concat([df_PAD, df_MPX], ignore_index=True)
print('Total Records: ', len(df))

In [None]:
# Map every diagnostic code to its 7-element one-hot list.
# The position of the 1 follows the order of the codes in the tuple below.
classes_dict = {
    code: [1 if slot == position else 0 for slot in range(7)]
    for position, code in enumerate(('ACK', 'BCC', 'MEL', 'NEV', 'SCC', 'SEK', 'MPX'))
}

# Attach the one-hot list of each row's diagnostic as the 'class' column.
# A diagnostic that is not a key of classes_dict raises KeyError.
df['class'] = df['diagnostic'].apply(classes_dict.__getitem__)

In [None]:
# Write the combined PAD + MPX metadata (including the 'class' column) to the metadata folder
one_hot_csv_path = META_DATA_DIR + 'PAD-UFES-20-Metadata-One-Hot.csv'
df.to_csv(one_hot_csv_path)

In [None]:
# Count the entries in the images directory (2400 images expected)
image_file_count = len(os.listdir(IMAGES_DIR))
print(image_file_count)

In [None]:
# Prepare Array with all images resized and stacked + labels array
# For every file in IMAGES_DIR: open it, force RGB, resize, cast to float32 and collect it
# together with the one-hot label found in df. Produces:
#   images_array -- one row per image, in sorted file-name order
#   y_array      -- the matching one-hot labels, in the same order
TARGET_SIZE = (112, 112)  # (width, height) handed to PIL's resize

# Sorted so the row order of the arrays is reproducible between runs
# (os.listdir returns the entries in arbitrary order).
list_of_images = sorted(os.listdir(IMAGES_DIR))

# Build the file-name -> label lookup once instead of filtering the whole frame for
# every image. File names that occur more than once in the metadata are remembered,
# because they have no single label.
label_by_filename = {}
ambiguous_filenames = set()
for metadata_name, one_hot in zip(df['RBG_img_Id'], df['class']):
  if metadata_name in label_by_filename:
    ambiguous_filenames.add(metadata_name)
  label_by_filename[metadata_name] = one_hot

combined_images_list = []
y = []
for image in tqdm(list_of_images):
  # Fail with a readable message when a file has no label or more than one label.
  if image not in label_by_filename:
    raise ValueError("No metadata row found for image file '" + image + "'")
  if image in ambiguous_filenames:
    raise ValueError("More than one metadata row found for image file '" + image + "'")

  filename = os.path.join(IMAGES_DIR, image)
  # The context manager closes the underlying file as soon as the pixels are read.
  with Image.open(filename) as img:
    # convert('RGB') guarantees three channels, so every array has the same shape.
    img_resized = img.convert('RGB').resize(TARGET_SIZE)

  # Only the dtype changes here: pixel values are cast to float32, not rescaled.
  img_array = np.asarray(img_resized, dtype='float32')
  combined_images_list.append(img_array)
  y.append(label_by_filename[image])

images_array = np.array(combined_images_list)
y_array = np.array(y)
print(images_array.shape)
print(y_array.shape)

In [None]:
# Persist the image array and the label array as .npy files in the arrays folder
arrays_to_save = {
    'PAD-UFES-20_images_array_112_112.npy': images_array,
    'PAD-UFES-20_labels_array_112_112.npy': y_array,
}
for npy_name, payload in arrays_to_save.items():
  np.save(IMAGES_ARRAY_DIR + npy_name, payload)
print('Arrays Saved!')