# Imports

In [1]:
# %pip install pandas
# %pip install matplotlib
# %pip install opencv-python-headless
# %pip install scikit-image
# %pip install basic-image-eda
# %pip install seaborn
# %pip install torchvision
# %pip install sklearn
# %pip install pandas_profiling
# %pip install awswrangler
# %pip install ipywidgets

In [2]:
import os
import glob
import shutil
import time
import copy

import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np

import cv2

import matplotlib.pyplot as plt
import matplotlib.image as mp_image
import seaborn as sns

from IPython.display import Image, display

from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms
import torchvision.models as models

import torch
from torch.utils.data import DataLoader
from torch.utils.mobile_optimizer import optimize_for_mobile
import torch.nn as nn

from sklearn.metrics import confusion_matrix

import awswrangler as wr

# Data Load

In [3]:
s3_path = f's3://rubyhan-w210-datasets/full_data.csv'
full_df = wr.s3.read_csv(path=s3_path, index_col=0).rename(columns={'duplicated':'duplicate', 'class':'label'})

# ProfileReport(full_df)

In [4]:
full_df

Unnamed: 0,image_id,diagnosis,age,sex,localization,source,severity,path,label,duplicate,dataset,split_1,split_2,split_3
288,fissure-2,eczema photos,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Nail Fungus and other Nai...,Non-Cancerous Skin Condition,True,,,,
319,id-reaction-10,eczema photos,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Tinea Ringworm Candidiasi...,Non-Cancerous Skin Condition,True,,,,
344,id-reaction-7,eczema photos,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Tinea Ringworm Candidiasi...,Non-Cancerous Skin Condition,True,,,,
363,fissure-5,eczema photos,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Nail Fungus and other Nai...,Non-Cancerous Skin Condition,True,,,,
391,id-reaction-1,eczema photos,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Tinea Ringworm Candidiasi...,Non-Cancerous Skin Condition,True,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19554,spitz-nevus-7,melanoma skin cancer nevi and moles,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Melanoma Skin Cancer Nevi...,Potentially Malignant Skin Tumors,False,val,,train,val
19555,nevus-spilus-17,melanoma skin cancer nevi and moles,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Melanoma Skin Cancer Nevi...,Potentially Malignant Skin Tumors,False,train,,train,train
19556,melanotic-macule-1,melanoma skin cancer nevi and moles,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Melanoma Skin Cancer Nevi...,Potentially Malignant Skin Tumors,False,train,,train,train
19557,nevus-repigmented-4,melanoma skin cancer nevi and moles,0.0,unknown,unknown,dermnet,unknown,./Data/dermnet/train/Melanoma Skin Cancer Nevi...,Potentially Malignant Skin Tumors,False,train,,val,train


# Transforming Tabular to Folder Data Structure

In [5]:
# print(len(full_df[(full_df.source != 'ISIC_2020') & (full_df.dataset == 'train')]))
# print(len(full_df[(full_df.source != 'ISIC_2020') & (full_df.dataset == 'val')]))
# print(len(full_df[(full_df.source != 'ISIC_2020') & (full_df.dataset == 'test')]))

print(len(full_df[(full_df.source != 'ISIC_2020') & (full_df.split_3 == 'train')]))
print(len(full_df[(full_df.source != 'ISIC_2020') & (full_df.split_3 == 'val')]))
print(len(full_df[(full_df.source != 'ISIC_2020') & (full_df.split_3 == 'test')]))

10418
2103
1393


In [None]:
# full_df[full_df.path.str.contains('acne-keloidalis-1.jpg')]

In [13]:
cd ~/teledermatologyAI_capstone

/home/studio-lab-user/teledermatologyAI_capstone


In [23]:
class_list = full_df.label.unique().tolist()
class_list.remove('Autoimmue Disorder')
data_dir = 'data_class_folder2'

!mkdir -p $data_dir/train $data_dir/val $data_dir/test
for label in class_list:
    !mkdir -p $data_dir/train/"$label" 
    !mkdir -p $data_dir/val/"$label" 
    !mkdir -p $data_dir/test/"$label"

In [24]:
for i, row in full_df.iterrows():
    try:
        shutil.copy(row.path, f'{data_dir}/{row.split_3}/{row.label}')
    except FileNotFoundError as e:
        pass

In [25]:
# credits: https://github.com/yuliyabohdan/Skin-diseases-classification-Dermnet-/blob/main/skin_diseases_clas_ResNet50.ipynb

DIR = 'data_class_folder2'
DIR_TRAIN = f'{DIR}/train/'
DIR_VAL = f'{DIR}/val/'
DIR_TEST = f'{DIR}/test/' 

classes = os.listdir(DIR_TRAIN)
print(f'Total classes: {len(classes)}')

# total train, val and test images
train_count = 0
val_count = 0
test_count = 0

classes_df = []
for _class in classes:
    class_dict = {}
    train_count += len(os.listdir(DIR_TRAIN + _class))
    val_count += len(os.listdir(DIR_VAL + _class))
    test_count += len(os.listdir(DIR_TEST + _class))
    class_dict.update({'Class': _class, 
                       'Train': len(os.listdir(DIR_TRAIN + _class)),
                       'Val': len(os.listdir(DIR_VAL + _class)),
                       'Test': len(os.listdir(DIR_TEST + _class)) })
    classes_df.append(class_dict)

print(f'Total num train images: {train_count}')
print(f'Total num val images: {val_count}')
print(f'Total num test images: {test_count}')
print(pd.DataFrame(classes_df))

Total classes: 5
Total num train images: 10418
Total num val images: 2103
Total num test images: 1393
                                               Class  Train  Val  Test
0                       Non-Cancerous Skin Condition   2671  534   395
1  Toxin, Fungal, Bug, Viral, or Bacterial Infect...   2702  540   358
2                  Potentially Malignant Skin Tumors   2274  452   290
3                             Benign Marking or Mole   1871  392   246
4                                       Unclassified    900  185   104


In [26]:
# map class labels to integer index

train_imgs = []
val_imgs = []
test_imgs = []

for _class in classes:
    
    for img in os.listdir(DIR_TRAIN + _class):
        train_imgs.append(f'{DIR_TRAIN}{_class}/{img}')
    
    for img in os.listdir(DIR_VAL + _class):
        val_imgs.append(f'{DIR_VAL}{_class}/{img}')
    
    for img in os.listdir(DIR_TEST + _class):
        test_imgs.append(f'{DIR_TEST}{_class}/{img}')

classToInt = {classes[i]: i for i in range(len(classes))}
intToClass = dict(map(reversed, classToInt.items()))

for k, v in classToInt.items():
    print(k)

Non-Cancerous Skin Condition
Toxin, Fungal, Bug, Viral, or Bacterial Infections
Potentially Malignant Skin Tumors
Benign Marking or Mole
Unclassified


# Data Split/Transforms

In [27]:
train_dataset = ImageFolder(root = DIR_TRAIN, transform=transforms.Compose([
    transforms.RandomRotation([-8, +8]),                                           # if augmentation
    transforms.ColorJitter(brightness=0, contrast=0.4, saturation=0, hue=0),      # if augmentation
    transforms.RandomHorizontalFlip(),                                            # if augmentation
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.676, 0.542, 0.519], std=[0.290, 0.226, 0.237])
]))

valid_dataset = ImageFolder(root = DIR_VAL, transform=transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.676, 0.542, 0.519], std=[0.290, 0.226, 0.237])
]))

test_dataset = ImageFolder(root = DIR_TEST, transform=transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.676, 0.542, 0.519], std=[0.290, 0.226, 0.237])
]))

In [28]:
# test_size = int(0.5 * len(test_dataset))
# valid_size = len(test_dataset) - test_size
# valid_dataset, test_dataset = torch.utils.data.random_split(test_dataset, 
#                                                             [valid_size, test_size])

# Train/Val Data Loader

In [29]:
dataloaders_dict = {}
dataloaders_dict['train'] = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=24)
dataloaders_dict['val'] = DataLoader(valid_dataset, batch_size=64, shuffle=False, num_workers=24, drop_last=False)
dataloader_test = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=24, drop_last=False)



# Modeling

In [30]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs):
    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0


    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:               
                model.eval()   # Set model to evaluate mode
                #update_bn_stats(model=model, data_loader=dataloaders[phase])  # if update_bn_stats
                
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                      # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

               # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
   
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [31]:
def test_model(model, dl, normalize=True):
    model.eval()
    true_labels = []
    predictions = []
    total = 0
    num_correct = 0
    with torch.no_grad():
        for images, labels in dl:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = torch.argmax(outputs.data,-1)        
            true_labels.append(labels.cpu().numpy())
            predictions.append(predicted.cpu().numpy())
            total += labels.size(0)
            num_correct += (predicted == labels).sum()
        print(f'Test Accuracy of the model: {float(num_correct)/float(total)*100:.2f}')    
        true_labels = np.hstack(true_labels)
        predictions = np.hstack(predictions)

    return true_labels, predictions

In [32]:
# x, y - find the img from class x labelled as class y 
def test(model, dl, x, y, normalize=True):
    model.eval()
    true_labels = []
    predictions = []
    images_list = []

    with torch.no_grad():
        for images, labels in dl:
            images_list.append(images.cpu().numpy())
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = torch.argmax(outputs.data,-1)        
            true_labels.append(labels.cpu().numpy())
            predictions.append(predicted.cpu().numpy())
    
    for n in range(60):
        for i in range(32):
            if (true_labels[n][i] == x)  & (predictions[n][i] == y):
                #inv_tensor = inv_normalize(image_list[n][i]])
                plt.imshow(np.transpose(images_list[n][i], (1, 2, 0)))
                plt.show()

In [33]:
# Number of epochs to train for
num_epochs = 5 #100

model = models.resnet50(weights='DEFAULT')
model.fc = nn.Linear(2048, 6, bias=True)

# Detect if we have a GPU available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0001
)
# Setup the loss fxn
criterion = nn.CrossEntropyLoss()

cuda:0


In [None]:
# Train and evaluate
model, hist = train_model(model, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

Epoch 0/4
----------
train Loss: 1.0975 Acc: 0.5688
val Loss: 0.9525 Acc: 0.6301

Epoch 1/4
----------


In [None]:
model_name = 'merged_resnet50'
torch.save(model, f'model/{model_name}.pt')

In [None]:
model = torch.load(f'model/{model_name}.pt')

In [None]:
true_labels, predictions = test_model(model, dataloader_test, normalize=True)
c_matrix = confusion_matrix(true_labels, predictions)
plt.figure(figsize=(12, 12))
plt.title("Confusion matrix")
sns.heatmap(c_matrix, cmap='Blues', annot=True, xticklabels=classes, yticklabels=classes, fmt='g', cbar=True)
plt.xlabel('predictions')
plt.ylabel('true labels')
plt.show()

In [None]:
# # False prediction
# test(model, dataloader_test, 2, 0) #(potentially malignant skin tumors, non-cancerous skin condition)

In [None]:
# # Correct prediction of autoimmune disorder
# test(model, dataloader_test, 3, 3)

In [None]:
# Trace model

# must be the same size a minibatch with 1 example image
example = torch.rand(1, 3, 224, 224)

# move model back to cpu, do tracing, and optimize
model_conv = model.to('cpu')
traced_script_module = torch.jit.trace(model_conv, example)
torchscript_model_optimized = optimize_for_mobile(traced_script_module)

# save optimized model for mobile
PATH = f'model/{model_name}_traced.pt'
torchscript_model_optimized.save(PATH)

In [None]:
!tar -czf model/model.tar.gz model/merged_resnet50.pt

In [None]:
!aws s3 cp model/model.tar.gz s3://rubyhan-w210-datasets/model/