In [19]:
'''Use multiple rounds to get more robust results'''
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, balanced_accuracy_score
import matplotlib.pyplot as plt
import torchvision
import shutil
import os

In [38]:
def cal_metrics(csv_path, type_indices, is_binary=False):
    '''
    Calculate accuracy and fairness metrics from a validation-results CSV.

    The CSV must provide the columns 'label', 'prediction' and 'fitzpatrick'
    (plus 'prediction_probability' when is_binary is True). Class ids are
    assumed to be non-negative integers and 'fitzpatrick' is expected to be
    in 1..6; rows whose skin type falls outside that range (or is missing)
    are left out of the per-skin-type tallies instead of being wrapped onto
    another skin type by negative indexing.

    Parameters
    ----------
    csv_path : path or file-like object accepted by pandas.read_csv
    type_indices : list of int
        Zero-based skin-type indices (fitzpatrick - 1) to evaluate.
    is_binary : bool
        When True, also compute the ROC AUC from 'prediction_probability'.

    Returns
    -------
    dict with keys
        'acc_avg'      : overall accuracy over the selected skin types
        'acc_per_type' : array with one accuracy per entry of type_indices
        'PQD'          : min(acc_per_type) / max(acc_per_type)
        'DPM'          : mean over classes of (min / max across skin types)
                         of the share of predictions assigned to that class
        'EOM'          : mean over classes of (min / max across skin types)
                         of the per-class accuracy
        'AUC'          : ROC AUC over all rows if is_binary, otherwise -1

    Notes
    -----
    A selected skin type with no samples, or a (skin type, class) cell with
    no labels, produces 0/0 = NaN, which propagates into the affected
    metric (numpy emits a RuntimeWarning).
    '''
    n_skin_types = 6  # fitzpatrick values 1..6 map to row indices 0..5

    df = pd.read_csv(csv_path)
    raw_labels = df['label'].to_numpy()
    raw_predictions = df['prediction'].to_numpy()
    labels = raw_labels.astype(int)
    predictions = raw_predictions.astype(int)

    # Size the class axis from the largest id seen in either column, so a
    # prediction of a class that never occurs as a label (or non-contiguous
    # label ids) cannot index out of bounds.
    if len(df):
        n_classes = int(max(labels.max(), predictions.max())) + 1
    else:
        n_classes = 0

    # Only rows with a valid skin type take part in the per-type tallies.
    fitz = pd.to_numeric(df['fitzpatrick'], errors='coerce').to_numpy(dtype=float)
    known = (fitz >= 1) & (fitz < n_skin_types + 1)  # NaN compares False
    skin_idx = fitz[known].astype(int) - 1
    known_labels = labels[known]
    known_predictions = predictions[known]
    hit = (raw_predictions == raw_labels)[known]

    labels_array = np.zeros((n_skin_types, n_classes))
    correct_array = np.zeros((n_skin_types, n_classes))
    predictions_array = np.zeros((n_skin_types, n_classes))
    if skin_idx.size:
        # np.add.at accumulates repeated (row, col) pairs, unlike `arr[idx] += 1`.
        np.add.at(labels_array, (skin_idx, known_labels), 1)
        np.add.at(predictions_array, (skin_idx, known_predictions), 1)
        np.add.at(correct_array, (skin_idx[hit], known_labels[hit]), 1)

    correct_array = correct_array[type_indices]
    labels_array = labels_array[type_indices]
    predictions_array = predictions_array[type_indices]

    # avg acc, acc per type
    correct_array_sumc, labels_array_sumc = np.sum(correct_array, axis=1), np.sum(labels_array, axis=1)  # sum skin conditions
    acc_array = correct_array_sumc/labels_array_sumc
    avg_acc = np.sum(correct_array)/np.sum(labels_array)

    # PQD
    PQD = acc_array.min()/acc_array.max()

    # DPM
    demo_array = predictions_array/np.sum(predictions_array, axis=1, keepdims=True)
    DPM = np.mean(demo_array.min(axis=0)/demo_array.max(axis=0))

    # EOM
    eo_array = correct_array/labels_array
    EOM = np.mean(np.min(eo_array, axis=0)/np.max(eo_array, axis=0))

    # if is binary classification, output AUC
    if is_binary:
        # 'prediction_probability' is the confidence of the predicted class;
        # convert it to the probability of the positive class (1).
        probability = df['prediction_probability'].to_numpy()
        positive_scores = np.where(raw_predictions == 0, 1.0 - probability, probability)
        # NOTE: the AUC uses every row of the CSV, not only type_indices.
        fpr, tpr, _ = roc_curve(df['label'], positive_scores, drop_intermediate=True)
        AUC = auc(fpr, tpr)
    else:
        AUC = -1

    return {'acc_avg': avg_acc, 'acc_per_type': acc_array, 'PQD': PQD, 'DPM': DPM, 'EOM': EOM, 'AUC': AUC}

In [53]:
# ---- Experiment configuration ----
epoch = 15
label = 'high'
holdout_set = 'random_holdout' # dermaamin br
model_name = 'DisCo'
type_indices = [0, 1, 2]
csv_folder_list = ['S36', 'S37', 'S38', 'S39', 'S40']
is_binary = True

# ---- Per-round result buffers: one slot (or one row) per results folder ----
n_rounds = len(csv_folder_list)
avg_array, PQD_array, DPM_array, EOM_array, AUC_array = (np.zeros(n_rounds) for _ in range(5))
acc_per_type_array = np.zeros((n_rounds, len(type_indices)))

# ---- Evaluate each round's results CSV and store its metrics ----
for i, folder in enumerate(csv_folder_list):
    csv_path = 'results/{}/results_{}_{}_{}_{}.csv'.format(folder, model_name, epoch, label, holdout_set)
    dic = cal_metrics(csv_path, type_indices, is_binary)
    avg_array[i] = dic['acc_avg']
    acc_per_type_array[i, :] = dic['acc_per_type']
    PQD_array[i] = dic['PQD']
    DPM_array[i] = dic['DPM']
    EOM_array[i] = dic['EOM']
    AUC_array[i] = dic['AUC']

# ---- Dump the raw per-round values, one heading line followed by the array ----
per_round_report = (
    ('acc_avg array', avg_array),
    ('acc per type', acc_per_type_array),
    ('PQD', PQD_array),
    ('DPM', DPM_array),
    ('EOM', EOM_array),
    ('AUC', AUC_array),
)
for heading, values in per_round_report:
    print(heading)
    print(values)

acc_avg array
[0.83333333 0.79545455 0.79545455 0.81818182 0.82575758]
acc per type
[[0.83783784 0.84313725 0.81818182]
 [0.78947368 0.72340426 0.87234043]
 [0.79069767 0.76086957 0.8372093 ]
 [0.88372093 0.75555556 0.81818182]
 [0.81818182 0.73913043 0.92857143]]
PQD
[0.97040169 0.82926829 0.90881643 0.85497076 0.79598662]
DPM
[0.83619984 0.63778929 0.62222222 0.81963342 0.82633372]
EOM
[0.83455882 0.63591954 0.68428571 0.72479947 0.46153846]
AUC
[0.84771723 0.72021718 0.7974537  0.81539444 0.72934579]


In [54]:
# Summarise every metric across rounds as mean and standard deviation.
# np.std uses its default ddof=0 here, i.e. the population standard deviation.
print('average accuracy mean: {}, std: {}'.format(np.mean(avg_array), np.std(avg_array)))

# Per-skin-type accuracy: aggregate over rounds (axis 0), one value per skin type.
per_type_mean = np.mean(acc_per_type_array, axis=0)
per_type_std = np.std(acc_per_type_array, axis=0)
print('accuracy per skin type mean and std')
print(per_type_mean, per_type_std)

# Remaining metrics hold one scalar per round and share the same report layout.
scalar_reports = (
    ('PQD mean: {}, std: {}', PQD_array),
    ('DPM mean: {}, std: {}', DPM_array),
    ('EOM mean: {}, std: {}', EOM_array),
    ('AUC mean: {}, std: {}', AUC_array),
)
for template, per_round in scalar_reports:
    print(template.format(np.mean(per_round), np.std(per_round)))

average accuracy mean: 0.8136363636363637, std: 0.015599439607556097
accuracy per skin type mean and std
[0.82398239 0.76441941 0.85489696] [0.03490856 0.04149565 0.04181162]
PQD mean: 0.8718887582886241, std: 0.06153423044630146
DPM mean: 0.7484356965797874, std: 0.09696621137513113
EOM mean: 0.6682204009648229, std: 0.12238343459550506
AUC mean: 0.782025670999462, std: 0.04952239652417176
