# ResNet's diagnostic performance

This notebook contains the code to compute the F1, precision, recall, specificity, NPV and accuracy scores for model predictions on the neurips2021 test set. It is assumed that the predictions have already been made and that they are available in .csv files (which are actually pickle files).

Note: Make sure to use the tf2.5 (or tf2.3 on neurips2021-{1-3}) venv/kernel. Otherwise the pandas version might be too old to read the result files.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import classification_report, recall_score, precision_score, confusion_matrix, accuracy_score


In [2]:
# Directory holding the prediction files, and the five file names resnet50_0 ... resnet50_4.
base_model_pred_path = Path('/home/ubuntu/store/resnet-final-size')
model_pred_fn = [Path(f'resnet50_{model_idx}_preds.csv') for model_idx in range(5)]
model_pred_fn

[PosixPath('resnet50_0_preds.csv'),
 PosixPath('resnet50_1_preds.csv'),
 PosixPath('resnet50_2_preds.csv'),
 PosixPath('resnet50_3_preds.csv'),
 PosixPath('resnet50_4_preds.csv')]

In [3]:
# Despite the .csv extension the prediction files are pickled DataFrames, hence read_pickle.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
dfs = [pd.read_pickle(base_model_pred_path / pred_fn) for pred_fn in model_pred_fn]

In [4]:
# Display the prediction table of the first model (columns: actual, pred, filenames).
dfs[0]

Unnamed: 0,actual,pred,filenames
0,0,3,Acne/032439HB.jpeg
1,0,2,Acne/032462HB.jpeg
2,0,3,Acne/032653HB.jpeg
3,0,5,Acne/032720HB.jpeg
4,0,0,Acne/032879HB.jpeg
...,...,...,...
561,5,4,Vitiligo/vitiligo-66--WatermarkedWyJXYXRlcm1hc...
562,5,0,Vitiligo/vitiligo1--WatermarkedWyJXYXRlcm1hcmt...
563,5,4,Vitiligo/vitiligo2--WatermarkedWyJXYXRlcm1hcmt...
564,5,2,Vitiligo/vitiligo3--WatermarkedWyJXYXRlcm1hcmt...


In [5]:
# Load the list of the 525 images that should be kept; the predictions are restricted to these
# images in the next cell, using the `image_id` column.
include_images_csv = './include_images_525.csv'
include_images = pd.read_csv(include_images_csv)
include_images.head()

Unnamed: 0.1,Unnamed: 0,image_id
0,0,005103HB.json
1,1,016003HB.json
2,2,016013HB.json
3,3,016139HB.json
4,4,016263HB.json


In [6]:
# Keep only the predictions whose image is listed in `include_images`.
# Both sides are compared by file stem (the text before the first '.'):
#   include list:  '005103HB.json'      -> '005103HB'
#   predictions:   'Acne/032439HB.jpeg' -> '032439HB'   (class folder stripped first)
# The set of included stems is built once, instead of being recomputed for every prediction row.
included_stems = set(include_images.image_id.str.split('.').str[0])

for i in range(len(dfs)):
    pred_stems = dfs[i].filenames.str.split('/').str[1].str.split('.').str[0]
    # Convert the mask to a plain boolean array so rows are selected by position,
    # exactly like the list-of-booleans mask this replaces.
    dfs[i] = dfs[i].loc[pred_stems.isin(included_stems).to_numpy()]

dfs[0]

Unnamed: 0,actual,pred,filenames
0,0,3,Acne/032439HB.jpeg
1,0,2,Acne/032462HB.jpeg
2,0,3,Acne/032653HB.jpeg
3,0,5,Acne/032720HB.jpeg
4,0,0,Acne/032879HB.jpeg
...,...,...,...
561,5,4,Vitiligo/vitiligo-66--WatermarkedWyJXYXRlcm1hc...
562,5,0,Vitiligo/vitiligo1--WatermarkedWyJXYXRlcm1hcmt...
563,5,4,Vitiligo/vitiligo2--WatermarkedWyJXYXRlcm1hcmt...
564,5,2,Vitiligo/vitiligo3--WatermarkedWyJXYXRlcm1hcmt...


In [7]:
# Per-class precision, recall and f1-score for every model, obtained from scikit-learn's
# classification_report as nested dicts. Specificity and NPV are not produced by that function
# and are added to these dicts in a separate cell below.
class_names = [
    'acne',
    'actinic_keratosis',
    'psoriasis_no_pustular',
    'seborrheic_dermatitis',
    'vitiligo',
    'wart',
]
class_labels = list(range(len(class_names)))  # integer labels 0..5, in class_names order

per_model_res = [
    classification_report(
        model_df.actual,
        model_df.pred,
        labels=class_labels,
        target_names=class_names,
        output_dict=True,
    )
    for model_df in dfs
]


In [8]:
# Inspect the classification report dict of the first model.
per_model_res[0]

{'acne': {'precision': 0.7407407407407407,
  'recall': 0.40404040404040403,
  'f1-score': 0.5228758169934641,
  'support': 99},
 'actinic_keratosis': {'precision': 0.7916666666666666,
  'recall': 0.2087912087912088,
  'f1-score': 0.33043478260869563,
  'support': 91},
 'psoriasis_no_pustular': {'precision': 0.3514644351464435,
  'recall': 0.865979381443299,
  'f1-score': 0.5,
  'support': 97},
 'seborrheic_dermatitis': {'precision': 0.5287356321839081,
  'recall': 0.5897435897435898,
  'f1-score': 0.5575757575757575,
  'support': 78},
 'vitiligo': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 74},
 'wart': {'precision': 0.21568627450980393,
  'recall': 0.2558139534883721,
  'f1-score': 0.23404255319148937,
  'support': 86},
 'accuracy': 0.40190476190476193,
 'macro avg': {'precision': 0.43804895820792716,
  'recall': 0.38739475625114567,
  'f1-score': 0.35748815172823445,
  'support': 525},
 'weighted avg': {'precision': 0.4557284745283676,
  'recall': 0.401904761904761

In [9]:
# Add specificity, NPV and accuracy to every class entry of every model's report.
# Each class is scored one-vs-rest: labels are binarised to 1 (this class) / 0 (any other class).
for model_df, model_res in zip(dfs, per_model_res):  # For each model
    actual = model_df.actual.to_numpy()
    pred = model_df.pred.to_numpy()

    # NOTE: this is the overall multi-class accuracy of the model, so the same value is stored
    # under every class. It does not depend on the class and is therefore computed once per model.
    overall_accuracy = accuracy_score(actual, pred)

    for class_idx, cur_class in enumerate(class_names):  # For each class
        y_true = (actual == class_idx).astype(int)
        y_pred = (pred == class_idx).astype(int)

        # With the negative label (0) as pos_label, recall becomes the specificity: TN / (TN + FP).
        model_res[cur_class]['specificity'] = recall_score(y_true, y_pred, pos_label=0)

        # With the negative label (0) as pos_label, precision becomes the NPV: TN / (TN + FN).
        model_res[cur_class]['NPV'] = precision_score(y_true, y_pred, pos_label=0)

        model_res[cur_class]['accuracy'] = overall_accuracy

In [10]:
# Inspect the first model's report again, now including specificity, NPV and accuracy per class.
per_model_res[0]

{'acne': {'precision': 0.7407407407407407,
  'recall': 0.40404040404040403,
  'f1-score': 0.5228758169934641,
  'support': 99,
  'specificity': 0.9671361502347418,
  'NPV': 0.8747346072186837,
  'accuracy': 0.40190476190476193},
 'actinic_keratosis': {'precision': 0.7916666666666666,
  'recall': 0.2087912087912088,
  'f1-score': 0.33043478260869563,
  'support': 91,
  'specificity': 0.988479262672811,
  'NPV': 0.8562874251497006,
  'accuracy': 0.40190476190476193},
 'psoriasis_no_pustular': {'precision': 0.3514644351464435,
  'recall': 0.865979381443299,
  'f1-score': 0.5,
  'support': 97,
  'specificity': 0.6378504672897196,
  'NPV': 0.9545454545454546,
  'accuracy': 0.40190476190476193},
 'seborrheic_dermatitis': {'precision': 0.5287356321839081,
  'recall': 0.5897435897435898,
  'f1-score': 0.5575757575757575,
  'support': 78,
  'specificity': 0.9082774049217002,
  'NPV': 0.9269406392694064,
  'accuracy': 0.40190476190476193},
 'vitiligo': {'precision': 0.0,
  'recall': 0.0,
  'f1-s

# Print confusion matrices

The classes are:  
acne,  
actinic_keratosis,  
psoriasis_no_pustular,  
seborrheic_dermatitis,  
vitiligo,  
wart  


In [65]:
# Rows are the actual classes, columns the predicted classes. `labels` pins both axes to the
# class order listed above, so the matrix stays 6x6 even if a class is absent for this model.
confusion_matrix(dfs[0].actual, dfs[0].pred, labels=[0, 1, 2, 3, 4, 5])

array([[40,  1, 30, 12,  0, 16],
       [ 5, 19, 42, 19,  0,  6],
       [ 0,  1, 84,  5,  0,  7],
       [ 4,  0, 18, 46,  0, 10],
       [ 3,  0, 27,  3,  0, 41],
       [ 2,  3, 38,  2, 19, 22]])

In [66]:
# Rows are the actual classes, columns the predicted classes. `labels` pins both axes to the
# class order listed above, so the matrix stays 6x6 even if a class is absent for this model.
confusion_matrix(dfs[1].actual, dfs[1].pred, labels=[0, 1, 2, 3, 4, 5])

array([[42, 10,  9, 28,  1,  9],
       [ 4, 40,  9, 30,  5,  3],
       [ 5,  8, 40, 27,  8,  9],
       [ 5,  7,  8, 56,  0,  2],
       [ 6,  3, 13,  4,  3, 45],
       [ 4,  2, 10,  5, 51, 14]])

In [67]:
# Rows are the actual classes, columns the predicted classes. `labels` pins both axes to the
# class order listed above, so the matrix stays 6x6 even if a class is absent for this model.
confusion_matrix(dfs[2].actual, dfs[2].pred, labels=[0, 1, 2, 3, 4, 5])

array([[22,  4, 65,  7,  0,  1],
       [ 2, 19, 50, 19,  0,  1],
       [ 0,  5, 82,  5,  4,  1],
       [ 1,  2, 35, 38,  0,  2],
       [ 0,  2, 38,  2,  2, 30],
       [ 0,  4, 40,  5, 27, 10]])

In [68]:
# Rows are the actual classes, columns the predicted classes. `labels` pins both axes to the
# class order listed above, so the matrix stays 6x6 even if a class is absent for this model.
confusion_matrix(dfs[3].actual, dfs[3].pred, labels=[0, 1, 2, 3, 4, 5])

array([[61,  4, 26,  1,  1,  6],
       [ 4, 16, 58,  0,  4,  9],
       [ 4,  0, 84,  1,  3,  5],
       [ 9,  0, 57,  4,  2,  6],
       [ 5,  1, 26,  1,  4, 37],
       [ 1,  2, 15,  2, 60,  6]])

In [70]:
# Rows are the actual classes, columns the predicted classes. `labels` pins both axes to the
# class order listed above, so the matrix stays 6x6 even if a class is absent for this model.
confusion_matrix(dfs[4].actual, dfs[4].pred, labels=[0, 1, 2, 3, 4, 5])

array([[47,  1, 41,  1,  1,  8],
       [ 3, 12, 69,  3,  1,  3],
       [ 3,  1, 87,  0,  1,  5],
       [ 3,  2, 53, 14,  0,  6],
       [ 2,  1, 35,  0,  3, 33],
       [ 4,  4, 37,  0, 32,  9]])

# Compute the mean and std of each metric for each class across the 5 models

In [71]:
# Build a nested dict class -> metric -> numpy array holding that score for each of the
# 5 models (entry k of an array belongs to model k).
metrics = ['f1-score', 'precision', 'recall', 'specificity', 'NPV', 'accuracy']
class_scores = {
    c: {
        m: np.array([per_model_res[k][c][m] for k in range(5)], dtype=float)
        for m in metrics
    }
    for c in class_names
}

In [72]:
# Summarise the per-model scores of every class and metric as a (mean, std) tuple.
avg_std_res = {
    c: {m: (np.mean(class_scores[c][m]), np.std(class_scores[c][m])) for m in metrics}
    for c in class_names
}

In [73]:
# Print one block per class: the class name, then every metric followed by its (mean, std) tuple.
for class_name in class_names:
    print(class_name)
    for metric in metrics:
        print(f'{metric} {avg_std_res[class_name][metric]}')
    print('\n')

acne
f1-score (0.5274646068210955, 0.10267909351083032)
precision (0.7482718738847771, 0.07812292207490344)
recall (0.42828282828282827, 0.1269034244407225)
specificity (0.9629107981220658, 0.017778494111513145)
NPV (0.8796145570220861, 0.021928143049660523)
accuracy (0.3527619047619048, 0.029370415645556407)


actinic_keratosis
f1-score (0.32430585192869194, 0.09429718291429502)
precision (0.6315907522429262, 0.09771824168800115)
recall (0.23296703296703297, 0.10708507881388864)
specificity (0.9686635944700461, 0.021097738511077616)
NPV (0.8581623654147658, 0.015474009337586555)
accuracy (0.3527619047619048, 0.029370415645556407)


psoriasis_no_pustular
f1-score (0.4422280620385311, 0.03514969438153134)
precision (0.330278915102766, 0.06751824649907774)
recall (0.777319587628866, 0.18321533931153042)
specificity (0.6032710280373832, 0.15707747742953415)
NPV (0.9309179867587936, 0.031966701116977005)
accuracy (0.3527619047619048, 0.029370415645556407)


seborrheic_dermatitis
f1-score (