In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from   pathlib import Path, PurePath
from   scipy.ndimage import gaussian_filter1d as g1d
import seaborn as sns

from   dl_rad_age.evaluation import create_results_folder_name, evaluate_run, filter_runs, get_model_predictions, get_run_metrics, get_run_result, get_runs

In [None]:
# Locations of the result tables read by this notebook (relative to the notebook folder).
RESULTS_TEST_SET_CSV = '../results/test_set/ensemble_results_test_set.csv'  # deep-learning ensemble results
RESULTS_KH_MALE_CSV = '../results/kh_ae_male.csv'                           # Kellinghaus table used for sex == 0.0
RESULTS_KH_FEMALE_CSV = '../results/kh_ae_female.csv'                       # Kellinghaus table used for sex == 1.0

### Analysis

In [None]:
# Read the three result tables (deep learning, Kellinghaus male, Kellinghaus female) from disk
df_results_dl, df_results_kh_male, df_results_kh_female = (
    pd.read_csv(csv_path)
    for csv_path in (RESULTS_TEST_SET_CSV, RESULTS_KH_MALE_CSV, RESULTS_KH_FEMALE_CSV)
)

In [None]:
# Prepare data for analysis: pull the per-sample columns of the deep-learning results into arrays
y_true       = df_results_dl['y_true'].to_numpy()
abs_error_dl = df_results_dl['error'].to_numpy()
sex          = df_results_dl['sex'].to_numpy()
uncertainty  = df_results_dl['uncertainty'].to_numpy()
image_files  = df_results_dl['image_file'].to_numpy()
# Every column from position 6 onwards is used as a prediction column.
# NOTE(review): presumably one column per ensemble member -- confirm against the CSV layout.
model_preds  = df_results_dl.iloc[:,6:].to_numpy()
# Per-sample standard deviation across those prediction columns
sd_preds     = np.std(model_preds, axis=1)

# Split every array by sex (0.0 -> "_m" arrays, 1.0 -> "_f" arrays); masks are computed once
is_male   = sex==0.0
is_female = sex==1.0

y_true_m       = y_true[is_male]
y_true_f       = y_true[is_female]
abs_error_dl_m = abs_error_dl[is_male]
abs_error_dl_f = abs_error_dl[is_female]
uncertainty_m  = uncertainty[is_male]
uncertainty_f  = uncertainty[is_female]
image_files_m  = image_files[is_male]
image_files_f  = image_files[is_female]
sd_preds_m     = sd_preds[is_male]
sd_preds_f     = sd_preds[is_female]


def _kh_abs_error(df_kh, age, atol=1e-6):
    """Return the 'ae' value of the first row of ``df_kh`` whose 'age' matches ``age``.

    Ages are compared with a small absolute tolerance instead of ``==`` so that a
    last-bit difference between the rounded age and the value parsed from the CSV
    cannot make the lookup miss.

    Args:
        df_kh: Table with an 'age' and an 'ae' column.
        age:   Age to look up (already rounded by the caller).
        atol:  Absolute tolerance of the age comparison.

    Returns:
        The 'ae' entry of the first matching row.

    Raises:
        ValueError: If no row of ``df_kh`` matches ``age``.
    """
    hits = np.flatnonzero(np.isclose(df_kh['age'].to_numpy(), age, rtol=0.0, atol=atol))
    if hits.size == 0:
        raise ValueError('No Kellinghaus table entry for age <{}>.'.format(age))
    # ``hits`` holds row positions, not index labels, hence positional access
    return df_kh['ae'].iloc[hits[0]]


abs_error_kh_m = []
abs_error_kh_f = []

for age_, sex_ in zip(y_true, sex):
    # Round true age to match Kellinghaus table
    age_ = np.round(age_, decimals=2)

    if sex_==0.0:
        abs_error_kh_m.append(_kh_abs_error(df_results_kh_male, age_))
    elif sex_==1.0:
        abs_error_kh_f.append(_kh_abs_error(df_results_kh_female, age_))
    else:
        raise ValueError('Bad sex value <{}>.'.format(sex_))

abs_error_kh_m = np.asarray(abs_error_kh_m)
abs_error_kh_f = np.asarray(abs_error_kh_f)
# Male entries first, then female: this array is NOT in the row order of y_true,
# so it is only suitable for aggregate statistics.
abs_error_kh   = np.concatenate([abs_error_kh_m,abs_error_kh_f])

# Sanity check: number of male / female / total Kellinghaus errors
abs_error_kh_m.shape, abs_error_kh_f.shape, abs_error_kh.shape

### Global performance

In [None]:
# Summary statistics of the absolute errors, grouped by metric.
# Suffixes: _dl = deep learning, _kh = Kellinghaus, _male / _female = per-sex subsets.

# Mean absolute error
mae_dl        = abs_error_dl.mean()
mae_dl_male   = abs_error_dl_m.mean()
mae_dl_female = abs_error_dl_f.mean()
mae_kh        = abs_error_kh.mean()
mae_kh_male   = abs_error_kh_m.mean()
mae_kh_female = abs_error_kh_f.mean()

# Standard deviation of the absolute error
mae_sd_dl        = abs_error_dl.std()
mae_sd_dl_male   = abs_error_dl_m.std()
mae_sd_dl_female = abs_error_dl_f.std()
mae_sd_kh        = abs_error_kh.std()
mae_sd_kh_male   = abs_error_kh_m.std()
mae_sd_kh_female = abs_error_kh_f.std()

# Largest absolute error
max_ae_dl        = abs_error_dl.max()
max_ae_dl_male   = abs_error_dl_m.max()
max_ae_dl_female = abs_error_dl_f.max()
max_ae_kh        = abs_error_kh.max()
max_ae_kh_male   = abs_error_kh_m.max()
max_ae_kh_female = abs_error_kh_f.max()

# 90th percentile of the absolute error
p90_ae_dl        = np.percentile(abs_error_dl, 90.0)
p90_ae_dl_male   = np.percentile(abs_error_dl_m, 90)
p90_ae_dl_female = np.percentile(abs_error_dl_f, 90)
p90_ae_kh        = np.percentile(abs_error_kh, 90.0)
p90_ae_kh_male   = np.percentile(abs_error_kh_m, 90)
p90_ae_kh_female = np.percentile(abs_error_kh_f, 90)

# Mean of the per-sample prediction standard deviation (deep learning only)
mean_sd_dl        = sd_preds.mean()
mean_sd_dl_male   = sd_preds_m.mean()
mean_sd_dl_female = sd_preds_f.mean()

In [None]:
# Each entry is (template, values); the report is printed line by line in this order.
global_report = [
    ('AgeNet (Deep Learning):', ()),
    ('\tMAE = {:.2f} +/- {:.2f}', (mae_dl, mae_sd_dl)),
    ('\t\tmale   = {:.2f} +/- {:.2f}', (mae_dl_male, mae_sd_dl_male)),
    ('\t\tfemale = {:.2f} +/- {:.2f}', (mae_dl_female, mae_sd_dl_female)),
    ('\tmax error = {:.2f}', (max_ae_dl,)),
    ('\t\tmale   = {:.2f}', (max_ae_dl_male,)),
    ('\t\tfemale = {:.2f}', (max_ae_dl_female,)),
    ('\tp90 error = {:.2f}', (p90_ae_dl,)),
    ('\t\tmale   = {:.2f}', (p90_ae_dl_male,)),
    ('\t\tfemale = {:.2f}', (p90_ae_dl_female,)),
    ('\tSD = {:.2f}', (mean_sd_dl,)),
    ('\t\tmale   = {:.2f}', (mean_sd_dl_male,)),
    ('\t\tfemale = {:.2f}', (mean_sd_dl_female,)),
    # The leading newline separates the two methods in the output
    ('\nStandard method (Kellinghaus):', ()),
    ('\tMAE = {:.2f} +/- {:.2f}', (mae_kh, mae_sd_kh)),
    ('\t\tmale   = {:.2f} +/- {:.2f}', (mae_kh_male, mae_sd_kh_male)),
    ('\t\tfemale = {:.2f} +/- {:.2f}', (mae_kh_female, mae_sd_kh_female)),
    ('\tmax error = {:.2f}', (max_ae_kh,)),
    ('\t\tmale   = {:.2f}', (max_ae_kh_male,)),
    ('\t\tfemale = {:.2f}', (max_ae_kh_female,)),
    ('\tp90 error = {:.2f}', (p90_ae_kh,)),
    ('\t\tmale   = {:.2f}', (p90_ae_kh_male,)),
    ('\t\tfemale = {:.2f}', (p90_ae_kh_female,)),
]

for template, values in global_report:
    print(template.format(*values))

### Best / worst predictions

#### Deep learning

In [None]:
# Position of the smallest / largest deep-learning absolute error within each sex subset
id_best_dl_pred_m  = abs_error_dl_m.argmin()
id_best_dl_pred_f  = abs_error_dl_f.argmin()
id_worst_dl_pred_m = abs_error_dl_m.argmax()
id_worst_dl_pred_f = abs_error_dl_f.argmax()

# (title, error template, age template, female position, male position);
# best cases are reported with three decimals, worst cases with two.
dl_cases = (
    ('Best prediction',
     '\t\tAbs error = {:.3f} +/- {:.3f}', '\t\tTrue age = {:.3f}',
     id_best_dl_pred_f, id_best_dl_pred_m),
    ('Worst prediction',
     '\t\tAbs error = {:.2f} +/- {:.2f}', '\t\tTrue age = {:.2f}',
     id_worst_dl_pred_f, id_worst_dl_pred_m),
)

for title, err_template, age_template, pos_f, pos_m in dl_cases:
    print(title)
    print('\tFemale')
    print(err_template.format(abs_error_dl_f[pos_f], sd_preds_f[pos_f]))
    print(age_template.format(y_true_f[pos_f]))
    print('\t\tImage file = {:s}'.format(image_files_f[pos_f]))
    print('\tMale')
    print(err_template.format(abs_error_dl_m[pos_m], sd_preds_m[pos_m]))
    print(age_template.format(y_true_m[pos_m]))
    print('\t\tImage file = {:s}'.format(image_files_m[pos_m]))

#### Standard method

In [None]:
# Position of the smallest / largest Kellinghaus absolute error within each sex subset
id_best_kh_pred_m  = abs_error_kh_m.argmin()
id_best_kh_pred_f  = abs_error_kh_f.argmin()
id_worst_kh_pred_m = abs_error_kh_m.argmax()
id_worst_kh_pred_f = abs_error_kh_f.argmax()

# (title, error template, age template, female position, male position);
# best cases are reported with three decimals, worst cases with two.
kh_cases = (
    ('Best prediction',
     '\t\tAbs error = {:.3f}', '\t\tTrue age = {:.3f}',
     id_best_kh_pred_f, id_best_kh_pred_m),
    ('Worst prediction',
     '\t\tAbs error = {:.2f}', '\t\tTrue age = {:.2f}',
     id_worst_kh_pred_f, id_worst_kh_pred_m),
)

for title, err_template, age_template, pos_f, pos_m in kh_cases:
    print(title)
    print('\tFemale')
    print(err_template.format(abs_error_kh_f[pos_f]))
    print(age_template.format(y_true_f[pos_f]))
    print('\t\tImage file = {:s}'.format(image_files_f[pos_f]))
    print('\tMale')
    print(err_template.format(abs_error_kh_m[pos_m]))
    print(age_template.format(y_true_m[pos_m]))
    print('\t\tImage file = {:s}'.format(image_files_m[pos_m]))

### Binned performance

In [None]:
# Bin edges for the per-age analysis: 16 evenly spaced edges from 15 to 30 (step 1),
# i.e. 15 consecutive intervals. Shown as the cell output.
bins = np.linspace(start=15, stop=30, num=16)
bins

In [None]:
def analysis(y_true, abs_error_dl, abs_error_kh, bins, dec=2):
    """Summarise the absolute errors of both methods per age bin.

    Samples are assigned to the half-open intervals ``[bins[i], bins[i+1])``
    according to ``y_true``. Samples outside ``[bins[0], bins[-1])`` are not
    part of any interval and are ignored.

    Args:
        y_true:       True ages, one per sample.
        abs_error_dl: Absolute errors of the deep-learning method, aligned with ``y_true``.
        abs_error_kh: Absolute errors of the Kellinghaus method, aligned with ``y_true``.
        bins:         Monotonically increasing bin edges.
        dec:          Number of decimals the statistics are rounded to.

    Returns:
        A DataFrame with one row per interval and the columns
        'age', 'mae_dl', 'max_dl', 'p90_dl', 'mae_kh', 'max_kh', 'p90_kh'.
        Labels and statistics are stacked into a single array before the frame
        is built, so every cell is a string. Intervals without samples hold 'nan'.
    """
    y_true       = np.asarray(y_true)
    abs_error_dl = np.asarray(abs_error_dl)
    abs_error_kh = np.asarray(abs_error_kh)

    # Bin the ages that were passed in (not a notebook-level array), so that the
    # function gives correct results for any subset it is called with.
    bin_inds = np.digitize(y_true, bins, right=False)

    # With right=False, digitize labels the interval [bins[i-1], bins[i]) with i.
    # Iterating over the fixed labels 1..len(bins)-1 keeps the statistics aligned
    # with the interval labels below even if a bin is empty or samples fall
    # outside the bin range.
    masks = [bin_inds==i for i in range(1, len(bins))]

    # Bin deep learning results
    ens_ae_binned = [abs_error_dl[mask] for mask in masks]

    # Bin Kellinghaus results
    kh_ae_binned = [abs_error_kh[mask] for mask in masks]

    def _rounded(stat, values):
        # Empty bins have no statistics; report NaN instead of failing
        if values.size == 0:
            return np.nan
        return np.round(stat(values), decimals=dec)

    def _p90(values):
        return np.percentile(values, q=90)

    # Mean AE
    mean_ae_dl = [_rounded(np.mean, x) for x in ens_ae_binned]
    mean_ae_kh = [_rounded(np.mean, x) for x in kh_ae_binned]

    # Max AE
    max_ae_dl = [_rounded(np.max, x) for x in ens_ae_binned]
    max_ae_kh = [_rounded(np.max, x) for x in kh_ae_binned]

    # p90 AE
    p90_ae_dl = [_rounded(_p90, x) for x in ens_ae_binned]
    p90_ae_kh = [_rounded(_p90, x) for x in kh_ae_binned]

    # Build dataframe
    age_column      = ['{:.1f}-{:.1f}'.format(float(bins[x]), float(bins[x+1])) for x in range(len(bins)-1)]
    columns_per_age = ['age', 'mae_dl', 'max_dl', 'p90_dl', 'mae_kh', 'max_kh', 'p90_kh']
    data_per_age    = np.array([age_column, mean_ae_dl, max_ae_dl, p90_ae_dl, mean_ae_kh, max_ae_kh, p90_ae_kh]).swapaxes(0,1)

    df_results_per_age = pd.DataFrame(data=data_per_age, columns=columns_per_age)

    return df_results_per_age

In [None]:
# Per-age-bin summary for the male subset: computed, written to CSV, then shown as cell output
df_results_per_age_m = analysis(
    y_true=y_true_m,
    abs_error_dl=abs_error_dl_m,
    abs_error_kh=abs_error_kh_m,
    bins=bins,
)
df_results_per_age_m.to_csv('../results/test_set/results_per_age_m.csv', index=False)
df_results_per_age_m

In [None]:
# Per-age-bin summary for the female subset: computed, written to CSV, then shown as cell output
df_results_per_age_f = analysis(
    y_true=y_true_f,
    abs_error_dl=abs_error_dl_f,
    abs_error_kh=abs_error_kh_f,
    bins=bins,
)
df_results_per_age_f.to_csv('../results/test_set/results_per_age_f.csv', index=False)
df_results_per_age_f

### Outlier Performance

In [None]:
# Fraction of samples to set aside: the uncertainty percentile at (1 - abstention_rate) * 100
# is used as the cut-off, and only samples strictly below it are kept for the analysis.
abstention_rate = 0.5

In [None]:
# Identify samples with low uncertainty: the cut-off is the uncertainty percentile
# that corresponds to the chosen abstention rate
ref_unc_val = np.percentile(uncertainty, q=(1-abstention_rate)*100)

# Keep only the samples strictly below the cut-off (mask computed once, applied to every array)
low_unc_mask = uncertainty < ref_unc_val
y_true_lu, abs_error_dl_lu, sex_lu, uncertainty_lu = (
    values[low_unc_mask] for values in (y_true, abs_error_dl, sex, uncertainty)
)

# Split the retained samples by sex (0.0 -> "_m" arrays, 1.0 -> "_f" arrays)
male_lu_mask   = sex_lu == 0.0
female_lu_mask = sex_lu == 1.0

y_true_m_lu, abs_error_dl_m_lu, uncertainty_m_lu = (
    values[male_lu_mask] for values in (y_true_lu, abs_error_dl_lu, uncertainty_lu)
)
y_true_f_lu, abs_error_dl_f_lu, uncertainty_f_lu = (
    values[female_lu_mask] for values in (y_true_lu, abs_error_dl_lu, uncertainty_lu)
)

In [None]:
# Deep-learning error statistics restricted to the low-uncertainty samples, grouped by metric.

# Mean absolute error
mae_dl_lu        = abs_error_dl_lu.mean()
mae_dl_male_lu   = abs_error_dl_m_lu.mean()
mae_dl_female_lu = abs_error_dl_f_lu.mean()

# Standard deviation of the absolute error
mae_sd_dl_lu        = abs_error_dl_lu.std()
mae_sd_dl_male_lu   = abs_error_dl_m_lu.std()
mae_sd_dl_female_lu = abs_error_dl_f_lu.std()

# Largest absolute error
max_ae_dl_lu        = abs_error_dl_lu.max()
max_ae_dl_male_lu   = abs_error_dl_m_lu.max()
max_ae_dl_female_lu = abs_error_dl_f_lu.max()

# 90th percentile of the absolute error
p90_ae_dl_lu        = np.percentile(abs_error_dl_lu, 90.0)
p90_ae_dl_male_lu   = np.percentile(abs_error_dl_m_lu, 90)
p90_ae_dl_female_lu = np.percentile(abs_error_dl_f_lu, 90)

In [None]:
# Each entry is (template, values); the report is printed line by line in this order.
low_unc_report = [
    ('Deep Learning:', ()),
    ('\tMAE = {:.2f} +/- {:.2f}', (mae_dl_lu, mae_sd_dl_lu)),
    ('\t\tmale   = {:.2f} +/- {:.2f}', (mae_dl_male_lu, mae_sd_dl_male_lu)),
    ('\t\tfemale = {:.2f} +/- {:.2f}', (mae_dl_female_lu, mae_sd_dl_female_lu)),
    ('\tmax error = {:.2f}', (max_ae_dl_lu,)),
    ('\t\tmale   = {:.2f}', (max_ae_dl_male_lu,)),
    ('\t\tfemale = {:.2f}', (max_ae_dl_female_lu,)),
    ('\tp90 error = {:.2f}', (p90_ae_dl_lu,)),
    ('\t\tmale   = {:.2f}', (p90_ae_dl_male_lu,)),
    ('\t\tfemale = {:.2f}', (p90_ae_dl_female_lu,)),
]

for template, values in low_unc_report:
    print(template.format(*values))