In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from numpy import argmax
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import seaborn as sn
import tensorflow as tf
import os
import glob
import re
from matplotlib.font_manager import FontProperties


In [None]:
# Class names used for titles and legends. The plotting helpers pair
# labels[i] with column i of the `true` / `preds` frames, so the order
# here must match the column order of those frames.
labels = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Effusion',
    'Emphysema',
    'Fibrosis',
    'Hernia',
    'Infiltration',
    'Mass',
    'Nodule',
    'Plrl_Thickening',
    'Pneumonia',
    'Pneumothorax',
    'none',
]

def plot_matices(true, preds, df_graph):
    """Plot one confusion matrix per class at its Youden-optimal threshold.

    For every class the ROC curve is computed and the threshold that
    maximises ``tpr - fpr`` (Youden's J statistic) is used to binarise the
    predicted scores before the confusion matrix is built. All matrices are
    drawn as heatmaps on a single 5x5 subplot grid.

    Parameters
    ----------
    true : pandas.DataFrame
        True labels, one column per class; column ``i`` belongs to
        ``labels[i]``.
    preds : pandas.DataFrame
        Predicted scores with the same column layout as ``true``.
    df_graph : pandas.DataFrame
        Only ``df_graph.columns[0]`` is read; it becomes the figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    matrices = []
    for col in range(len(labels)):
        y_true = true.iloc[:, col]
        y_score = preds.iloc[:, col]
        fpr, tpr, thresholds = roc_curve(y_true, y_score)
        best_thresh = thresholds[argmax(tpr - fpr)]
        # roc_curve counts a sample as positive when score >= threshold, so
        # the comparison has to be inclusive; a strict ">" drops the samples
        # sitting exactly on the threshold and yields a different operating
        # point than the one that was selected.
        y_pred = np.where(y_score >= best_thresh, 1, 0)
        matrices.append(confusion_matrix(y_true=y_true, y_pred=y_pred))

    plt.figure(figsize=(24, 20))
    plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9,
                        wspace=0.4, hspace=0.5)
    plt.suptitle(str(df_graph.columns[0]), fontsize=30)

    for pos, (name, matrix) in enumerate(zip(labels, matrices), start=1):
        plt.subplot(5, 5, pos)
        # Larger font scale for the title and heatmap annotations ...
        sn.set(font_scale=1.2)
        plt.title(str(name), y=1.0)
        sn.heatmap(pd.DataFrame(matrix), annot=True,
                   annot_kws={"size": 15}, fmt='g')
        # ... and a smaller one for the axis labels.
        sn.set(font_scale=0.8)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    plt.show()
#plot_matices(true, preds, df_graph)                      # needs true, preds and df_graph as returned by load(), which is defined further below

##### You will notice many classifications in the false-positive (FP) square at the upper right of each matrix. This is acceptable, because the doctor can double-check a positive diagnosis. Our model is tuned so that the false negatives (FN), at the bottom left of the confusion matrix, are minimized. A large number of FN would be worse, because a missed finding is never flagged and therefore will not be checked by the doctor.

### AUC Test

Verify that the AUC scores are the same as in the CF-Table above.

In [None]:
def get_roc_curve(labels, preds, true, df_graph):
    """Draw the ROC curve of every class on one figure and return the AUCs.

    Parameters
    ----------
    labels : sequence of str
        Class names; one curve is attempted per entry, and entry ``i`` is
        paired with column ``i`` of ``true`` / ``preds``.
    preds : pandas.DataFrame
        Predicted scores, one column per class.
    true : pandas.DataFrame
        True labels, one column per class.
    df_graph : pandas.DataFrame
        Only ``df_graph.columns[0]`` is read; it becomes the plot title.

    Returns
    -------
    list of float
        ROC AUC of every class that could be scored, in class order.
        Classes whose score could not be computed are reported on stdout
        and skipped, so the list can be shorter than ``labels``.
    """
    auc_roc_vals = []

    # Figure set-up and the chance diagonal are loop-invariant: do them once.
    plt.figure(1, figsize=(10, 10))
    plt.plot([0, 1], [0, 1], 'k--')

    for i, label in enumerate(labels):
        try:
            y_true = true.iloc[:, i]
            y_score = preds.iloc[:, i]
            auc_roc = roc_auc_score(y_true, y_score)
            auc_roc_vals.append(auc_roc)
            fpr, tpr, _ = roc_curve(y_true, y_score)
        except Exception as err:
            # Best effort per class, but never hide KeyboardInterrupt /
            # SystemExit and always show what actually went wrong.
            print(
                f"Error in generating ROC curve for {label}. "
                f"Dataset lacks enough examples. "
                f"({type(err).__name__}: {err})"
            )
            continue
        plt.plot(fpr, tpr, label=label + " (" + str(round(auc_roc, 3)) + ")")

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('{}'.format(df_graph.columns[0]), fontsize=30)
    plt.legend(loc='best')
    plt.show()
    return auc_roc_vals

#get_roc_curve(labels, preds, true, df_graph)

### Read all CSVs in a results folder and find the highest-scoring epoch

In [None]:
def _read_true_preds(csv_file):
    """Read one result CSV and split it into the (true, preds) frames."""
    data = pd.read_csv(csv_file, sep=',', na_values=".")
    # Column 0 is skipped; columns 1-15 are taken as the true labels and
    # columns 16-30 as the predicted scores.
    return data.iloc[:, 1:16], data.iloc[:, 16:31]


def load(path):
    """Score every per-epoch result CSV in ``path`` and pick the best epoch.

    Every ``*.csv`` in ``path`` whose file name does not contain ``"final"``
    is read, and the per-class ROC AUC (rounded to 4 decimals, times 100) is
    computed. The best epoch is the one with the highest AUC sum over all
    classes.

    Parameters
    ----------
    path : str
        Directory holding the per-epoch result CSVs.

    Returns
    -------
    true : pandas.DataFrame
        True-label columns of the best epoch's CSV.
    preds : pandas.DataFrame
        Prediction columns of the best epoch's CSV.
    epochs : list of str
        Epoch ids in the order the files were read, with the first ``"0"``
        entry (if any) removed.
    df_graph : pandas.DataFrame
        Single-column frame with the best epoch's AUCs rounded to one
        decimal; the column is named after the last component of ``path``.
    df : pandas.DataFrame
        AUC table with one row per class and one column per epoch read
        (including epoch ``"0"``).

    Raises
    ------
    ValueError
        If ``path`` contains no per-epoch result CSV.
    """
    # Test only the file name, so a directory whose name happens to contain
    # "final" does not cause every file in it to be skipped.
    csv_files = [
        f for f in glob.glob(os.path.join(path, "*.csv"))
        if "final" not in os.path.basename(f)
    ]
    if not csv_files:
        raise ValueError(f"No per-epoch result CSVs found in {path!r}")

    epochs = []
    results = []
    for f in csv_files:
        true, preds = _read_true_preds(f)
        # NOTE(review): the epoch id is the single character 9 positions from
        # the end of the file path, so an epoch number with two or more
        # digits would be truncated - confirm against the file-name pattern.
        epochs.append(f[-9])
        auc_roc_vals = [
            round(roc_auc_score(true.iloc[:, i], preds.iloc[:, i]), 4)
            for i in range(true.shape[1])
        ]
        results.append(np.multiply(auc_roc_vals, 100))

    # Rows = classes, columns = epochs (in read order).
    df = pd.DataFrame(np.transpose(results), columns=epochs)

    # Select by position rather than by label so the choice stays unambiguous
    # even if two files map to the same epoch id.
    best_pos = int(np.argmax(df.sum(axis=0).to_numpy()))
    df_graph = df.iloc[:, [best_pos]].round(1)
    # Name the column after the folder itself instead of slicing the path at
    # a fixed offset, which only worked for one specific parent directory.
    df_graph.columns = [os.path.basename(os.path.normpath(path))]

    # `true` / `preds` currently belong to the file read last; the caller
    # plots them under the best epoch's title, so hand back the best epoch.
    if best_pos != len(csv_files) - 1:
        true, preds = _read_true_preds(csv_files[best_pos])

    if "0" in epochs:
        epochs.remove("0")
    return true, preds, epochs, df_graph, df

#true, preds, epochs, df_graph, df = load("../input/last-data-set-nn-xray/results/googlenet_adam_steplr_2")

### Epoch-Graphs

In [None]:
#Sort columns
def atoi(text):
    """Return ``int(text)`` for an all-digit string, else ``text`` unchanged."""
    if not text.isdigit():
        return text
    return int(text)

def natural_keys(text):
    """Build a sort key that orders embedded numbers numerically.

    The string is split into alternating non-digit and digit runs, and each
    digit run is converted to ``int`` through ``atoi``. Use it as
    ``alist.sort(key=natural_keys)`` to sort in human order.

    Based on http://nedbatchelder.com/blog/200712/human_sorting.html
    (Toothy's implementation in the comments).
    """
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))

#Plot Epoch AUCs
def plot(df, epochs, df_graph):
    """Plot the per-class AUC over the epochs as one line per class.

    Parameters
    ----------
    df : pandas.DataFrame
        AUC table with one row per class and one column per epoch.
    epochs : list of str
        Epoch column names to plot; they are ordered with ``natural_keys``.
        The caller's list is left untouched.
    df_graph : pandas.DataFrame
        Only ``df_graph.columns[0]`` is read; it becomes the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # sorted() instead of list.sort() so the argument is not mutated.
    ordered_epochs = sorted(epochs, key=natural_keys)
    per_epoch = df[ordered_epochs].T

    plt.figure(figsize=(15, 10))
    plt.plot(per_epoch, label=labels, marker='s', linewidth=1, ms=5)
    plt.title('{}'.format(df_graph.columns[0]))
    plt.xlabel('Epochs')
    plt.ylabel('AUC')
    # One call only: a second plt.legend() replaces the first legend, which
    # used to discard the fontsize setting.
    plt.legend(loc='best', fontsize='large')
    plt.show()

In [None]:
# Every sub-folder of `d` is handed to load() as one result set.
d = "../input/last-data-set-nn-xray/results"

# os.listdir returns entries in arbitrary order, so sort for reproducible
# output; plain files are skipped because load() expects a directory.
paths = [
    os.path.join(d, entry)
    for entry in sorted(os.listdir(d))
    if os.path.isdir(os.path.join(d, entry))
]

# Plot the AUC-per-epoch curves of every result set and keep its best epoch.
best_epoch = []
for result_dir in paths:
    true, preds, epochs, df_graph, df = load(result_dir)
    plot(df, epochs, df_graph)
    best_epoch.append(df_graph)

# One column per result set, ordered by ascending AUC sum, with the class
# names prepended as the "Pathology" column.
appended_data = pd.concat(best_epoch, axis=1)
s = appended_data.sum()
appended_data = appended_data[s.sort_values(ascending=True).index]
appended_data = pd.concat(
    [pd.DataFrame(labels), appended_data.reset_index(drop=True)], axis=1
).rename(columns={0: "Pathology"})

### Table

In [None]:
#Plot Table

# Render `appended_data` as a table figure and write it to results_table.png.
fig, ax = plt.subplots()
ax.axis('off')
ax.axis('tight')
t = ax.table(
    cellText=appended_data.values,
    colWidths=None,
    colLabels=appended_data.columns.str.wrap(13),
    loc='center',
    cellLoc='left',
)
t.auto_set_font_size(False)
t.scale(4.5, 4.5)  # may help

# Bold fonts for the header row (row 0) and the first column.
for (row, col), cell in t.get_celld().items():
    if row == 0 or col == 0:
        cell.set_text_props(fontproperties=FontProperties(weight='bold'))

# Set the size after the bold pass so it applies to every cell, including
# the ones that just received fresh FontProperties.
t.set_fontsize(14)
fig.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown it is released, and
# a later plt.savefig() would write an empty canvas. bbox_inches='tight'
# keeps the scaled table, which extends past the axes, inside the image.
fig.savefig('results_table.png', bbox_inches='tight')
plt.show()

### TP/FP Matrices and ROC Curves

In [None]:
# For every result folder: reload its data, then draw the per-class
# confusion matrices followed by the ROC curves.
for result_dir in paths:
    true, preds, epochs, df_graph, df = load(result_dir)
    plot_matices(true=true, preds=preds, df_graph=df_graph)
    get_roc_curve(labels=labels, preds=preds, true=true, df_graph=df_graph)