In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Fetch the project repository into ./nih_chest_x_ray; the `nih_chest_x_ray.modules`
# package imported in the following cells comes from this checkout.
!git clone https://github.com/PaulStryck/nih-chest-x-ray.git ./nih_chest_x_ray
# Update the checkout in case the directory already existed from an earlier run.
!git -C nih_chest_x_ray pull
# Pin the code to the tagged release 1.1 so the notebook does not silently follow new commits.
!git -C nih_chest_x_ray checkout tags/1.1

In [None]:
from nih_chest_x_ray.modules import net, trainer
import torch
from torchvision import transforms
from nih_chest_x_ray.modules import net, dataset
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from numpy import argmax
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import seaborn as sn

In [None]:
# Preprocessing applied to every image: resize, convert to a tensor, then
# normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet channel
# statistics — confirm they match what the model was trained with.
_preprocessing_steps = [
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_preprocessing_steps)

# Both the split helper and the dataset read from the same input directory.
data_root = "../input/data"

# Split helper from the project package.
# NOTE(review): `frac` is presumably the fraction of the data that is sampled
# and `folds`/`seed` control the split — verify against dataset.ChestXRayImages.
data_wrapper = dataset.ChestXRayImages(
    root=data_root,
    folds=5,
    frac=0.001,
    seed=2021,
)

# Test portion of the split, with the preprocessing above attached.
data_test = dataset.ChestXRayImageDataset(
    data_root,
    data_wrapper.data_test,
    transform=transform,
)

# Batched iteration over the test dataset (default loader order).
test_loader = torch.utils.data.DataLoader(data_test, batch_size=128)

### Load Model and Predict

In [None]:
from nih_chest_x_ray.modules import net, dataset
import torch

# Checkpoint with the trained weights (Kaggle input dataset path).
PATH = "../input/resnet-epoch1/model_weights_epoch_1.pth"

# Build the network; the argument is the number of entries in the dataset's
# label list (presumably the number of output classes — see net.get_model).
model = net.get_model(len(dataset.ChestXRayImageDataset.labels))

# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only kernel as well. The model is never moved to a GPU in this notebook,
# so the weights end up on the CPU either way.
model.load_state_dict(torch.load(PATH, map_location="cpu"))

# Switch to inference mode; as the last expression this also displays the model.
model.eval()

In [None]:
def get_all_preds(model, loader):
    """Run ``model`` on every batch of ``loader`` and stack the outputs.

    Args:
        model: Callable that maps a batch of images to a tensor of predictions.
        loader: Iterable yielding ``(images, labels)`` pairs; the labels are
            ignored here.

    Returns:
        A tensor with the per-batch outputs concatenated along dimension 0,
        in loader order. An empty tensor is returned when ``loader`` yields
        no batches.
    """
    batch_outputs = []
    # Pure inference: no autograd graph is needed, and not building one keeps
    # memory bounded even when the caller forgets to disable gradients.
    with torch.no_grad():
        for images, _labels in loader:
            batch_outputs.append(model(images))

    if not batch_outputs:
        return torch.tensor([])

    # Concatenate once at the end instead of re-copying the growing result on
    # every batch.
    return torch.cat(batch_outputs, dim=0)

### Comment-In to Train and Save Preds
To work with previously saved full-sample predictions instead of recomputing them, uncomment the "loading" cell below and comment out the "training" cell. You may need to adjust the file paths in the "loading" cell.

In [None]:
# Arrange the true labels in the same layout as the predictions: one row per
# test sample, taken from element [1] of each dataset item and cast to int64.
true_y = np.array(
    [np.array(sample[1]).astype('int64') for sample in data_test]
)
true_y.shape

In [None]:
# "training" step: despite the label, this only runs the loaded model over the
# test loader (with gradients disabled) to obtain its raw outputs.
with torch.no_grad():
    train_preds = get_all_preds(model, test_loader)
preds_np = train_preds.numpy()

# "saving" step: write the predictions and the matching true labels to CSV so
# they can be reloaded later without re-running the model.
preds_df, true_y_df = pd.DataFrame(preds_np), pd.DataFrame(true_y)
for frame, csv_name in ((preds_df, 'preds.csv'), (true_y_df, 'true_y.csv')):
    frame.to_csv(csv_name, index=False)

preds_np.shape

In [None]:
#loading
#data = pd.read_csv('../input/preds-conf-matrix/preds.csv', sep=',', na_values=".")
#train_preds = np.array(data)
#loading true
#data = pd.read_csv('../input/preds-conf-matrix/true_y.csv', sep=',', na_values=".")
#true_y = np.array(data)

In [None]:
# Squash the model outputs into (0, 1) with the logistic sigmoid, element-wise.
# NOTE(review): this assumes `train_preds` holds raw, un-activated scores — confirm
# against the model's final layer.
neg_exp = np.exp(-train_preds)
preds_sig = 1 / (neg_exp + 1)
preds_sig.shape

In [None]:
# Class names, in the column order used by `true_y` / `preds_sig`.
# NOTE(review): this order is hard-coded here — confirm it matches dataset labels.
labels = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
         'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration',
         'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia',
         'Pneumothorax', 'none']

# One 2x2 confusion matrix per class. The decision threshold of each class is
# the ROC operating point that maximises Youden's J statistic (TPR - FPR).
matrices = []
for class_idx in range(len(labels)):
    y_true = true_y[:, class_idx]
    y_score = preds_sig[:, class_idx]

    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    best_thresh = thresholds[argmax(tpr - fpr)]

    # roc_curve computes each (fpr, tpr) point for predictions `score >= threshold`,
    # so the comparison must be inclusive to reproduce the selected operating point.
    pred = np.where(y_score >= best_thresh, 1, 0)

    # labels=[0, 1] keeps the matrix 2x2 even when a rare class has no positive
    # sample in the evaluated subset (assumes 0/1 encoded ground truth).
    matrices.append(confusion_matrix(y_true=y_true, y_pred=pred, labels=[0, 1]))

plt.figure(figsize=(24, 20))
plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.4, hspace=0.5)

# One heatmap per class on a 5x5 grid (only the first len(labels) slots are used).
for position, (label, matrix) in enumerate(zip(labels, matrices), start=1):
    plt.subplot(5, 5, position)
    # Larger font scale for the title and the cell annotations ...
    sn.set(font_scale=1.2)
    plt.title(label, y=1.0)
    sn.heatmap(pd.DataFrame(matrix), annot=True, annot_kws={"size": 15}, fmt='g')
    # ... smaller one for the axis labels.
    sn.set(font_scale=0.8)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

plt.show()

##### You will notice many predictions in the False Positive (FP) square at the upper right of each confusion matrix. This is acceptable, because a doctor can double-check a positive diagnosis. Our model is tuned so that False Negatives (FN), shown in the bottom-left square, are minimized: a false negative is a missed finding, and a missed finding will not be checked by the doctor.

### AUC Test

In [None]:
# Collect the ground-truth labels (element [1] of every dataset item).
# `real` is created here rather than appended to, so the cell works on a fresh
# kernel and does not accumulate duplicates when it is re-run.
real = [sample[1] for sample in tqdm(data_test)]

# Show only the first few entries instead of dumping the whole list.
real[:5]

In [None]:
# Ground-truth labels of every test sample (element [1] of each dataset item).
# The loop only reads from the PyTorch dataset, so no TensorFlow device scope is
# needed around it.
real = [sample[1] for sample in tqdm(data_test)]

# Class names as defined by the dataset; used for the ROC legend below.
labels = data_test.labels

def get_roc_curve(labels, predicted_vals, liste, when = ''):
    """Plot one ROC curve per class and return the AUC values.

    Args:
        labels: Class names; ``labels[i]`` is used in the legend and in the
            error message for class ``i``.
        predicted_vals: 2-D array-like of scores, indexed as
            ``predicted_vals[:, i]`` for class ``i``.
        liste: Sequence whose ``i``-th entry holds the ground-truth values of
            class ``i`` for all samples.
        when: Optional suffix appended to the plot title.

    Returns:
        List of ROC-AUC values for the classes whose metric could be computed.
        Classes that raise ``ValueError`` are reported and skipped, so the list
        can be shorter than ``labels``.
    """
    auc_roc_vals = []
    curves = []
    for i, gt in enumerate(liste):
        try:
            pred = predicted_vals[:, i]
            auc_roc = roc_auc_score(gt, pred)
            auc_roc_vals.append(auc_roc)
            fpr_rf, tpr_rf, _ = roc_curve(gt, pred)
        except ValueError as err:
            # scikit-learn raises ValueError when the metric is undefined (for
            # example when only one class is present in the ground truth).
            # Other exceptions indicate real bugs and are no longer swallowed.
            print(
                f"Error in generating ROC curve for {labels[i]}. "
                f"Dataset lacks enough examples. ({err})"
            )
            continue
        curves.append(
            (fpr_rf, tpr_rf, labels[i] + " (" + str(round(auc_roc, 3)) + ")")
        )

    if curves:
        plt.figure(1, figsize=(10, 10))
        # Chance-level diagonal, drawn once instead of once per class.
        plt.plot([0, 1], [0, 1], 'k--')
        for fpr_rf, tpr_rf, legend_entry in curves:
            plt.plot(fpr_rf, tpr_rf, label=legend_entry)
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve ' + when)
        plt.legend(loc='best')
    plt.show()
    return auc_roc_vals

In [None]:
# Debug view of the ground truth, one line per class. The per-class columns are
# derived directly from `real` (transposing sample-wise label vectors), so this
# cell does not depend on `real_vals`, which is only built further down.
for class_idx, class_column in enumerate(zip(*real)):
    print(class_idx, list(class_column))

In [None]:
# Transpose the ground truth: `real` holds one label vector per sample, while
# the ROC helper needs one list per class. Despite the `pred*` names, these
# lists contain true labels, not predictions.
_label_columns = [[] for _ in range(15)]
for sample_labels in tqdm(real):
    for class_idx, column in enumerate(_label_columns):
        column.append(sample_labels[class_idx])

# Keep the original per-class names so the following cells keep working.
(pred0, pred1, pred2, pred3, pred4,
 pred5, pred6, pred7, pred8, pred9,
 pred10, pred11, pred12, pred13, pred14) = _label_columns

In [None]:
# Per-class ground-truth lists, in class-index order (entry i belongs to class i).
real_vals = [
    pred0, pred1, pred2, pred3, pred4,
    pred5, pred6, pred7, pred8, pred9,
    pred10, pred11, pred12, pred13, pred14,
]

In [None]:
# Plot the per-class ROC curves; the returned list of AUC values is displayed
# as the cell output.
get_roc_curve(
    labels,
    preds_sig,
    real_vals,
    when='',
)