In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os,sys
sys.path.insert(0,"..")
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
import tqdm
import sklearn, sklearn.metrics

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torchxrayvision as xrv

In [4]:
print(xrv.datasets.NIH_Google_Dataset.__doc__)

A relabelling of a subset of images from the NIH dataset.  The data tables should
    be applied against an NIH download.  A test and validation split are provided in the
    original.  They are combined here, but one or the other can be used by providing
    the original csv to the csvpath argument.

    Chest Radiograph Interpretation with Deep Learning Models: Assessment with
    Radiologist-adjudicated Reference Standards and Population-adjusted Evaluation
    Anna Majkowska, Sid Mittal, David F. Steiner, Joshua J. Reicher, Scott Mayer
    McKinney, Gavin E. Duggan, Krish Eswaran, Po-Hsuan Cameron Chen, Yun Liu,
    Sreenivasa Raju Kalidindi, Alexander Ding, Greg S. Corrado, Daniel Tse, and
    Shravya Shetty. Radiology 2020

    https://pubs.rsna.org/doi/10.1148/radiol.2019191293

    NIH data can be downloaded here:
    https://academictorrents.com/details/e615d3aebce373f1dc8bd9d11064da55bdadede0
    


In [17]:
# Use XRV transforms to crop and resize the images
transforms = torchvision.transforms.Compose([xrv.datasets.XRayCenterCrop(),
                                             xrv.datasets.XRayResizer(224)])

# Load Google dataset and PyTorch dataloader
dataset = xrv.datasets.NIH_Google_Dataset(imgpath="/Users/ieee8023/Datasets/NIH/images-224",
                                          transform=transforms)

dataset = xrv.datasets.CheX_Dataset(transform=transforms)

# Load pre-trained model and erase classifier
model = xrv.models.DenseNet(weights="densenet121-res224-chex")
xrv.datasets.relabel_dataset(model.pathologies, dataset)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

model.op_threshs = None # prevent pre-trained model calibration
model.classifier = torch.nn.Linear(1024,1) # reinitialize classifier

optimizer = torch.optim.Adam(model.classifier.parameters()) # only train classifier
criterion = torch.nn.BCEWithLogitsLoss()



Setting XRayResizer engine to cv2 could increase performance.
No. of data points in csv 223414
{'Pleural Other', 'Support Devices'} will be dropped
 doesn't exist. Adding nans instead.
 doesn't exist. Adding nans instead.
 doesn't exist. Adding nans instead.
 doesn't exist. Adding nans instead.
 doesn't exist. Adding nans instead.
 doesn't exist. Adding nans instead.
 doesn't exist. Adding nans instead.


In [19]:
# training loop (can run on cpu)
for i, batch in enumerate(dataloader):
    if i > 20: break
    outputs = model(batch["img"])
    # print(outputs)
    print(batch["lab"])
    targets = batch["lab"][:, dataset.pathologies.index("Effusion"), None]
    print(targets)
    loss = criterion(outputs, targets)
    print(i, loss.detach().cpu().numpy())
    loss.backward()
    optimizer.step()

tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0., nan, nan, nan, 1., nan, nan, nan],
        [0., 0., nan, 0., 0., nan, nan, 0., 0., nan, 0., nan, nan, nan, 0., 0., 0., 0.],
        [0., 0., nan, 0., 0., nan, nan, 0., 0., nan, 0., nan, nan, nan, 0., 0., 0., 0.],
        [nan, nan, nan, 0., 0., nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0., nan, nan, nan, nan, 1., nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 1., nan],
        [0., 0., nan, 0., 0., nan, nan, 0., 0., nan, 0., nan, nan, nan, 0., 0., 0., 0.],
        [nan, 0., nan, nan, nan, nan, nan, 0., nan, nan, 1., nan, nan, nan, nan, nan, nan, 1.]],
       dtype=torch.float64)
tensor([[nan],
        [0.],
        [0.],
        [nan],
        [nan],
        [nan],
        [0.],
        [0.]], dtype=torch.float64)
0 nan
tensor([[nan, 0., nan, 0., 1., nan, nan, 1., 1., nan, nan, nan, nan, na

In [128]:
sample = dataset[0]

In [129]:
out = model(torch.from_numpy(sample["img"]).unsqueeze(0))
out = torch.sigmoid(out)

In [130]:
out

tensor([[0.3993]], grad_fn=<SigmoidBackward>)

In [131]:
labels = []
preds = []
with torch.inference_mode():
    for i in range(20):
        sample = dataset[i]
        label = sample["lab"][dataset.pathologies.index("Lung Opacity")]
        labels.append(label)
        pred = model(torch.from_numpy(sample["img"]).unsqueeze(0))
        pred = torch.sigmoid(pred).detach().numpy()[0][0]
        preds.append(pred)
        print(label, pred)

0.0 0.399319
1.0 0.4504822
1.0 0.46033397
0.0 0.33570313
0.0 0.39914772
0.0 0.4095046
0.0 0.42468312
0.0 0.39061117
1.0 0.43495628
1.0 0.38757616
0.0 0.45992658
0.0 0.42239773
1.0 0.47049743
0.0 0.37503645
0.0 0.39930966
0.0 0.42051744
0.0 0.37670723
0.0 0.3471031
0.0 0.3745227
1.0 0.4336192


In [132]:
sklearn.metrics.roc_auc_score(labels, preds)

0.8571428571428572