# Pseudolabelling NIH ChestX-ray8 Dataset

I have put together a rough notebook showing my approach to pseudolabelling the NIH chest X-rays (CXRs).

**Steps:**
1. Download the NIH data
2. Remove the images that are in the RANZCR dataset
3. Run inference on the remaining images
4. Round the results based on a predetermined confidence level
5. Add labelled images, particularly of scarce classes, to training set

If you found this notebook useful, feel free to upvote :) 

In [None]:
import os
import pandas as pd
import urllib.request
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import cv2
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import timm
from albumentations import *
from albumentations.pytorch import ToTensorV2
# Imported explicitly (and after the star import) so `np` is always NumPy.
import numpy as np
device = torch.device('cuda')

# Download the NIH CXRs

These are the locations where NIH hosts the image archives. Kaggle's working storage can't hold all of them at once, so you'll have to find a way around that. Mine was to use Google Colab.

In [None]:
links = [
    'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz',
#   'https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz',
#   'https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz',
#   'https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz',
#   'https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz',
#   'https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz',
#   'https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz',
#   'https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz',
#   'https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz',
#   'https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz',
#   'https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz',
#   'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz'
]

for idx, link in enumerate(links):
    fn = '/kaggle/working/images_%02d.tar.gz' % (idx)
    print('downloading'+fn+'...')
    urllib.request.urlretrieve(link, fn)
    
for i in range(len(links)):
    gzip_file_path = f'/kaggle/working/images_0{i}.tar.gz'
    !gunzip $gzip_file_path
    tar_file_path = f'/kaggle/working/images_0{i}.tar'
    !tar -xf $tar_file_path

In [None]:
def filenames_to(array, root_dir="/kaggle/working/images/"):
    """Append the path of every file found under ``root_dir`` to ``array``.

    The tree is walked bottom-up (``topdown=False``) and each path is built
    by joining the containing directory with the file name.

    Args:
        array: Mutable sequence with an ``append`` method; it is filled in
            place.
        root_dir: Directory to scan. Defaults to the folder that used to be
            hard-coded here, so existing one-argument calls are unchanged.

    Returns:
        None. Results are delivered by mutating ``array``.
    """
    for root, _dirs, files in os.walk(root_dir, topdown=False):
        for name in files:
            array.append(os.path.join(root, name))

In [None]:
# Gather the path of every extracted image, then report how many were found.
files_arr = []
filenames_to(files_arr)
image_count = len(files_arr)
print(image_count)

# Remove duplicates

[@mohamed3abdelrazik](https://www.kaggle.com/mohamed3abdelrazik) demonstrated that you could use the imagehash library to locate duplicates in the RANZCR training set and the NIH CXRs. I've used his .csv file to locate and remove the duplicates for this notebook.

In [None]:
# Duplicate listing; its first column is matched against the image paths
# collected in files_arr.
nih = pd.read_csv('../input/duplicates/duplicates.csv')

# A set makes each membership test O(1) instead of rescanning the whole list
# for every CSV row.
remaining_paths = set(files_arr)

for duplicate in nih.iloc[:, 0]:
    if duplicate not in remaining_paths:
        continue
    # Delete via os.remove rather than interpolating the path into a shell
    # command, so unusual characters in a path cannot change what is run.
    os.remove(duplicate)
    # Forget the path so a repeated CSV entry does not try to delete it twice.
    remaining_paths.discard(duplicate)
    print(f'Removing {duplicate}')

# Re-scan the image folder so the list reflects the deletions.
clean_files_arr = []
filenames_to(clean_files_arr)

# Inference on NIH CXRs

Most of this is taken from @underwearfitting's excellent notebook [here](https://www.kaggle.com/underwearfitting/resnet200d-public-benchmark-2xtta-lb0-965). 

You can of course vary the number of models you want to ensemble with; the combination of these five gave LB 0.965.

In [None]:
# Inference settings.
batch_size = 1
image_size = 512

# One checkpoint per fold; the number after "cv" is part of each file name.
_weights_dir = '../input/resnet200d-baseline-benchmark-public'
_fold_cv_tags = (953, 955, 955, 957, 954)
model_path = [
    f'{_weights_dir}/resnet200d_fold{fold}_cv{tag}.pth'
    for fold, tag in enumerate(_fold_cv_tags)
]

# Architecture name for each checkpoint, index-aligned with model_path.
enet_type = ['resnet200d' for _ in model_path]

# Names of the label columns, in the order used throughout the notebook.
label_cols = [
    'ETT - Abnormal',
    'ETT - Borderline',
    'ETT - Normal',
    'NGT - Abnormal',
    'NGT - Borderline',
    'NGT - Incompletely Imaged',
    'NGT - Normal',
    'CVC - Abnormal',
    'CVC - Borderline',
    'CVC - Normal',
    'Swan Ganz Catheter Present',
]

In [None]:
class RANZCRResNet200D(nn.Module):
    """timm backbone with its pooling and classifier replaced by identities,
    followed by adaptive average pooling and a linear head.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        out_dim: Number of logits produced by the linear head.
        pretrained: Forwarded to ``timm.create_model``. Defaults to ``False``,
            which matches the value that was previously hard-coded.
    """

    def __init__(self, model_name='resnet200d', out_dim=11, pretrained=False):
        super().__init__()
        # Honour the caller's flag; it used to be accepted but ignored.
        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Read the classifier width before the classifier is swapped out.
        n_features = self.model.fc.in_features
        # Neutralise the backbone's own pooling and classifier so this class
        # can apply its own pooling and head to the backbone output.
        self.model.global_pool = nn.Identity()
        self.model.fc = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(n_features, out_dim)

    def forward(self, x):
        """Return raw logits of shape ``(x.size(0), out_dim)`` for batch ``x``."""
        bs = x.size(0)
        features = self.model(x)
        # Pool each feature map to 1x1 and flatten to (batch, n_features).
        pooled_features = self.pooling(features).view(bs, -1)
        return self.fc(pooled_features)

In [None]:
# Test-time preprocessing: resize to image_size x image_size, normalise each
# channel with the mean/std below, then convert to a tensor.
_norm_mean = [0.485, 0.456, 0.406]
_norm_std = [0.229, 0.224, 0.225]
_test_steps = [
    Resize(image_size, image_size),
    Normalize(mean=_norm_mean, std=_norm_std),
    ToTensorV2(),
]
transforms_test = Compose(_test_steps)

In [None]:
class RANZCRDataset(Dataset):
    """Dataset that loads the images listed in a DataFrame.

    Args:
        df: DataFrame with a ``file_path`` column and every column named in
            ``label_cols``.
        mode: ``'test'`` makes ``__getitem__`` return only the image; any
            other value makes it return ``(image, label)``.
        transform: Optional callable invoked as ``transform(image=img)``; the
            processed image is read from the ``'image'`` key of its result.
    """

    def __init__(self, df, mode, transform=None):
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.transform = transform
        self.labels = df[label_cols].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Load, crop and transform the image at ``index``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        row = self.df.loc[index]
        img = cv2.imread(row.file_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports failure by returning None rather than
            # raising, so surface the offending path explicitly.
            raise FileNotFoundError(f'Could not read image: {row.file_path}')

        mask = img > 0
        # Drop rows and columns that are entirely black. An all-black image
        # is left untouched, because cropping it would produce an empty array.
        if mask.any():
            img = img[np.ix_(mask.any(1), mask.any(0))]
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)

        if self.transform is not None:
            img = self.transform(image=img)['image']

        if self.mode == 'test':
            return img
        # Labels are only converted when they are actually returned.
        label = torch.tensor(self.labels[index]).float()
        return img, label

In [None]:
def inference_func(test_loader):
    """Run the module-level ``model`` over ``test_loader`` and return probabilities.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Args:
        test_loader: Iterable yielding image batches that support
            ``.to(device)``.

    Returns:
        NumPy array holding the sigmoid of the model outputs for every batch,
        concatenated in loader order.
    """
    model.eval()
    preds = []

    with torch.no_grad():
        for images in tqdm(test_loader):
            logits = model(images.to(device))
            # Move each batch to the CPU straight away so results do not
            # accumulate on the accelerator.
            preds.append(logits.sigmoid().cpu())
    return torch.cat(preds).numpy()

# Create Dataset and Dataloader

I've used a very janky for loop to fix up the sample_submission.csv for our purposes. Feel free to suggest something better!

In [None]:
# Only the column layout is taken from the competition's sample submission.
sample_submission = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/sample_submission.csv')

# Build one all-zero row per image in a fresh frame. Starting from an empty
# frame (instead of overwriting the sample rows one by one) guarantees that no
# placeholder rows from the sample file survive when it has more rows than
# there are images, and avoids chained assignment.
test = pd.DataFrame(0, index=range(len(clean_files_arr)), columns=sample_submission.columns)
test['StudyInstanceUID'] = clean_files_arr

# os.path.join returns its second argument unchanged when that argument is
# already an absolute path, so absolute entries pass through as they are.
test['file_path'] = test.StudyInstanceUID.apply(lambda x: os.path.join('/kaggle/working/images/', f'{x}'))

test.to_csv(r'./nih_pseudolabels_test.csv', index=False)
test_dataset = RANZCRDataset(test, 'test', transform=transforms_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False,  num_workers=24)

In [None]:
# Predict with every checkpoint in turn, then average the per-model outputs.
test_preds = []
for i, arch in enumerate(enet_type):
    if arch == 'resnet200d':
        print('resnet200d loaded')
        model = RANZCRResNet200D(arch, out_dim=len(label_cols)).to(device)
    state_dict = torch.load(model_path[i], map_location='cuda:0')
    model.load_state_dict(state_dict)
    test_preds.append(inference_func(test_loader))

# Mean over the model axis, written back into the label columns.
ensemble_mean = np.mean(test_preds, axis=0)
submission = pd.read_csv('./nih_pseudolabels_test.csv')
submission[label_cols] = ensemble_mean
submission.to_csv('./nih_pseudolabels_mean.csv', index=False)

# Turn the predictions into labels

You can decide for yourself how confident the model must be before it returns a positive label.

In [None]:
# Load the averaged predictions from ./nih_pseudolabels_mean.csv.
pseudolabel_means = pd.read_csv('./nih_pseudolabels_mean.csv')

# Threshold used by roundAbove to decide between 0 and 1.
confidence_level = 0.7


def roundAbove(x):
    """Return 0 when ``x`` is below ``confidence_level``, otherwise 1."""
    return 0 if x < confidence_level else 1
    
# Binarise each label column in place, then save and preview the result.
for col in label_cols:
    pseudolabel_means[col] = pseudolabel_means[col].map(roundAbove)

pseudolabel_means.to_csv('./nih_pseudolabels.csv', index=False)
pseudolabel_means.head()

# Display some examples

Now we could pass these images with the new pseudolabels as training data to our single model. Let's have a bit of a look first to eyeball the accuracy.

In [None]:
pseudolabels = pd.read_csv('./nih_pseudolabels.csv')
results_dataset = RANZCRDataset(pseudolabels, 'none', transform=transforms_test)

# Preview at most the first 100 samples; the cap avoids indexing past the end
# of a dataset that has fewer rows.
for sample_idx in range(min(100, len(results_dataset))):
    image, label = results_dataset[sample_idx]
    np_label = label.numpy()
    ones = np.argwhere(np_label == 1)

    # Names of every label marked positive for this sample.
    labels = [label_cols[hit[0]] for hit in ones]
    if not labels:
        continue

    # Draw each image once, titled with its complete label list, instead of
    # once per positive label with a partially built title.
    plt.imshow(image[0], cmap="gray")
    plt.title(f'label: {labels}')
    plt.show()