# NIH Chest XRay Overlaps

**In this notebook, I show how to use RAPIDS to find duplicate images between the [RANZCR CLiP competition](https://www.kaggle.com/c/ranzcr-clip-catheter-line-classification) training dataset and the [NIH Chest XRays dataset](https://www.kaggle.com/nih-chest-xrays/data).** 

**We simply take a ResNet200D pretrained on ImageNet and create CNN embeddings for the RANZCR train dataset and for the NIH Chest XRay Dataset.**

**The NIH embeddings are then used to fit a RAPIDS cuML NearestNeighbors model, which is queried with the RANZCR CLiP training embeddings so that we can manually inspect pairs of images whose embeddings are similar between the two datasets.**

**The motivation for this notebook comes from [@chrisdeotte](https://www.kaggle.com/cdeotte)'s excellent [kernel](https://www.kaggle.com/cdeotte/rapids-cuml-knn-find-duplicates) from the [Melanoma competition](https://www.kaggle.com/c/siim-isic-melanoma-classification). Please upvote it before upvoting this one (if you decide to). Much of the code is repurposed from [@yasufuminakama](https://www.kaggle.com/yasufuminakama) and his incredible [notebooks](https://www.kaggle.com/yasufuminakama/ranzcr-resnet200d-3-stage-training-step1). Check out his work as well, as he is a very talented coder.** 

**Note that we can do the same procedure to find potential duplicates between the public test images and the NIH XRay dataset. I may do this in a future commit of the notebook.**

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import os
import math
import time
import random
import shutil

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

from matplotlib import pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from albumentations import (
    Compose, Normalize, Resize
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

from torch.cuda.amp import autocast, GradScaler

import warnings 
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Notebook-wide settings, read everywhere as plain class attributes."""

    # --- general ---
    debug = False
    seed = 416                      # handed to seed_torch()
    load_embed = True               # True: load saved .npy embeddings; False: recompute them

    # --- data loading ---
    num_workers = 4                 # DataLoader worker processes
    batch_size = 128

    # --- model / input ---
    model_name = 'resnet200d_320'   # timm model identifier
    size = 512                      # images are resized to size x size

    # --- RANZCR label columns ---
    target_size = 11                # equals len(target_cols)
    target_cols = [
        'ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal',
        'NGT - Abnormal', 'NGT - Borderline', 'NGT - Incompletely Imaged', 'NGT - Normal',
        'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal',
        'Swan Ganz Catheter Present',
    ]

    # --- cross-validation settings (not referenced by the cells shown here) ---
    n_fold = 5
    trn_fold = [0]                  # [0, 1, 2, 3, 4]

def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every visible CUDA device), and asks cuDNN to use deterministic
    kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Recorded for child processes; the hash seed of the already-running
    # interpreter is not changed by setting this variable.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one (safe to call without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may pick different kernels from run to run; pin it off.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, up front, with the seed configured in CFG.
seed_torch(seed=CFG.seed)

# Data Loading

In [None]:
# Directory for files written by this notebook.
OUTPUT_DIR = './'
# exist_ok avoids the separate "exists?" check and the race that comes with it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# RANZCR CLiP competition data: image folders and the training label table.
TRAIN_PATH = '../input/ranzcr-clip-catheter-line-classification/train'
TEST_PATH = '../input/ranzcr-clip-catheter-line-classification/test'
train = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train.csv')

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
%%time

# The NIH images are spread over 12 folders: images_001 ... images_012.
NIH_XRAYS_DIRS = [f'../input/data/images_0{str(i).zfill(2)}' for i in range(1, 13)]

# Collect the path of every image, folder by folder.
# NOTE(review): os.listdir() returns entries in arbitrary order and the order
# is deliberately left untouched here -- NIH_XRAYS is later indexed with
# neighbour positions derived from the NIH embeddings, so any saved embeddings
# presumably rely on this exact ordering. Confirm before adding a sort.
NIH_XRAYS = []
for imdir in NIH_XRAYS_DIRS:
    image_dir = imdir + '/images'
    # List each folder once instead of twice per directory.
    impaths = [f'{image_dir}/{fname}' for fname in os.listdir(image_dir)]
    NIH_XRAYS.append(impaths)

# Flatten the per-folder lists into a single 1-D array of path strings.
NIH_XRAYS = np.concatenate(NIH_XRAYS)
print(NIH_XRAYS.shape)

# Datasets

In [None]:
# ====================================================
# Dataset
# ====================================================
class NIHDataset(Dataset):
    """Dataset that yields NIH chest X-ray images read from full file paths.

    Args:
        df: DataFrame whose 'StudyInstanceUID' column holds the path of each
            image file (the column name mirrors the RANZCR table; here the
            values are used directly as paths).
        transform: Optional callable invoked as ``transform(image=image)``
            that returns a mapping with the result under the 'image' key.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['StudyInstanceUID'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image at position ``idx``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        file_path = self.file_names[idx]
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image
    
class TrainDataset(Dataset):
    """Dataset that yields RANZCR training images (images only, no labels).

    Args:
        df: DataFrame with a 'StudyInstanceUID' column (file name without
            extension, resolved against TRAIN_PATH) and the CFG.target_cols
            label columns.
        transform: Optional callable invoked as ``transform(image=image)``
            that returns a mapping with the result under the 'image' key.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['StudyInstanceUID'].values
        # Kept as an attribute for callers that want the labels; __getitem__
        # itself only returns the image.
        self.labels = df[CFG.target_cols].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image at position ``idx``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        file_name = self.file_names[idx]
        file_path = f'{TRAIN_PATH}/{file_name}.jpg'
        image = cv2.imread(file_path)
        # cv2.imread signals failure by returning None rather than raising.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV loads BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        # The label tensor that used to be built here was never returned,
        # so it is no longer constructed for every item.
        return image

# Transforms

In [None]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    """Build the preprocessing pipeline for the requested split.

    Args:
        data: Split name (keyword-only). Only 'valid' has a pipeline.

    Returns:
        For 'valid', a Compose that resizes to CFG.size x CFG.size,
        normalises the channels and converts to a tensor. For any other
        value, None.
    """
    if data != 'valid':
        # No pipeline is defined for other splits.
        return None

    # Per-channel statistics (these match the commonly used ImageNet values).
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        Resize(CFG.size, CFG.size),
        Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return Compose(steps)

In [None]:
# One-column frame of NIH image paths. The column is named 'StudyInstanceUID'
# because that is the column NIHDataset reads its file paths from.
nih_df = pd.DataFrame(data=NIH_XRAYS, columns=['StudyInstanceUID'])

# Find Duplicates with RAPIDS

In [None]:
import cuml

In [None]:
class FeatureExtractor(nn.Module):
    """timm backbone that returns pooled embeddings and the backbone output.

    Args:
        model_name: Name passed to ``timm.create_model``.
        pretrained: Whether timm should load pretrained weights.
    """

    def __init__(self, model_name='resnet200d_320', pretrained=False):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        n_features = self.model.fc.in_features
        # Replace the backbone's own pooling and classifier with identities --
        # presumably so self.model(x) yields the un-pooled feature map
        # (depends on timm's forward implementation; confirm).
        self.model.global_pool = nn.Identity()
        self.model.fc = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        # Not used by forward(); kept so the module's attributes and
        # state_dict keys stay the same as before.
        self.fc = nn.Linear(n_features, CFG.target_size)

    def forward(self, x):
        """Return ``(pooled_features, features)`` for a batch ``x``.

        ``features`` is the output of ``self.model``; ``pooled_features`` is
        that output average-pooled to 1x1 and flattened to one row per
        sample. The classification head is intentionally not evaluated: its
        result was never returned.
        """
        bs = x.size(0)
        features = self.model(x)
        pooled_features = self.pooling(features).view(bs, -1)
        return pooled_features, features
    
def extract(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect the pooled embeddings.

    The model is put into evaluation mode for the duration of the call so
    that layers such as BatchNorm and Dropout behave deterministically and
    BatchNorm running statistics are not updated by the data being embedded.
    The model's previous train/eval mode is restored afterwards.

    Args:
        model: Module whose forward returns ``(pooled_features, features)``.
        test_loader: Iterable of image batches (tensors) with a length.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A NumPy array with the pooled features of all batches concatenated
        along the first axis, in loader order.
    """
    was_training = model.training
    model.eval()
    feats = []
    try:
        with torch.no_grad():
            for images in tqdm(test_loader, total=len(test_loader)):
                # Only the pooled embedding is needed; drop the feature map.
                pooled_feats, _ = model(images.to(device))
                feats.append(pooled_feats.to('cpu').numpy())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return np.concatenate(feats)

In [None]:
if not CFG.load_embed:
    # Embeddings are computed from scratch, so both image sets need a loader.
    # shuffle=False keeps the batches in row order, which lets embedding rows
    # be matched back to `train` / `nih_df` rows by position.
    train_dataset = TrainDataset(train, transform=get_transforms(data='valid'))
    nih_dataset = NIHDataset(nih_df, transform=get_transforms(data='valid'))

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
    )
    nih_loader = DataLoader(
        nih_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
    )

In [None]:
if not CFG.load_embed:
    # Pretrained backbone, moved to the selected device.
    extractor = FeatureExtractor(CFG.model_name, pretrained=True).to(device)

    # RANZCR train embeddings first, then NIH; each is saved as float32
    # (np.save appends the .npy extension).
    embed_train = extract(extractor, train_loader, device)
    np.save('embed_train', embed_train.astype('float32'))

    embed_nih = extract(extractor, nih_loader, device)
    np.save('embed_nih', embed_nih.astype('float32'))

In [None]:
# Reuse embeddings saved earlier instead of recomputing them.
# NOTE(review): rows of embed_nih are later matched to NIH_XRAYS by position,
# so the saved file presumably was produced with the same file ordering -- confirm.
if CFG.load_embed:
    embed_train = np.load('../input/ranzcrnihoverlap/embed_train.npy')
    embed_nih = np.load('../input/ranzcrnihoverlap/embed_nih_full.npy')

In [None]:
# Sanity check: shapes of the RANZCR-train and NIH embedding arrays.
print(embed_train.shape, embed_nih.shape)

In [None]:
# Number of nearest NIH neighbours retrieved for each RANZCR training image.
KNN = 3
model = cuml.neighbors.NearestNeighbors(n_neighbors=KNN)
# The index is built from the NIH embeddings only ...
model.fit(embed_nih)
# ... and queried with the RANZCR train embeddings: row k of `distances` /
# `indices` belongs to train image k, and indices[k] is assumed to hold row
# positions into embed_nih (it is used below to index NIH_XRAYS).
distances, indices = model.kneighbors(embed_train)
# Smallest of the KNN neighbour distances for every training image.
mm = np.min(distances,axis=1)

In [None]:
# Distance below which a train image and its nearest NIH neighbour are
# treated as potential duplicates.
CUTOFF = 5
# Row positions (into `train`) of the images under the cutoff.
idx = np.where(mm < CUTOFF)[0]
print(f'There are {len(idx)} potential duplicate images that have distance < {CUTOFF}')

In [None]:
# Show at most 16 candidate pairs: the RANZCR training image on the left and
# the NIH image referenced by indices[k, 0] on the right.
for k in idx[:16]:
    # k is a row position, so look it up positionally rather than by label.
    ranzcr_name = train['StudyInstanceUID'].iloc[k] + '.jpg'
    nih_path = NIH_XRAYS[indices[k, 0]]

    plt.figure(figsize=(10,5))

    plt.subplot(1,2,1)
    img = cv2.imread(TRAIN_PATH + '/' + ranzcr_name)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.title(ranzcr_name, fontsize=8)
    plt.imshow(img)

    plt.subplot(1,2,2)
    img = cv2.imread(nih_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Drop the first three path components to keep the title short.
    plt.title('/'.join(nih_path.split('/')[3:]), fontsize=8)
    plt.imshow(img)

    plt.show()

In [None]:
# Check borderline cases: distances in [CUTOFF, CUTOFF + 1).
# The lower bound is inclusive so that a distance exactly equal to CUTOFF is
# not dropped from both the "duplicate" (< CUTOFF) and "borderline" groups.
borderline_idx = np.where((mm >= CUTOFF) & (mm < CUTOFF + 1))[0]
print(f'There are {len(borderline_idx)} borderline cases that have distance between {CUTOFF} and {CUTOFF+1}')

In [None]:
# Check borderline cases: distances in [CUTOFF, CUTOFF + 1).
# The lower bound is inclusive so that a distance exactly equal to CUTOFF is
# not dropped from both the "duplicate" (< CUTOFF) and "borderline" groups.
borderline_idx = np.where((mm >= CUTOFF) & (mm < CUTOFF + 1))[0]

# Show at most 16 borderline pairs: the RANZCR training image on the left and
# the NIH image referenced by indices[k, 0] on the right.
for k in borderline_idx[:16]:
    # k is a row position, so look it up positionally rather than by label.
    ranzcr_name = train['StudyInstanceUID'].iloc[k] + '.jpg'
    nih_path = NIH_XRAYS[indices[k, 0]]

    plt.figure(figsize=(10,5))

    plt.subplot(1,2,1)
    img = cv2.imread(TRAIN_PATH + '/' + ranzcr_name)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.title(ranzcr_name, fontsize=8)
    plt.imshow(img)

    plt.subplot(1,2,2)
    img = cv2.imread(nih_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Drop the first three path components to keep the title short.
    plt.title('/'.join(nih_path.split('/')[3:]), fontsize=8)
    plt.imshow(img)

    plt.show()

In [None]:
# Pair every flagged RANZCR image with the NIH image in column 0 of its
# neighbour list. `idx` and `indices` hold row positions, so both lookups are
# positional (.iloc) rather than label-based.
# 'ranzcr_path' holds the StudyInstanceUID; 'nih_path' holds the NIH file path.
dupes = pd.DataFrame({
    'ranzcr_path': train['StudyInstanceUID'].iloc[idx].values,
    'nih_path': nih_df['StudyInstanceUID'].iloc[indices[idx, 0]].values,
})
dupes.head()

In [None]:
# Persist the candidate duplicate pairs (without the DataFrame index column).
dupes.to_csv('duplicated_paths.csv', index=False)

**This approach appears to be less accurate than I had hoped, most likely because the images in the NIH XRay dataset are cropped differently from our training images. Please comment below if you spot a mistake (I rushed the creation of this kernel).**