### Analysis of ChestX-14 data and Convert to PyTorch ImageFolder directory structure

Note: For this to work, you would need to run this script after downloading & extracting the CheXpert-v1.0-small dataset. Ideally, there's no need to run this anymore - but if we need additional data for some reason (or to run new experiments), it might be easy to start with this.

In [38]:
chest_xray_df = pd.read_csv('CheXpert-v1.0-small/train.csv')

In [85]:
print(f"CancerDetected: {(chest_xray_df['Lung Lesion'] == 1).sum()}\n \
      Unreliable/Unsure(labelled 0): {(chest_xray_df['Lung Lesion'] == 0).sum()}\n \
      No Cancer(labelled -1): {(chest_xray_df['Lung Lesion'] == -1).sum()}")

CancerDetected: 9186
       Unreliable/Unsure(labelled 0): 1270
       No Cancer(labelled -1): 1488


### Move positive training instance images to train/malignant

In [77]:
cancer_images = chest_xray_df[chest_xray_df['Lung Lesion'] == 1]
cancer_images = cancer_images[['Path', 'Lung Lesion']]
image_path = cancer_images.iloc[i]['Path'].split('/')

'_'.join([x for x in image_path[-3:]])

'patient00019_study3_view1_frontal.jpg'

In [89]:
new_image_root_dir = 'data/train/malignant/'

for i in range(len(cancer_images)):
    image_path = cancer_images.iloc[i]['Path'].split('/')
    new_image_name = new_image_root_dir + '_'.join([x for x in image_path[-3:]])
    
    im = io.imread(cancer_images.iloc[i]['Path'])
    io.imsave(new_image_name, im)

### Move negative training instance images to train/benign

In [91]:
no_cancer_images = chest_xray_df[chest_xray_df['Lung Lesion'] == -1]
no_cancer_images = no_cancer_images[['Path', 'Lung Lesion']]
len(no_cancer_images)

1488

In [92]:
new_image_root_dir = 'data/train/benign/'

for i in range(len(no_cancer_images)):
    image_path = no_cancer_images.iloc[i]['Path'].split('/')
    new_image_name = new_image_root_dir + '_'.join([x for x in image_path[-3:]])
    
    im = io.imread(no_cancer_images.iloc[i]['Path'])
    io.imsave(new_image_name, im)

### Move "No finding" instances to train/benign

In [98]:
chest_xray_df = pd.read_csv('CheXpert-v1.0-small/train.csv')
print(f"NoFindings: {(chest_xray_df['No Finding'] == 1).sum()}\n")

NoFindings: 22381



In [100]:
# Get df 
no_findings_df = chest_xray_df[chest_xray_df['No Finding'] == 1]
no_findings_df = no_findings_df[['Path']]
len(no_findings_df)

22381

In [None]:
def move_image_in_specific_row_to_folder(no_findings_df, i, image_root_dir):
    image_path = no_findings_df.iloc[i]['Path'].split('/')
    new_image_name = image_root_dir + '_'.join([x for x in image_path[-3:]])
    
    im = io.imread(no_findings_df.iloc[i]['Path'])
    io.imsave(new_image_name, im)
    

i = 0


image_root_dir = 'data/train/benign/'
while i < 5000:
    # Move to train/benign
    move_image_in_specific_row_to_folder(no_findings_df, i, image_root_dir)
    i += 1
    
image_root_dir = 'data/val/benign/'
while i <6500:
    # Move to val/benign
    move_image_in_specific_row_to_folder(no_findings_df, i, image_root_dir)
    i += 1
    
image_root_dir = 'data/test/benign/'
while i < 8000:
    # Move to test/benign
    move_image_in_specific_row_to_folder(no_findings_df, i, image_root_dir)
    i += 1


### Repeat for images in original val folder (Note: Using this as our independent test folder as we have no access to the hosted independed test folder on the website)

In [93]:
chest_xray_df = pd.read_csv('CheXpert-v1.0-small/valid.csv')
print(f"CancerDetected: {(chest_xray_df['Lung Lesion'] == 1).sum()}\n \
      Unreliable/Unsure(labelled 0): {(chest_xray_df['Lung Lesion'] == 0).sum()}\n \
      No Cancer(labelled -1): {(chest_xray_df['Lung Lesion'] == -1).sum()}")

CancerDetected: 1
       Unreliable/Unsure(labelled 0): 233
       No Cancer(labelled -1): 0


#### Comments:

Sree:
Looks like there's just one positive image here. My initial plan was to use all +ve and -ve from here as our independent test set. But now we'll have to choose from one of these two options:
1. Split the data in train into test and val (that has ~9k cancer images. So we're looking at 6k,1.5k, 1.5k ish - which might work alright.
2. We switch to using the larger dataset :/

#### Tutorial on writing a custom dataset - (Ignore) (I decided it was easier to use ImageFolder from pytorch as we already have scripts that train using this; So the script above does the conversion of our data to the expected format for ImageFolder)

In [None]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import numpy as np
import matplotlib.pyplot as plt
from skimage import io, transform

# Interactive mode
plt.ion()

### Tutorial - Writing custom Dataset 

# Constants
faces_dir = 'faces/'

def show_landmarks(image, landmarks):
    """Show image with landmarks"""
    plt.imshow(image)
    plt.scatter(landmarks[:, 0], landmarks[:, 1], s=10, marker='.', c='r')
    plt.pause(0.001)  # pause a bit so that plots are updated

landmarks_df = pd.read_csv('faces/face_landmarks.csv')
landmarks_df.head(3)
image_names = landmarks_df['image_name']
landmarks = landmarks_df.iloc[:, 1:].as_matrix()
landmarks = landmarks.reshape(-1, 2)
print(landmarks.shape)
show_landmarks(plt.imread(faces_dir + image_names.iloc[0]), landmarks[:68]) # 68 feats per image

class FaceLandmarksDataset(Dataset):
    """
    Face Landmarks Dataset. 
    """
    
    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): path to csv file containing annotations
            root_dir (string): root directory containing all images
            transform (callable, optional): Optional transform to be applied on sample
        """
        self.landmarks_df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        
    def __len__(self, ):
        # return the size of your dataset
        return len(self.landmarks_df)

    def __getitem__(self, idx):
        # dataset[i] -> returns ith sample
        if torch.is_tensor(idx): 
            idx = idx.tolist()
            
        img_name = os.path.join(self.root_dir, self.landmarks_df.iloc[idx, 0])
        image = io.imread(img_name)
        
        landmarks = self.landmarks_df.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample
    
    
    

face_dataset = FaceLandmarksDataset(csv_file='faces/face_landmarks.csv', root_dir='faces/')

fig = plt.figure()

for i in range(len(face_dataset)):
    sample = face_dataset[i]    
    print(i, sample['image'].shape, sample['landmarks'].shape)

    ax = plt.subplot(1, 4, i+1)
    plt.tight_layout()
    
    ax.set_title(f'Sample {i}:')
    ax.axis('off')
#     print(**sample)
    show_landmarks(**sample)
    
    if i == 3:
        plt.show()
        break

class Rescale(object):
    """
    Rescale the image in a sample to a given size
    """
    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size
        
    def __call__(self, sample):
        image, landmarks = sample['image'], sample['landmarks']

        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)

        img = transform.resize(image, (new_h, new_w))

        # h and w are swapped for landmarks because for images,
        # x and y axes are axis 1 and 0 respectively
        landmarks = landmarks * [new_w / w, new_h / h]

        return {'image': img, 'landmarks': landmarks}
        