In [115]:
import torch
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [51]:
## Load the master dataset
# index_col=0 uses the first CSV column as the DataFrame index instead of a data column.
# The path is relative, so it resolves against the notebook's working directory.
master_df = pd.read_csv('data/master_df.csv', index_col=0)

## Pre-Processing

Pre-processing the master_df to only contain data that we have images available for in the S3 bucket.  

**Data manipulations performed:**
1. Patients who had Medicare or Medicaid in even one entry are assigned that insurance for all of their entries. Because insurance is only used here for its association with SDOH features, this is acceptable.  

2. Patients that had multiple ethnicities reported were purged so that we have 1 row for each patient/image

3. NaN outcomes were replaced with 0. Ambiguous outcomes (encoded as -1) were also replaced with 0. This is probably not the best way to go about it, but we can handle it later.

In [76]:
## Determine all the images we have access to
## Setup boto3
s3 = boto3.resource('s3')
bucket_name = "cs543-final-project"
image_path = "physionet.org/files/mimic-cxr-jpg/2.0.0/images/"
image_ext = ".jpg"

## Get list of files in bucket folder
# Each image id is the object key with the folder prefix and the file extension removed.
# The slice bounds are derived from `image_path` / `image_ext` rather than hard-coded
# offsets, so changing the prefix cannot silently corrupt the ids. Keys that are not
# .jpg files (e.g. a placeholder object for the folder itself) are skipped.
s3_bucket = s3.Bucket(bucket_name)
image_locs = [
    obj.key[len(image_path):-len(image_ext)]
    for obj in s3_bucket.objects.filter(Prefix=image_path)
    if obj.key.endswith(image_ext)
]


## Many rows are duplicated per image; differing insurance values are one key cause.
# Imputation strategy: if a patient ever used Medicare or Medicaid, every row for that
# patient is set to that payer.
# The passes run in order and each one re-reads the (already updated) frame, so a patient
# with both payers ends up all-Medicare: their Medicaid rows are overwritten before the
# Medicaid pass looks for them.
for payer in ('Medicare', 'Medicaid'):
    payer_subjects = master_df.loc[master_df['insurance'] == payer, 'subject_id']
    master_df.loc[master_df['subject_id'].isin(payer_subjects), 'insurance'] = payer

## Subset our dataset to only the images we have
# Steps: keep rows whose image exists in the bucket -> drop exact duplicate rows ->
# drop every dicom_id that still has more than one row -> renumber the index ->
# map -1 to 0 and fill NaN with 0 (applied to all columns, not only the outcomes).
master_df = (
    master_df
    .loc[master_df['dicom_id'].isin(image_locs)]
    .drop_duplicates()
    .loc[lambda frame: ~frame['dicom_id'].duplicated(keep=False)]
    .reset_index(drop=True)
    .replace(-1, 0)
    .fillna(0)
)

## Dataset Class and Dataloaders

We're creating the dataset class in this section.

In [316]:
from torchvision.io import read_image
import torchvision.transforms as transforms
from torchvision.transforms.functional import resize, pad
from PIL import Image

class MIMIC_CXR_Dataset(torch.utils.data.Dataset):
    """Dataset that pairs chest X-ray JPEGs on disk with their outcome labels.

    Items are normally addressed by ``dicom_id`` (the image file name without the
    ``.jpg`` extension). Integer positions into ``annotation_file`` are accepted as
    well, so the dataset also works with index-based samplers.

    Parameters
    ----------
    annotation_file : pandas.DataFrame
        Must contain a ``dicom_id`` column plus every column named in ``outcome_list``.
    outcome_list : list of str
        Names of the label columns returned for each image.
    image_prefix : str
        Directory holding ``<dicom_id>.jpg`` files.
    scaling_factor : int or float
        Down-scaling divisor applied to the image when ``transforms`` is None.
    tgrt_img_dims : sequence of two ints
        Target (height, width) the down-scaled image is padded to when
        ``transforms`` is None.
    transforms : callable or None
        If given, it is called with a 3-channel PIL image and its result is returned
        as the image; the built-in resize/pad path is skipped.
    """

    def __init__(self, annotation_file, outcome_list, image_prefix="data/images", scaling_factor=9, tgrt_img_dims=(512, 512), transforms=None):
        self.annotation_file = annotation_file
        self.image_prefix = image_prefix
        self.outcome_list = outcome_list
        self.scaling_factor = scaling_factor
        # Stored as a fresh list so instances never share one mutable default object
        self.tgrt_img_dims = list(tgrt_img_dims)
        self.transforms = transforms

    def __len__(self):
        """Number of rows in the annotation table."""
        return len(self.annotation_file)

    def _lookup(self, key):
        """Resolve ``key`` to ``(dicom_id, label)``.

        ``key`` is either a ``dicom_id`` value or an integer row position.
        ``label`` is a 2-D array with one row per matching annotation row and one
        column per entry of ``outcome_list``.

        Raises
        ------
        KeyError
            If a ``dicom_id`` key matches no annotation row.
        IndexError
            If an integer key is out of bounds (raised by ``DataFrame.iloc``).
        """
        if isinstance(key, (int, np.integer)):
            # Positional access; the double brackets keep the result a one-row DataFrame
            patient_info = self.annotation_file.iloc[[key]]
            image_name = patient_info['dicom_id'].iloc[0]
        else:
            patient_info = self.annotation_file.loc[self.annotation_file['dicom_id'] == key]
            if patient_info.empty:
                raise KeyError(f"dicom_id {key!r} not found in annotation_file")
            image_name = key
        label = patient_info[self.outcome_list].values[:]
        return image_name, label

    def _default_transform(self, im):
        """Down-scale ``im`` by ``scaling_factor`` and pad it towards ``tgrt_img_dims``.

        Returns a tensor with a single leading channel dimension. The padded size
        equals ``tgrt_img_dims`` exactly when the size difference is even on both
        axes (the parity trim below makes the image dimensions even).
        """
        # Shrink both spatial dimensions by the scaling factor, rounding up
        dims = np.ceil(np.array(im.shape[1:]) / self.scaling_factor).astype(int)
        # .tolist() hands torchvision plain Python ints; [0] keeps the first channel only
        resized_image = resize(im, dims.tolist())[0]

        ## Ensure that the image is evenly divisible: drop the first row/column when odd
        resized_image = resized_image[resized_image.shape[0] % 2:, resized_image.shape[1] % 2:]
        dims = np.array(resized_image.shape)

        ## Pad the image to the required target image size
        # dims are (height, width) while a 2-element padding is (left/right, top/bottom),
        # hence the flip.
        padding = np.flip(np.ceil((np.array(self.tgrt_img_dims) - dims) / 2).astype(int)).tolist()
        return pad(resized_image, padding).unsqueeze(0)

    def __getitem__(self, image_name):
        """Return ``(image, label)`` for a ``dicom_id`` (or integer row position)."""
        ## Get the patient outcomes
        image_name, label = self._lookup(image_name)

        ## Read the input image
        im = read_image(f"{self.image_prefix}/{image_name}.jpg")
        if self.transforms is None:
            ## Resize by the scaling factor, then pad to the target size
            transformed_image = self._default_transform(im)
        else:
            # Replicate the first channel three times into an H x W x 3 array so the
            # transform pipeline receives a 3-channel PIL image.
            channel = im[0].numpy()
            transformed_image = self.transforms(Image.fromarray(np.dstack([channel, channel, channel])))

        return transformed_image, label


In [266]:
## Label columns used as prediction targets (one binary outcome per column)
outcome_list = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Enlarged Cardiomediastinum',
    'Fracture',
    'Lung Lesion',
    'Lung Opacity',
    'No Finding',
    'Pleural Effusion',
    'Pleural Other',
    'Pneumonia',
    'Pneumothorax',
    'Support Devices',
]
## Annotation table: the image id followed by the outcome columns, in that order
annotation_file = master_df[['dicom_id', *outcome_list]]

In [317]:
## Training dataset: random flip/rotation augmentation, resize + centre-crop to 256x256,
## then tensor conversion and per-channel normalisation.
train_dataset = MIMIC_CXR_Dataset(
    annotation_file,
    outcome_list,
    image_prefix="data/images",
    transforms=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.Resize(256),
        transforms.CenterCrop(256),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)


## The dataset is keyed by dicom_id, so the sampler supplies dicom_id values rather
## than integer positions; items are therefore visited in annotation_file order.
train_dataLoader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=8,
    sampler=annotation_file['dicom_id'].values,
)

In [321]:
## Pull the first batch from the loader as a quick smoke test
batch = next(iter(train_dataLoader))
batch_idx = 0  # position of this batch within the loader
images, labels = batch

In [323]:
## Load the model
from torchvision import  models
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` -- confirm against the installed version before changing it.
model = models.densenet121(pretrained=True)
## Create the model's classification layer. We don't need a Sigmoid at the end of this because we'll incorporate it into the loss function
# One output logit per outcome. There must be no trailing comma here: a trailing comma
# would turn the right-hand side into a 1-tuple, which nn.Module refuses to register
# as the `classifier` child module.
model.classifier = torch.nn.Linear(model.classifier.in_features, len(outcome_list))

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /home/ec2-user/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth
100.0%
