In [1]:
import torch
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from CS543_Dataset import MIMIC_CXR_Dataset
from torch.utils.tensorboard import SummaryWriter
import tqdm
pd.set_option('display.max_columns', None)

In [2]:
## Load the master dataset
master_df = pd.read_csv('data/master_df.csv', index_col=0)

In [3]:
## Determine all the images we have access to
## Setup boto3
s3 = boto3.resource('s3')
bucket_name = "cs543-final-project"
image_path = "physionet.org/files/mimic-cxr-jpg/2.0.0/images/"

## Get list of files in bucket folder
image_locs = []
s3_bucket = s3.Bucket(bucket_name)
for image_name in s3_bucket.objects.filter(Prefix=image_path):
    image_locs.append(image_name.key[47:-4])


## A lot of our images are duplications. One of the key duplication points is insurance.
# imputation strategy, if at any point the patient used medicare or medicaid, all of the values for that patient
# needs to be medicare or medicaid
master_df.loc[master_df['subject_id'].isin(master_df.loc[master_df['insurance'] == 'Medicare', 'subject_id']), 'insurance'] = 'Medicare' 
master_df.loc[master_df['subject_id'].isin(master_df.loc[master_df['insurance'] == 'Medicaid', 'subject_id']), 'insurance'] = 'Medicaid' 

## Subset our dataset to only the images we have
master_df = master_df.loc[master_df['dicom_id'].isin(image_locs)]
master_df = master_df.drop_duplicates()
master_df = master_df.loc[master_df['dicom_id'].isin(list(master_df.groupby('dicom_id')['dicom_id'].count().sort_values(ascending=False).loc[lambda x: x == 1].index))]
master_df = master_df.reset_index(drop=True)
master_df = master_df.replace(-1, 0).fillna(0)

In [4]:
outcome_list = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']

## Load the model
Here we're loading the densenet model

In [5]:
## Load the model
from torchvision import  models
model = models.densenet121(pretrained=True)
## Create the model's classification layer. We don't need a Sigmoid at the end of this because we'll incorporate it into the loss function
model.classifier = torch.nn.Linear(model.classifier.in_features, len(outcome_list))

## Specificy Training Parameters

In [6]:
batch_size = 32
num_epochs = 5
learning_rate = 1e-3
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

## Specify Training and Testing Datasets

In [7]:
from sklearn.model_selection import train_test_split
train_images, val_images = train_test_split(np.unique(master_df['dicom_id'].values[:]), test_size=.2)
all_df = master_df[['dicom_id'] + outcome_list]
train_df = all_df.loc[all_df['dicom_id'].isin(train_images)].reset_index(drop=True)
val_df = all_df.loc[all_df['dicom_id'].isin(val_images)].reset_index(drop=True)

In [8]:
import torchvision.transforms as transforms
train_dataset = MIMIC_CXR_Dataset(train_df, outcome_list, image_prefix="data/images", transforms=transforms.Compose([
                                                                                                            transforms.RandomHorizontalFlip(),
                                                                                                            transforms.RandomRotation(15),
                                                                                                            transforms.Resize(256),
                                                                                                            transforms.CenterCrop(256),
                                                                                                            transforms.ToTensor(),
                                                                                                            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                                                                                                        ]))


train_dataLoader = torch.utils.data.DataLoader(train_dataset,batch_size=8, sampler=np.unique(train_df['dicom_id'].values[:]))

In [9]:
def train_loop(dataloader, model, loss_fn, optimizer, epoch, summaryWriter=None):
    curr_loss = 0
    size = len(dataloader.dataset)
    step = epoch*size
    model.train()
    pbar = tqdm.tqdm(enumerate(dataloader), total=size)
    for batch_idx, batch in pbar:
        pbar.set_description("Current Loss: %s" % curr_loss)
        ## Extract the inputs
        images, labels = batch

        ## Pass images into the model
        outputs = model(images)

        ## Calculate the loss
        loss = loss_fn(outputs, labels)

        ## Loss value
        curr_loss = loss.item()

        ## Write to tensorboard
        if summaryWriter is not None:
            summaryWriter.add_scalar('Loss/train', curr_loss, step)

        ## Backwards pass
        loss.backward()

        ## Optimizer step
        optimizer.step()

        ## Go to the next step
        step = step + 1



In [10]:
writer = SummaryWriter()
for epoch in range(num_epochs):
    train_loop(train_dataLoader, model, loss_fn, optimizer, epoch, summaryWriter=writer)
    print(f"Finished Epoch: {epoch}")

Current Loss: 0.4819687738906243:   0%|          | 28/10984 [02:02<12:05:30,  3.97s/it] 

In [None]:
## Load the master dataset
master_df = pd.read_csv('data/master_df.csv', index_col=0)