PyTorch is already installed in Colab; uncomment and run the cell below only if you want to check or reinstall it.

In [None]:
#!pip3 install torch

Add the DLH_Project folder to the top level of your Google Drive.
Mount the Drive in Colab and change into that folder (only run once).

In [None]:

import os
from google.colab import drive
# Mount Google Drive, then move into the project folder.
# The chdir target is absolute so this cell can be re-run after the working
# directory has already changed; a relative "drive/..." path only resolves
# while the working directory is still /content.
drive.mount('/content/drive', force_remount = True)
os.chdir("/content/drive/My Drive/DLH_Project")

Mounted at /content/drive


In [None]:
%pwd  #make sure you are in the DLH_Project file

'/content/drive/.shortcut-targets-by-id/1vmmLQvXIsZR9fm3bw0w0w0S4STu7QfhY/DLH_Project'

Check that we are in the DLH_Project folder

Load the necessary modules


In [None]:
import pandas as pd
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from skimage import io, transform
import numpy as np

In [None]:
torch.__version__

'1.8.1+cu101'

In [None]:
# Folder that the relative image paths stored in the CSV files are appended to.
IMG_PATH = '/content/drive/MyDrive/DLH_Project/images/'

# Input your train.csv file here. This data set does not contain the same n as the
# simple cnn model because there are not enough images with lateral views.
TRAIN_CSV = IMG_PATH + 'lateral_train.csv'

# Input your valid.csv file here. This data set has lateral views and was filtered to
# have the same number of pos and neg labels as the simple cnn, resnet and densenet
# models for consistency.
VALID_CSV = IMG_PATH + 'lateral_test2.csv'

In [None]:
# Sanity check of the training data file - not required to run.
# Read the CSV and drop the extraneous 'Unnamed: 0' column in one expression.
df_train = pd.read_csv(TRAIN_CSV, header='infer').drop(columns='Unnamed: 0')
print(df_train.shape)
df_train.head(5)

(1082, 19)


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00044/study6/...,Female,49,Lateral,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,CheXpert-v1.0-small/train/patient07169/study1/...,Female,44,Lateral,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CheXpert-v1.0-small/train/patient00901/study1/...,Female,29,Lateral,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CheXpert-v1.0-small/train/patient04957/study3/...,Male,52,Lateral,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,CheXpert-v1.0-small/train/patient00344/study1/...,Male,54,Lateral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_train.iloc[:,12].sum()  #number of positive pneumonia labels (column 12 is 'Pneumonia' once 'Unnamed: 0' is dropped)

270.0

In [None]:
# Sanity check of the validation file - not required to run.
# Read the CSV and drop the extraneous 'Unnamed: 0' column in one expression.
df_test = pd.read_csv(VALID_CSV, header='infer').drop(columns='Unnamed: 0')
print(df_test.shape)
df_test.head(5)

(121, 19)


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient08164/study1/...,Female,55,Lateral,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CheXpert-v1.0-small/train/patient05791/study3/...,Male,31,Lateral,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,CheXpert-v1.0-small/train/patient06806/study1/...,Male,72,Lateral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CheXpert-v1.0-small/train/patient01448/study10...,Female,56,Lateral,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CheXpert-v1.0-small/train/patient09705/study1/...,Male,37,Lateral,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_test.iloc[:,12].sum()  #number of positive pneumonia labels in the validation file (column 12 is 'Pneumonia' once 'Unnamed: 0' is dropped)

79.0

Create custom dataset for loading images from the filepaths specified in the CSV

In [None]:
from torch.utils.data import Dataset 

class PneumoniaDataset(Dataset): 
  """Paired frontal/lateral chest X-rays plus two EHR features and a pneumonia label.

  Columns are addressed by position after the 'Unnamed: 0' column is removed:
  0 = relative path of the lateral image, 1 = sex, 2 = age, 12 = pneumonia label.

  Each item is a tuple ``(image, ehr, y)``:
    image: frontal and lateral images (after ``transform``) concatenated on dim 0,
           frontal first.
    ehr:   float32 tensor ``[sex, age]`` with sex encoded as 'Male' -> 0 and anything
           else -> 1, and age min-max scaled with this CSV's own minimum and maximum.
    y:     float32 scalar tensor holding the pneumonia label.
  """

  # A lateral file name ends with LATERAL_SUFFIX; the frontal image of the same study
  # is located by swapping that suffix for FRONTAL_SUFFIX.
  LATERAL_SUFFIX = '2_lateral.jpg'
  FRONTAL_SUFFIX = '1_frontal.jpg'

  def __init__(self, csv_file, root_dir, transform = None): 
    """Read ``csv_file`` and precompute the sex encoding and the age range.

    Args:
      csv_file: path of a CSV that contains an 'Unnamed: 0' column followed by the
        columns described in the class docstring.
      root_dir: stored on the instance but not used; image paths are built from the
        module-level IMG_PATH.
      transform: optional callable applied to the lateral and the frontal image.

    Raises:
      KeyError: if the CSV has no 'Unnamed: 0' column.
    """
    df = pd.read_csv(csv_file, header='infer')
    df = df.drop(columns='Unnamed: 0')  #get rid of unnecessary column

    # Vectorized sex encoding: 'Male' -> 0, every other value -> 1.
    sex_column = df.columns[1]
    df[sex_column] = (df[sex_column] != 'Male').astype(int)

    self.data_file = df
    # NOTE(review): the age range comes from this CSV only, so a validation dataset is
    # scaled with its own min/max rather than the training set's - confirm this is intended.
    self.max = df.iloc[:,2].max()
    self.min = df.iloc[:,2].min()
    self.root_dir = root_dir  #not being used since paths are built from IMG_PATH
    self.transform = transform

  @classmethod
  def _frontal_path(cls, lateral_path):
    """Return ``lateral_path`` with its trailing LATERAL_SUFFIX replaced by FRONTAL_SUFFIX.

    ``str.strip`` is deliberately not used: it removes a *set of characters* from both
    ends of the string rather than a suffix, which can eat part of the file or folder name.

    Raises:
      ValueError: if ``lateral_path`` does not end with LATERAL_SUFFIX.
    """
    if not lateral_path.endswith(cls.LATERAL_SUFFIX):
      raise ValueError(
        "expected a path ending in '%s', got '%s'" % (cls.LATERAL_SUFFIX, lateral_path))
    return lateral_path[:-len(cls.LATERAL_SUFFIX)] + cls.FRONTAL_SUFFIX

  def __len__(self):
    """Number of rows in the CSV."""
    return(len(self.data_file))

  def __getitem__(self, idx): 
    """Load both views of row ``idx`` and return ``(image, ehr, y)``."""
    lateral_rel = self.data_file.iloc[idx, 0]
    img_path_L = IMG_PATH + lateral_rel
    img_path_F = IMG_PATH + self._frontal_path(lateral_rel)  #also get the frontal image

    image_l = io.imread(img_path_L)
    image_f = io.imread(img_path_F)
    if self.transform:
      # The transform is called once per view, so a random transform draws
      # independent parameters for the lateral and the frontal image.
      image_l = self.transform(image_l)
      image_f = self.transform(image_f)
    image = torch.cat((image_f, image_l), dim=0) 

    # Min-max scale the age; when every age in the CSV is identical the range is 0,
    # so fall back to 0.0 instead of dividing by zero.
    age_range = self.max - self.min
    if age_range:
      norm_data = (self.data_file.iloc[idx, 2] - self.min) / age_range
    else:
      norm_data = 0.0
    ehr = torch.tensor([float(self.data_file.iloc[idx, 1]), float(norm_data)],
                       dtype=torch.float32)

    # Column 12 is the pneumonia label. It is returned as float32 because the notebook
    # trains with nn.BCELoss, which takes float targets.
    y = torch.tensor(float(self.data_file.iloc[idx, 12]), dtype=torch.float32)
    return image, ehr, y

In [None]:
#sanity checks to see output of Dataset - not required to run
dataset = PneumoniaDataset(csv_file=TRAIN_CSV, root_dir="images/", transform=transforms.ToTensor())  #root_dir is not used: image paths are built from IMG_PATH
# using the ToTensor transform to grab image shape easily

print(len(dataset))
for i in range(1):
  # Index the dataset once per sample: every dataset[i] re-reads both images from disk.
  sample = dataset[i]
  image, ehr, y = sample
  print(i, sample)
  print(i, "image shape: ", image.size())
  print(i, "ehr shape: ", ehr.size())
  print(i, "y shape: ", y.size())

1082
0 (tensor([[[0.1882, 0.1765, 0.1765,  ..., 0.1529, 0.1725, 0.2353],
         [0.2471, 0.2275, 0.2235,  ..., 0.0863, 0.0941, 0.1451],
         [0.1373, 0.1098, 0.0941,  ..., 0.0588, 0.0627, 0.0980],
         ...,
         [0.5176, 0.4745, 0.4078,  ..., 0.4784, 0.6275, 0.5608],
         [0.5490, 0.4980, 0.5294,  ..., 0.5608, 0.6353, 0.5373],
         [0.6039, 0.5647, 0.5294,  ..., 0.6471, 0.6431, 0.6706]],

        [[0.3529, 0.3529, 0.3569,  ..., 0.1059, 0.1255, 0.1451],
         [0.3725, 0.3490, 0.3373,  ..., 0.0941, 0.1059, 0.1176],
         [0.3373, 0.3059, 0.2824,  ..., 0.0902, 0.0941, 0.0980],
         ...,
         [0.3608, 0.3451, 0.3216,  ..., 0.0000, 0.0000, 0.0000],
         [0.4039, 0.3765, 0.3373,  ..., 0.0000, 0.0000, 0.0000],
         [0.4824, 0.4314, 0.3765,  ..., 0.0000, 0.0000, 0.0000]]]), tensor([1.0000, 0.4306]), tensor(0.))
0 image shape:  torch.Size([2, 320, 320])
0 ehr shape:  torch.Size([2])
0 y shape:  torch.Size([])


Dataloader


In [None]:
def load_data(csv_filepath, root_dir, batch_size = 32, shuffle = True, augment = True):
  """Build a DataLoader over a PneumoniaDataset read from ``csv_filepath``.

  The defaults give a training loader. For evaluation, pass ``shuffle=False`` and
  ``augment=False`` so every pass sees the same images in the same order.

  Args:
    csv_filepath: CSV file handed to PneumoniaDataset.
    root_dir: forwarded to PneumoniaDataset unchanged.
    batch_size: number of samples per batch.
    shuffle: whether the DataLoader reshuffles the data every epoch.
    augment: if True, images are cropped with RandomResizedCrop(224); if False they
      are deterministically resized to 224x224 instead.

  Returns:
    A torch.utils.data.DataLoader over the dataset.
  """
  if augment:
    to_model_size = transforms.RandomResizedCrop(224)
  else:
    to_model_size = transforms.Resize((224, 224))
  img_transform = transforms.Compose([
    transforms.ToTensor(),
    to_model_size
  ])
  dataset = PneumoniaDataset(csv_filepath, root_dir, transform = img_transform) 
  return torch.utils.data.DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)

In [None]:
# Both loaders are built by the same load_data() call with its default settings, so the
# validation loader is also shuffled and goes through RandomResizedCrop(224).
# NOTE(review): random crops make validation metrics vary between passes - confirm this is intended.
train_loader = load_data(TRAIN_CSV, root_dir="images/")  #root_dir is not used: image paths are built from IMG_PATH
valid_loader = load_data(VALID_CSV, root_dir="images/")  #root_dir is not used: image paths are built from IMG_PATH

In [None]:
#sanity checks train_loader - not required to run
# A batch is the (image, ehr, y) triple returned by PneumoniaDataset, batched by the DataLoader.
data = iter(train_loader)
data_batch1 = next(data)
print(data_batch1[0].shape)  #element 0 is the image batch; expect (32, 2, 224, 224)
#print(data_batch1[0][0])  #image data of the 1st sample (2,224,224)

torch.Size([32, 2, 224, 224])


In [None]:
#sanity checks valid_loader - not required to run
data = iter(valid_loader)
data_batch1 = next(data)
print(data_batch1[1].shape)  #element 1 is the ehr batch; shape looks right (32,2)
#print(data_batch1[1])  #ehr features of the whole batch (32,2)

torch.Size([32, 2])


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Small CNN over a 2-channel image, fused with 2 EHR features before the dense layers.

    Three conv -> leaky ReLU -> batch norm -> max pool stages feed a flattened feature
    vector that is concatenated with the EHR vector and passed through three linear
    layers. The output is a sigmoid probability of shape (batch, 1).

    Args:
        img_size: height and width of the square input images. The input size of the
            first linear layer is derived from it (224 gives 64 * 96 * 96 features).
    """

    def __init__(self, img_size=224):
        super(SimpleCNN, self).__init__()
        
        self.w = img_size  #width and height of image
        self.conv1 = nn.Conv2d(2, 16, 5, 2)  #(input=2, output, kernel size, stride)
        self.pool = nn.MaxPool2d(3,1)  #(kernel size, stride)
        self.norm2d_1 = nn.BatchNorm2d(16)  #batch normalization to prevent high gradients
        self.conv2 = nn.Conv2d(16, 32, 5)
        self.norm2d_2 = nn.BatchNorm2d(32)  
        self.conv3 = nn.Conv2d(32, 64, 5)
        self.norm2d_3 = nn.BatchNorm2d(64)  
        # Flattened size of the last pooled feature map, computed instead of hard-coded.
        self.flat_features = self._flat_features(img_size)
        self.fc1 = nn.Linear(self.flat_features + 2, 120)  #+2 for the ehr features
        self.norm1d_1 = nn.BatchNorm1d(120)
        self.dropout = nn.Dropout(p=0.5)  #drop out layer to prevent overfitting
        self.fc2 = nn.Linear(120, 60)
        self.norm1d_2 = nn.BatchNorm1d(60)
        self.fc3 = nn.Linear(60, 1)  #one output for binary classification
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _flat_features(img_size):
        """Number of features left after the three conv/pool stages for a square input.

        Mirrors the layer hyper-parameters set in ``__init__`` (no padding anywhere):
        conv1 is 5x5 with stride 2, conv2 and conv3 are 5x5 with stride 1, and each
        stage ends with a 3x3 max pool of stride 1.

        Raises:
            ValueError: if ``img_size`` is too small to survive all three stages.
        """
        def shrink(size, kernel, stride=1):
            return (size - kernel) // stride + 1

        side = shrink(img_size, 5, 2)  #conv1
        side = shrink(side, 3)         #pool
        side = shrink(side, 5)         #conv2
        side = shrink(side, 3)         #pool
        side = shrink(side, 5)         #conv3
        side = shrink(side, 3)         #pool
        if side < 1:
            raise ValueError("img_size=%r is too small for SimpleCNN" % (img_size,))
        return 64 * side * side
        
    def forward(self, image, ehr):   
        """Return the predicted probability for each sample.

        Args:
            image: image batch with 2 channels and spatial size ``img_size`` x ``img_size``.
            ehr: EHR batch with 2 features per sample.

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        x = F.leaky_relu(self.conv1(image))
        x = self.norm2d_1(x)
        x = self.pool(x)
        x = F.leaky_relu(self.conv2(x))
        x = self.norm2d_2(x)
        x = self.pool(x)
        x = F.leaky_relu(self.conv3(x))
        x = self.norm2d_3(x)
        x = self.pool(x)
        # Flatten everything except the batch dimension. A wrongly sized input then
        # fails in fc1 with a shape error instead of being reshaped into a different
        # batch size.
        x = torch.flatten(x, 1)
        x = torch.cat((x, ehr), dim=1)  #append the ehr features
        x = F.leaky_relu(self.fc1(x))
        x = self.norm1d_1(x)
        x = self.dropout(x)
        x = F.leaky_relu(self.fc2(x))
        x = self.norm1d_2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x  



In [None]:
# Instantiate the network with its default settings and print the layer summary.
model = SimpleCNN()
print(model)

SimpleCNN(
  (conv1): Conv2d(2, 16, kernel_size=(5, 5), stride=(2, 2))
  (pool): MaxPool2d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
  (norm2d_1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (norm2d_2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (norm2d_3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=589826, out_features=120, bias=True)
  (norm1d_1): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (norm1d_2): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=60, out_features=1, bias=True)
  (sigmoid): Sigmoid()


In [None]:
#define the optimizer and loss function
# BCELoss takes probabilities, which is what SimpleCNN.forward returns (it ends in a
# sigmoid), together with float targets.
# The optimizer holds the parameters of this particular `model` object, so re-run this
# cell whenever `model` is re-created.
import torch.optim as optim
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr= 0.001)

In [None]:

def train_model(model, train_loader, valid_loader, n_epoch, optimizer=optimizer, criterion=criterion):
    """Train ``model`` for ``n_epoch`` epochs and evaluate it after every epoch.

    After each epoch the mean training loss is printed and ``evaluate`` is called on
    ``valid_loader`` (which prints the validation metrics).

    Args:
        model: module called as ``model(image, ehr)`` and returning (batch, 1) outputs.
        train_loader: iterable of ``(image, ehr, target)`` batches.
        valid_loader: loader passed to ``evaluate``.
        n_epoch: number of passes over ``train_loader``.
        optimizer: optimizer stepping the model's parameters. The default is the
            module-level ``optimizer`` as it existed when this function was defined.
        criterion: loss called as ``criterion(y_hat, target)``. The default is the
            module-level ``criterion`` as it existed when this function was defined.

    Returns:
        The same ``model`` object, trained in place.
    """
    model.train()  # prep model for training

    for epoch in range(n_epoch):
        # evaluate() calls model.eval(); re-enable training mode at the top of every
        # epoch so dropout and the batch-norm statistics updates stay active after
        # the first epoch.
        model.train()
        curr_epoch_loss = []
        for image, ehr, target in train_loader:  #image shape (32, 2, 224, 224), ehr (32, 2)
            optimizer.zero_grad()

            y_hat = model(image, ehr)  #forward pass
            y_hat = torch.squeeze(y_hat, dim=1)  #(batch, 1) -> (batch,) to match target

            loss = criterion(y_hat, target)  #loss calculation

            loss.backward()   #backward pass
            optimizer.step()  #optimization

            curr_epoch_loss.append(loss.item())
        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
        evaluate(model, valid_loader)
    return model


In [None]:
from sklearn.metrics import *

def classification_metrics(Y_pred, Y_true):
    """Compute accuracy, ROC AUC, precision, recall and F1 for binary predictions.

    Args:
        Y_pred: array of predicted scores. Scores above 0.5 count as a positive
            prediction; the raw scores are used for the AUC.
        Y_true: array of ground-truth labels. Values above 0 count as positive.

    Returns:
        Tuple ``(accuracy, auc, precision, recall, f1score)``.
    """
    Y_pred_label = Y_pred > 0.5  #boolean dtype
    Y_true = Y_true > 0          #boolean dtype

    acc = accuracy_score(Y_true, Y_pred_label)
    auc = roc_auc_score(Y_true, Y_pred)
    # zero_division=0 keeps the value these metrics already report when there are no
    # predicted (or no actual) positives, without emitting a warning on every call.
    precision = precision_score(Y_true, Y_pred_label, zero_division=0)
    recall = recall_score(Y_true, Y_pred_label, zero_division=0)
    f1score = f1_score(Y_true, Y_pred_label, zero_division=0)
    return acc, auc, precision, recall, f1score


def evaluate(model, valid_loader):
    """Run ``model`` over ``valid_loader``, print the classification metrics, return accuracy.

    The model is switched to eval mode for the pass and then returned to whatever mode
    it was in before the call, so calling this between training epochs does not leave
    dropout and batch-norm updates disabled.

    Args:
        model: module called as ``model(image, ehr)`` and returning (batch, 1) outputs.
        valid_loader: iterable of ``(image, ehr, y_val)`` batches.

    Returns:
        The accuracy computed by ``classification_metrics``.
    """
    was_training = model.training
    model.eval()
    y_hat_batches = []   #predictions, one tensor per batch
    y_true_batches = []  #labels, one tensor per batch
    try:
        # No gradients are needed for evaluation; without no_grad every prediction
        # would keep its autograd graph alive until the end of the pass.
        with torch.no_grad():
            for image, ehr, y_val in valid_loader:
                y_hat = model(image, ehr)
                # convert shape from [batch size, 1] to [batch size]
                y_hat = y_hat.view(y_hat.shape[0])

                y_hat_batches.append(y_hat.to('cpu').float())
                y_true_batches.append(y_val.to('cpu').float())
    finally:
        model.train(was_training)  #restore the caller's mode

    # Concatenate once at the end instead of growing a tensor inside the loop.
    all_y_hat = torch.cat(y_hat_batches, dim=0) if y_hat_batches else torch.empty(0)
    all_y_true = torch.cat(y_true_batches, dim=0) if y_true_batches else torch.empty(0)
    all_y_hat = all_y_hat.numpy()
    all_y_true = all_y_true.numpy()

    acc, auc, precision, recall, f1 = classification_metrics(all_y_hat, all_y_true)
    print(f"acc: {acc:.3f}, auc: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")
    return acc

In [None]:
# Train for 10 epochs. The call is the last expression of the cell, so the model
# returned by train_model is also echoed as the cell output.
train_model(model, train_loader, valid_loader, n_epoch=10, optimizer=optimizer, criterion=criterion)

Epoch 0: curr_epoch_loss=0.7031439542770386
acc: 0.421, auc: 0.480, precision: 0.737, recall: 0.177, f1: 0.286
Epoch 1: curr_epoch_loss=0.5766517519950867


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.460, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 2: curr_epoch_loss=0.5717049837112427


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.594, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 3: curr_epoch_loss=0.5649038553237915


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.559, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 4: curr_epoch_loss=0.5671100616455078


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.602, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 5: curr_epoch_loss=0.5651189684867859


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.586, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 6: curr_epoch_loss=0.562197744846344


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.471, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 7: curr_epoch_loss=0.5610071420669556


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.533, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 8: curr_epoch_loss=0.5651713013648987


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.347, auc: 0.575, precision: 0.000, recall: 0.000, f1: 0.000
Epoch 9: curr_epoch_loss=0.5636739134788513
acc: 0.347, auc: 0.536, precision: 0.000, recall: 0.000, f1: 0.000


  _warn_prf(average, modifier, msg_start, len(result))


SimpleCNN(
  (conv1): Conv2d(2, 16, kernel_size=(5, 5), stride=(2, 2))
  (pool): MaxPool2d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
  (norm2d_1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (norm2d_2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (norm2d_3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=589826, out_features=120, bias=True)
  (norm1d_1): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (norm1d_2): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=60, out_features=1, bias=True)
  (sigmoid): Sigmoid()


In [None]:
#to save your model
#torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/CNN_plus1.pth')

In [None]:
#to reload your model
#model0 = SimpleCNN()
#model0.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/CNN_plus1.pth'))
