PyTorch is preinstalled in Colab, so this step is optional; uncomment and run the cell below only if you want to check the installation.

In [None]:
#!pip3 install torch

Add the DLH_Project folder to the top level of your Google Drive.
Mount Google Drive in Colab and change into that folder (run this cell only once).

In [None]:
import os
from google.colab import drive
# Mount Google Drive; force_remount=True refreshes the mount if one already exists.
drive.mount('/content/drive', force_remount = True)
# Absolute path keeps this cell re-runnable: a relative "drive/My Drive/..." path
# only resolves while the working directory is still /content.
os.chdir("/content/drive/My Drive/DLH_Project")

Mounted at /content/drive


In [None]:
# Show the current working directory to confirm the os.chdir call in the mount cell worked.
%pwd

'/content/drive/.shortcut-targets-by-id/1vmmLQvXIsZR9fm3bw0w0w0S4STu7QfhY/DLH_Project'

Check that we are in the DLH_Project folder

Load the necessary modules


In [None]:
import pandas as pd
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from skimage import io, transform
import numpy as np

In [None]:
# Single project root so all three paths point into the same Drive folder
# (previously TRAIN_CSV spelled the folder "My Drive" and the others "MyDrive").
PROJECT_DIR = '/content/drive/MyDrive/DLH_Project'
# Image root. Keep the trailing slash: PneumoniaDataset builds file names by plain
# string concatenation (IMG_PATH + relative path read from the CSV).
IMG_PATH = PROJECT_DIR + '/images/'
TRAIN_CSV = IMG_PATH + 'lateral_multi_train.csv'  #input your train.csv file here
VALID_CSV = IMG_PATH + 'lateral_multi_test.csv'   #input your valid.csv file here


Create custom dataset for loading images from the filepaths specified in the CSV

In [None]:
# Load the train and validation/test label tables; header='infer' lets pandas
# pick up the column names from each file.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_CSV, header='infer')
df_test = pd.read_csv(filepath_or_buffer=VALID_CSV, header='infer')

In [None]:
# (rows, columns) of the training table as loaded from TRAIN_CSV.
df_train.shape

(2473, 20)

In [None]:
# Drop the saved index column. drop(..., errors='ignore') keeps the cell idempotent,
# whereas `del df_train['Unnamed: 0']` raises KeyError when the cell is run twice.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
# With the index column gone, positions 5..18 are the 14 finding labels.
for label in df_train.columns[5:19]:
  print(label, '\t\t', df_train[label].sum(), 'positive labels')

No Finding 		 453.0 positive labels
Enlarged Cardiomediastinum 		 129.0 positive labels
Cardiomegaly 		 258.0 positive labels
Lung Opacity 		 841.0 positive labels
Lung Lesion 		 159.0 positive labels
Edema 		 159.0 positive labels
Consolidation 		 121.0 positive labels
Pneumonia 		 110.0 positive labels
Atelectasis 		 269.0 positive labels
Pneumothorax 		 126.0 positive labels
Pleural Effusion 		 652.0 positive labels
Pleural Other 		 89.0 positive labels
Fracture 		 121.0 positive labels
Support Devices 		 649.0 positive labels


In [None]:
# Preview the first five rows (path, demographics and label columns).
df_train.head(n=5)

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00221/study9/...,Female,46,Lateral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CheXpert-v1.0-small/train/patient09293/study4/...,Male,27,Lateral,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,CheXpert-v1.0-small/train/patient08368/study2/...,Male,67,Lateral,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,CheXpert-v1.0-small/train/patient08817/study1/...,Male,54,Lateral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CheXpert-v1.0-small/train/patient03903/study1/...,Female,45,Lateral,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# (rows, columns) of the validation/test table as loaded from VALID_CSV.
df_test.shape

(315, 20)

In [None]:
# Drop the saved index column. drop(..., errors='ignore') keeps the cell idempotent,
# whereas `del df_test['Unnamed: 0']` raises KeyError when the cell is run twice.
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')
# With the index column gone, positions 5..18 are the 14 finding labels.
for label in df_test.columns[5:19]:
  print(label, '\t\t', df_test[label].sum(), 'positive labels')

No Finding 		 59.0 positive labels
Enlarged Cardiomediastinum 		 17.0 positive labels
Cardiomegaly 		 33.0 positive labels
Lung Opacity 		 99.0 positive labels
Lung Lesion 		 27.0 positive labels
Edema 		 20.0 positive labels
Consolidation 		 9.0 positive labels
Pneumonia 		 11.0 positive labels
Atelectasis 		 26.0 positive labels
Pneumothorax 		 17.0 positive labels
Pleural Effusion 		 88.0 positive labels
Pleural Other 		 13.0 positive labels
Fracture 		 16.0 positive labels
Support Devices 		 96.0 positive labels


In [None]:
from torch.utils.data import Dataset 

class PneumoniaDataset(Dataset): 
  """Paired frontal/lateral chest X-ray dataset with two EHR features and 14 labels.

  Each item is a tuple ``(image, ehr, y)``:
    * ``image`` - frontal and lateral views concatenated along dim 0 (frontal
      first) after ``transform`` has been applied to each view separately;
    * ``ehr``   - float32 tensor ``[sex, age_norm]``; sex is 0 for 'Male' and 1
      for any other value, age is min-max scaled over the rows of this CSV;
    * ``y``     - float32 tensor built from columns 5..18 (the label columns).

  Column positions refer to the table after its 'Unnamed: 0' index column has
  been dropped: 0 = lateral image path, 1 = sex, 2 = age, 5..18 = labels.
  Image files are resolved against the module-level ``IMG_PATH``.
  """

  # File-name endings that distinguish the two views of one study.
  LATERAL_SUFFIX = '2_lateral.jpg'
  FRONTAL_SUFFIX = '1_frontal.jpg'

  def __init__(self, csv_file, root_dir, transform = None): 
    """Load the CSV and pre-compute the sex encoding and age range.

    Args:
      csv_file: path of the CSV with image paths, sex, age and label columns.
      root_dir: stored but not used; image paths are built from IMG_PATH.
      transform: optional callable applied to each view. It must return tensors
        (e.g. a Compose that starts with ToTensor), otherwise the two views
        cannot be concatenated in __getitem__.
    """
    df = pd.read_csv(csv_file, header='infer')
    # Tolerate CSVs written without the index column instead of raising KeyError.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    # Vectorised sex encoding: 'Male' -> 0, every other value -> 1.
    sex_col = df.columns[1]
    df[sex_col] = (df[sex_col] != 'Male').astype(int)
    self.data_file = df
    self.max = df.iloc[:,2].max()
    self.min = df.iloc[:,2].min()
    self.root_dir = root_dir  #not being used since paths are built from IMG_PATH
    self.transform = transform

  def __len__(self):
    """Number of rows in the CSV."""
    return len(self.data_file)

  @classmethod
  def _frontal_path(cls, lateral_path):
    """Return the frontal-view path that pairs with ``lateral_path``.

    Only the trailing LATERAL_SUFFIX is replaced. ``str.strip(suffix)`` must not
    be used for this: it treats its argument as a set of characters and can
    remove unrelated characters from both ends of the path.

    Raises:
      ValueError: if ``lateral_path`` does not end with LATERAL_SUFFIX.
    """
    if not lateral_path.endswith(cls.LATERAL_SUFFIX):
      raise ValueError(
          f"expected a path ending in {cls.LATERAL_SUFFIX!r}, got {lateral_path!r}")
    return lateral_path[:-len(cls.LATERAL_SUFFIX)] + cls.FRONTAL_SUFFIX

  def __getitem__(self, idx): 
    """Return ``(image, ehr, y)`` for row ``idx``."""
    rel_path = self.data_file.iloc[idx, 0]
    image_l = io.imread(IMG_PATH + rel_path)
    image_f = io.imread(IMG_PATH + self._frontal_path(rel_path))  #also get the frontal image

    if self.transform:
      # The transform is invoked once per view, so a random transform draws
      # its parameters independently for the lateral and the frontal image.
      image_l = self.transform(image_l)
      image_f = self.transform(image_f)
    image = torch.cat((image_f, image_l), dim=0) 

    # Min-max scale the age; fall back to 0.0 when every age in the CSV is equal
    # so the division cannot hit a zero range.
    age_range = self.max - self.min
    if age_range:
      norm_data = (self.data_file.iloc[idx,2] - self.min)/age_range
    else:
      norm_data = 0.0
    ehr_data = np.array([self.data_file.iloc[idx, 1], norm_data], dtype='float')
    ehr = torch.tensor(ehr_data, dtype=torch.float32)

    y = self.data_file.iloc[idx, 5:19]  #important!!! change y for multilabel classification
    y = np.array(y, dtype='float')
    y = torch.tensor(y, dtype = torch.float32)  #float targets: the notebook trains with nn.BCELoss
    return image, ehr, y

In [None]:
# Disabled smoke test (uncomment to run). Kept as real comments rather than a
# triple-quoted string, which would be evaluated and echoed as the cell output.
# It uses a plain ToTensor transform so the image shape is easy to inspect;
# root_dir is not used by PneumoniaDataset.
#
# dataset = PneumoniaDataset(csv_file=TRAIN_CSV, root_dir="images/", transform=transforms.ToTensor())
#
# print(len(dataset))
# for i in range(1):
#   print(i, dataset[i])
#   print(i, "image shape: ", dataset[i][0].size())

'\ndataset = PneumoniaDataset(csv_file=TRAIN_CSV, root_dir="images/", transform=transforms.ToTensor())  #root_dir not being used since full path is given in TRAIN_CSV\n# using the ToTensor transform to grab image shape easily\n\nprint(len(dataset))\nfor i in range(1):\n  print(i, dataset[i])\n  print(i, "image shape: ", dataset[i][0].size())\n'

Dataloader


In [None]:
def load_data(csv_filepath, root_dir, batch_size=32, shuffle=True, augment=True, image_size=224):
  """Build a DataLoader over a PneumoniaDataset.

  The defaults reproduce the original fixed pipeline (ToTensor followed by
  RandomResizedCrop(224), batch size 32, shuffling on). For validation or test
  data pass ``shuffle=False, augment=False`` to get a deterministic loader.

  Args:
    csv_filepath: CSV file handed to PneumoniaDataset.
    root_dir: handed to PneumoniaDataset unchanged.
    batch_size: number of samples per batch.
    shuffle: whether the DataLoader reshuffles the samples.
    augment: True applies RandomResizedCrop(image_size); False applies a plain
      Resize to (image_size, image_size) with no randomness.
    image_size: output height and width in pixels. SimpleCNN() with default
      arguments is sized for 224.

  Returns:
    torch.utils.data.DataLoader yielding the dataset's (image, ehr, y) batches.
  """
  if augment:
    resize = transforms.RandomResizedCrop(image_size)
  else:
    resize = transforms.Resize((image_size, image_size))
  # ToTensor comes first so the resize step operates on tensors.
  img_transform = transforms.Compose([
  transforms.ToTensor(),
  resize
  ])
  dataset = PneumoniaDataset(csv_filepath, root_dir, transform = img_transform) 
  return torch.utils.data.DataLoader(dataset, batch_size = batch_size, shuffle = shuffle) 

In [None]:
# NOTE(review): both loaders are built with load_data's default pipeline, so the
# validation split is also shuffled and randomly cropped; validation metrics
# will presumably differ from run to run.
train_loader = load_data(TRAIN_CSV, root_dir="images/")  #root_dir is unused: PneumoniaDataset resolves images against IMG_PATH
valid_loader = load_data(VALID_CSV, root_dir="images/")  #root_dir is unused: PneumoniaDataset resolves images against IMG_PATH

In [None]:
#sanity check: pull one training batch and inspect the EHR tensor

data = iter(train_loader)
data_batch1 = next(data)
print(data_batch1[1].shape)  #EHR batch; the dataset emits 2 features per sample (sex, normalised age)
#print(data_batch1[0][0])  #image of 1st sample: 2 channels (frontal, lateral) x 224 x 224


torch.Size([32, 2])


In [None]:
#sanity check: pull one validation batch and inspect the image tensor
data = iter(valid_loader)
data_batch1 = next(data)
print(data_batch1[0].shape)  #image batch: (batch, 2, 224, 224)
print(data_batch1[0][0])  #1st sample: 2 channels (frontal first, then lateral) of 224 x 224


torch.Size([32, 2, 224, 224])
tensor([[[0.1765, 0.1898, 0.2127,  ..., 0.1708, 0.1686, 0.1686],
         [0.1668, 0.1812, 0.2017,  ..., 0.1660, 0.1737, 0.1808],
         [0.1636, 0.1740, 0.1880,  ..., 0.1692, 0.1737, 0.1768],
         ...,
         [0.2745, 0.2747, 0.2656,  ..., 0.2952, 0.2814, 0.2627],
         [0.2260, 0.2371, 0.2451,  ..., 0.2777, 0.2810, 0.2806],
         [0.2235, 0.2311, 0.2425,  ..., 0.2364, 0.2474, 0.2588]],

        [[0.1451, 0.1451, 0.1472,  ..., 0.5853, 0.1023, 0.1036],
         [0.1431, 0.1451, 0.1472,  ..., 0.5517, 0.0804, 0.1007],
         [0.1412, 0.1426, 0.1472,  ..., 0.5555, 0.0836, 0.1007],
         ...,
         [0.8573, 0.8239, 0.8185,  ..., 0.0466, 0.0629, 0.0842],
         [0.8321, 0.8320, 0.8177,  ..., 0.0457, 0.0617, 0.0842],
         [0.8172, 0.8045, 0.8184,  ..., 0.0485, 0.0617, 0.0842]]])


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Small two-convolution CNN over stacked X-ray views, fused with EHR features.

    Pipeline: conv(5x5, stride 2) -> ReLU -> max-pool(2) -> conv(5x5) -> ReLU ->
    max-pool(2) -> flatten -> concatenate EHR vector -> fc(120) -> fc(60) ->
    fc(n_labels) -> sigmoid, with dropout (p=0.5) after the first two fc layers.

    Args:
        img_size: height and width of the square input images (default 224).
        in_channels: number of image channels (default 2: frontal + lateral).
        n_ehr: number of EHR features concatenated before fc1 (default 2).
        n_labels: number of output probabilities (default 14).

    The defaults build exactly the original architecture, so checkpoints saved
    from ``SimpleCNN()`` keep loading (layer attribute names are unchanged).
    """
    def __init__(self, img_size=224, in_channels=2, n_ehr=2, n_labels=14):
        super().__init__()
        
        # Track the spatial size through each layer so fc1 can be sized exactly.
        self.w = img_size  #width and height of image
        self.conv1 = nn.Conv2d(in_channels, 6, 5, 2)  #(input channels, output, kernel size, stride)
        self.w2 = (self.w - 5)//2 + 1
        self.pool = nn.MaxPool2d(2, 2)  #(kernel size, stride)
        self.w2p = (self.w2 - 2)//2 + 1
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.w3 = (self.w2p - 5)//1 + 1
        self.w3p = (self.w3 - 2)//2 +1
        # Flattened size of the conv features for one sample.
        self.n_flat = 16 * self.w3p * self.w3p
        self.fc1 = nn.Linear(self.n_flat + n_ehr, 120) 
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(120, 60)
        self.fc3 = nn.Linear(60, n_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, image, ehr):
        """Return per-label probabilities of shape (batch, n_labels).

        Args:
            image: tensor of shape (batch, in_channels, img_size, img_size).
            ehr: tensor of shape (batch, n_ehr).
        """
        x = self.pool(F.relu(self.conv1(image)))
        x = self.pool(F.relu(self.conv2(x)))
        # Pin the batch dimension: view(-1, n) could silently re-batch the data
        # when the input size does not match img_size; this form raises instead.
        x = x.view(x.size(0), self.n_flat)
        x = torch.cat((x, ehr), dim=1)  #cat the ehr data with conv data
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.sigmoid(self.fc3(x))  #output will be (batchsize, n_labels)
        return x

# Instantiate the network; this instance is the one passed to the optimizer and training cells below.
model = SimpleCNN()

In [None]:
#print(model)

In [None]:
#define optimizer and loss function
import torch.optim as optim
# Binary cross-entropy applied to the model's sigmoid outputs.
criterion = nn.BCELoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:

def train_model(model, train_loader, valid_loader, n_epoch, optimizer=optimizer, criterion=criterion):
    """Train ``model`` in place for ``n_epoch`` passes over ``train_loader``.

    Args:
        model: module called as ``model(image, ehr)``.
        train_loader: iterable of ``(image, ehr, target)`` batches.
        valid_loader: accepted for interface compatibility; not used here.
        n_epoch: number of epochs to run.
        optimizer: optimizer to step. The default is the module-level
            ``optimizer`` object that exists when this function is defined, so
            pass one explicitly after creating a new model or optimizer.
        criterion: loss called as ``criterion(y_hat, target)``; the default is
            bound at definition time in the same way.

    Returns:
        The same ``model`` instance. The mean batch loss is printed per epoch.
    """
    for epoch in range(n_epoch):
        # Re-enter training mode every epoch so dropout stays active even if an
        # evaluation step (which switches to eval mode) is added inside the loop.
        model.train()
        batch_losses = []
        for image, ehr, target in train_loader:
            optimizer.zero_grad()

            y_hat = model(image, ehr)  #forward pass
            # Only collapses dim 1 when it has size 1; a (batch, 14) output is unchanged.
            y_hat = torch.squeeze(y_hat, dim=1)
            loss = criterion(y_hat, target)  #loss calculation

            loss.backward()   # backward pass
            optimizer.step()  # optimization

            # .item() returns a detached Python float; no need for .cpu().data.numpy().
            batch_losses.append(loss.item())
        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(batch_losses)}")
        #evaluate(model, valid_loader)
    return model


In [None]:
from sklearn.metrics import *

def classification_metrics02(Y_pred, Y_true):
    """Compute per-label binary metrics for multi-label predictions.

    Args:
        Y_pred: numpy array (n_samples, n_labels) of predicted probabilities;
            a label is predicted positive when its probability is > 0.5.
        Y_true: numpy array (n_samples, n_labels) of truth labels; entries > 0
            count as positive.

    Returns:
        DataFrame with one row per label (same order as the input columns) and
        the columns accuracy, auc, precision, recall, f1. ``auc`` is NaN for a
        label whose truth values are all negative or all positive.
    """
    Y_pred_prob = np.asarray(Y_pred)
    Y_pred_label = Y_pred_prob > 0.5        #boolean dtype
    Y_true = np.asarray(Y_true) > 0         #boolean dtype

    results = []
    # Transposing lets us iterate label by label instead of sample by sample.
    for truth, prob, pred in zip(Y_true.T, Y_pred_prob.T, Y_pred_label.T):
      # ROC AUC is undefined when only one class is present, so report NaN.
      # np.nan is used because the np.NAN alias was removed in NumPy 2.0.
      if truth.all() or not truth.any():
        auc = np.nan
      else:
        auc = roc_auc_score(truth, prob)

      acc = accuracy_score(truth, pred)
      # zero_division=0 still reports 0.0 when a label has no predicted or no
      # true positives, but without emitting one warning per label.
      precision = precision_score(truth, pred, average='binary', zero_division=0)
      recall = recall_score(truth, pred, average='binary', zero_division=0)
      f1score = f1_score(truth, pred, average='binary', zero_division=0)

      results.append([acc, auc, precision, recall, f1score])

    # Building from the list directly also yields a valid empty frame for no labels.
    metrics_df = pd.DataFrame(results, columns=['accuracy', 'auc', 'precision', 'recall', 'f1'])
    return metrics_df

#input: model, loader
def evaluate02(model, valid_loader):
    """Run ``model`` over ``valid_loader`` and print per-label metrics.

    Args:
        model: module called as ``model(image, ehr)`` that returns per-label
            probabilities of shape (batch, n_labels).
        valid_loader: iterable of ``(image, ehr, y_val)`` batches.

    Returns:
        None. The DataFrame from classification_metrics02 is printed.
    """
    model.eval()
    y_hat_batches = []
    y_true_batches = []

    # Inference only: no_grad avoids building the autograd graph for every batch.
    with torch.no_grad():
        for image, ehr, y_val in valid_loader:
            y_hat = model(image, ehr)  # shape is [batch size, n_labels]
            y_hat_batches.append(y_hat.to('cpu').float())
            y_true_batches.append(y_val.to('cpu').float())

    # Concatenate once at the end instead of re-copying the running tensor each batch.
    all_y_hat = torch.cat(y_hat_batches, dim=0) if y_hat_batches else torch.empty(0)
    all_y_true = torch.cat(y_true_batches, dim=0) if y_true_batches else torch.empty(0)

    metrics = classification_metrics02(all_y_hat.numpy(), all_y_true.numpy())
    print(metrics)
    return 

In [None]:
# Train for 5 epochs. train_model returns the model, so as the last expression
# of the cell its repr is echoed after the per-epoch loss lines.
train_model(model, train_loader, valid_loader, n_epoch=5, optimizer=optimizer, criterion=criterion)

Epoch 0: curr_epoch_loss=0.4172011613845825
Epoch 1: curr_epoch_loss=0.35445281863212585
Epoch 2: curr_epoch_loss=0.34695181250572205
Epoch 3: curr_epoch_loss=0.3420909345149994
Epoch 4: curr_epoch_loss=0.34010377526283264


SimpleCNN(
  (conv1): Conv2d(2, 6, kernel_size=(5, 5), stride=(2, 2))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=10002, out_features=120, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (fc3): Linear(in_features=60, out_features=14, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Print per-label metrics for the model trained above; one row per label in column order.
evaluate02(model, valid_loader)

    accuracy       auc  precision  recall   f1
0   0.812698  0.461666        0.0     0.0  0.0
1   0.946032  0.646072        0.0     0.0  0.0
2   0.895238  0.528906        0.0     0.0  0.0
3   0.685714  0.562103        0.0     0.0  0.0
4   0.914286  0.458719        0.0     0.0  0.0
5   0.936508  0.546610        0.0     0.0  0.0
6   0.971429  0.606391        0.0     0.0  0.0
7   0.965079  0.471292        0.0     0.0  0.0
8   0.917460  0.533804        0.0     0.0  0.0
9   0.946032  0.676076        0.0     0.0  0.0
10  0.720635  0.652583        0.0     0.0  0.0
11  0.958730  0.408049        0.0     0.0  0.0
12  0.949206  0.479933        0.0     0.0  0.0
13  0.695238  0.592894        0.0     0.0  0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#to save your model
#torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/simpleCNNplus_multi01.pth')

model_multi00.pth - 20 epochs loss = 0.351   



In [None]:
#to reload your model
modelmulti01 = SimpleCNN()  #create instance of your chosen model
# map_location='cpu' lets the checkpoint load on a CPU-only runtime even if it was
# saved from a GPU session; nothing in this notebook moves the model off the CPU.
modelmulti01.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/simpleCNNplus_multi01.pth', map_location='cpu'))


<All keys matched successfully>

In [None]:
#run evaluate02() on the reloaded checkpoint to print its per-label statistics
evaluate02(modelmulti01, valid_loader)

    accuracy       auc  precision    recall        f1
0   0.809524  0.683925    0.00000  0.000000  0.000000
1   0.946032  0.644493    0.00000  0.000000  0.000000
2   0.895238  0.732646    0.00000  0.000000  0.000000
3   0.673016  0.570707    0.00000  0.000000  0.000000
4   0.914286  0.436986    0.00000  0.000000  0.000000
5   0.936508  0.623390    0.00000  0.000000  0.000000
6   0.971429  0.855846    0.00000  0.000000  0.000000
7   0.965079  0.477871    0.00000  0.000000  0.000000
8   0.917460  0.669417    0.00000  0.000000  0.000000
9   0.946032  0.663640    0.00000  0.000000  0.000000
10  0.742857  0.743342    0.62069  0.204545  0.307692
11  0.958730  0.484972    0.00000  0.000000  0.000000
12  0.949206  0.527174    0.00000  0.000000  0.000000
13  0.695238  0.581240    0.00000  0.000000  0.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
